#R serous NH model -
#NB: For windows machines, you'll need to replace "quartz()" with "win.graph()" or something like that (to open a new window for a graph). I think that's the only Mac/windows anomaly, but I'm not certain of this.
# ANALYSIS OVERVIEW
# I) Kaplan-Meier Analysis of tumor progression as a function of tumor size (diameter).
# Note that the resulting 1000 KM curves are used in later sections as model tumors.
# II) Growth Rate Optimization
# a) Early growth rate (growth of CIS,I,II tumors)
# b) Late growth rate (growth of Stage III+ tumors)
# III) Plotting Model tumor Natural Histories
# IV) Modeling ED Test Performance
# a) Sensitivity Analysis
# b) Mortality reduction
#Outline: First we do a Kaplan-Meier analysis of tumor progression to Stage II or beyond, and a separate analysis of tumor progression to Stage III or beyond. Then we calculate sensitivity for detection of tumors, which would otherwise be detected after progression to Stage II, before they progress to Stage II. Then we use the Kaplan-Meier analysis of progression to Stage III+ as a function of size, estimates for the distribution of serous tumor sizes at diagnosis, our estimate from the prevalence and incidence data of the duration of the "occult" period and the "occult CIS, Stage I or Stage II" period, and the size distribution of early-stage tumors found at PBSO, to optimize growth rate and intercept parameters in an exponential growth model for early-stage tumors. Next we use 1000 model tumor "natural histories" (of growth, progresssion and diagnosis) to estimate sensitivity in detecting tumors before they progress to Stage III or IV as a function of the size threshold for detection and the frequency of a screening test (counting only tumors that would otherwise be detected - presumably due to signs and symptoms - only after they progress to Stage III or IV). To calculate the reduction in 5 year mortality as a function of size threshold and screening interval, we use the models for growth and the Kaplan-Meier analyses of stage progression vs. tumor size to estimate the stage distribution of tumors at detection or diagnosis as a function of the size threshold for detection by a screening test and the frequency of screening. We multiply the fraction of tumors detected at each stage by the stage-specific 5-year survival, to get the overall survival under the specified screening model. Do the same with the stage distribution expected in the absence of a screen (using either the van Nagell 2007 paper as the source for the tumor size-at-diagnosis distribution in a screened population, or assuming that tumors are diagnosed at a diameter of 9±1cm). 
# (percent 5-year survival with specified screen - percent 5-year survival without screen)/(1-percent 5-year survival without screen) gives the percent reduction in 5-year mortality.
##########################################################################
# Input data from literature
#-------------------------------------------------------
#To model performance of early detection based on the change in the distribution of stages achieved by a specified screening frequency and sensitivity we need to specify survival as a function of stage. The values can be plugged in from whatever source we can find, but I've opted to use data from the SEER 1994-2000 database, and considering their "localized", "regional" and "distant" categories to represent CIS&StageI, Stage II and Stage III+, respectively, the 5-year survival values are 93%, 74% and 34%, respectively.
# Five-year survival (percent) by stage at diagnosis. Source: SEER 1994-2000,
# mapping "localized" -> CIS/Stage I, "regional" -> Stage II and
# "distant" -> Stage III+.
Stage1survival  <- 93
Stage2survival  <- 74
Stage34survival <- 34

# Number of new serous cancers diagnosed per year (US). ACS Cancer Facts and
# Figures 2008 reports 21650 new invasive ovarian cancers; Seidman, JD, et al.
# Int J. Gyne Path 2007 reports 67.8% serous under a conservative definition
# (78% under a more inclusive one).
AnnualovarianCA <- 21650
Fractionserous  <- 0.678
Annualcases     <- AnnualovarianCA * Fractionserous

# Duration (years) of each stage from the prevalence/incidence analysis.
# Confidence intervals and probability distributions for these estimates are
# produced by a separate program, "Supplemental File 1.R" (Bayesian strategy).
StageCIS1duration  <- 3.6
Stage2duration     <- 0.9
Stage34duration    <- 0.9
Earlystageduration <- 4.3
#dxv is a vector of 1000 tumor-size values, representing an estimate of the distribution of sizes of tumors at clinical diagnosis (including conventional screens like CA125 and TVUS) in a high-risk population. These sizes are derived by first sampling with replacement from the sizes reported for tumors presumed to be serous that were detected by screening in the study reported by van Nagell et al., 2007, and using mean=8cm as our size estimate for the 9 tumors in that screening study that were not detected by screening but were diagnosed clinically between screens. The distribution of the log10(diameters) is then smoothed with a normal kernel with sd=0.1.
# Draw 1000 diameters (converted from cm to mm) with replacement from the
# van Nagell 2007 sizes: nine interval cancers entered at 8 cm plus the
# seventeen screen-detected tumors with reported sizes.
dxv <- 10 * sample(
  c(
    rep(8, 9),
    3.3, 3.7, 6.8, 4.1, 4.6, 3.7, 3.1, 5.6, 6.3,
    3.4, 3.8, 8.1, 8.5, 6.7, 5.5, 8.4, 8.6
  ),
  size = 1000,
  replace = TRUE
)
# Smoothing step: jitter log10(diameter) with a normal kernel (sd = 0.1).
dxv <- 10^(log10(dxv) + rnorm(1000, mean = 0, sd = 0.1))
summary(dxv)
#plot the resulting model for the distribution of tumor sizes at diagnosis in a closely-monitored (eg., high-risk) population.
library(lattice)
# Open a fresh graphics window and draw the modelled size-at-diagnosis density
# for the closely-monitored (high-risk) population.
# dev.new() opens the platform's default device, so the script is no longer
# tied to the macOS-only quartz() device.
dev.new()
# lattice functions return a trellis object; the explicit print() makes sure
# the plot is actually drawn when the script is run through source().
print(densityplot(dxv, type = "l", xlim = c(0, 200), xlab = "Tumor diameter",
  ylab = "Relative Frequency", main = "Assumed Distribution of Tumor Sizes
at Diagnosis in High-Risk Population", col = "red"))
# dx is a vector of 1000 tumor-size values, representing the distribution of sizes of clinically-detected tumors in the normal, unscreened population. These tumors are assumed to have sizes log-normally distributed around 8 cm, with the standard deviation of the log10(diameters) equal to 0.15. This is generally consistent with published and unpublished estimates of the diameters of serous tumors at clinical presentation. You can enter alternative values to explore how this would change the predictions. The mean and SD parameters for this model can be adjusted by inputting alternative values for "meandiameteratdx" and "SDlogdiam", respectively.
# Input the geometric mean of the tumor diameter at diagnosis (in mm) ("meandiameteratdx") and the standard deviation of the log10 of tumor diameters at diagnosis ("SDlogdiam") to be used for modeling sizes at diagnosis in the normal-risk patient population (the model will assume that tumor diameters follow a log-normal distribution).
# Geometric mean diameter at diagnosis (mm) and SD of log10(diameter) for the
# normal-risk population model.
meandiameteratdx <- 80
SDlogdiam <- 0.15
# 1000 log-normally distributed diameters (mm) at clinical diagnosis.
dx <- 10^rnorm(1000, mean = log10(meandiameteratdx), sd = SDlogdiam)
summary(dx)
#plot the resulting model for the distribution of tumor sizes at diagnosis in a routinely-monitored (eg., normal-risk) population.
# Open a fresh graphics window and draw the modelled size-at-diagnosis density
# for the routinely-monitored (normal-risk) population.
# dev.new() opens the platform's default device, so the script is no longer
# tied to the macOS-only quartz() device.
dev.new()
# lattice functions return a trellis object; the explicit print() makes sure
# the plot is actually drawn when the script is run through source().
print(densityplot(dx, type = "l", xlim = c(0, 200), xlab = "Tumor diameter",
  ylab = "Relative Frequency", main = "Assumed Distribution of Tumor Sizes
at Diagnosis in Normal-Risk Population", col = "red"))
#Input vectors of sizes of early and late tumors from the PBSO data summary.
#The earliertumorsizes vector represents the reported sizes of Stage CIS or I serous tumors discovered by PBSO in BRCA1 women, in the publications evaluated in our study of the natural history of serous ovarian cancer. The number of elements in this vector is larger than the number of tumors at each stage, because for each tumor whose size was given only as "microscopic", I generated 21 different sizes corresponding to the sizes of the 21 different tumors that were called "microscopic" (i.e. undetectable by gross examination and discovered only by microscopic examination), but for which the actual sizes were reported. For the tumors in each set whose sizes were explicitly reported, the corresponding sizes are represented 21 times. This gives us a distribution of sizes in which the specifically reported sizes are represented proportionately, and the tumor sizes defined only as "microscopic" are modeled exactly after the sizes of tumors that were called "microscopic" but for which the actual sizes were reported. The same is done for the earlytumorsizes vector, which represents the reported sizes of CIS, Stage I or StageII serous tumors discovered by PBSO in BRCA1 women, in the publications evaluated in our study of the natural history of serous ovarian cancer, and for the stage2sizes vector, which represents the reported sizes of Stage II serous tumors discovered by PBSO in BRCA1 women, in the publications evaluated in our study of the natural history of serous ovarian cancer.
# The 21 sizes (mm) that were actually reported for occult tumors found only on
# microscopic examination of PBSO specimens.
microscopic <- c(
  8, 7, 1.587401052, 2, 2, 1.2, 1, 1, 1.6, 1, 1,
  3.174802104, 10, 5, 8, 3, 2.908181412, 2.2, 0.9, 2, 4
)
# CIS and Stage I occult tumors: each explicitly sized tumor is repeated 21
# times, and each of the 12 "microscopic" tumors contributes one full copy of
# the 21 reference sizes.
earliertumorsizes <- c(
  rep(c(2, 4, 4, 4, 8, 15, 1, 7, 8, 2.2, 0.9, 7, 8), each = 21),
  rep(microscopic, times = 12)
)
# Number of tumors represented in earliertumorsizes.
length(earliertumorsizes) / 21
# Stage II occult tumors, built the same way (one "microscopic" tumor plus
# five explicitly sized tumors).
stage2sizes <- c(
  microscopic,
  rep(c(1.6, 1.6, 50, 3.2, 3), each = 21)
)
# Number of tumors represented in stage2sizes.
length(stage2sizes) / 21
# All CIS, Stage I and Stage II tumors together.
earlytumorsizes <- c(earliertumorsizes, stage2sizes)
# Reported sizes (mm) of the Stage III or IV serous tumors discovered by PBSO.
latetumorsizes <- c(100, 10, 5, 15, 15, 10)
# One entry per PBSO tumor: 3 = found at Stage III/IV, 2 = found at Stage II,
# 0 = otherwise.
stages <- c(rep(3, 6), rep(2, 6), rep(0, 25))
########################################################################################
# KAPLAN-MEIER ANALYSIS OF tumor PROGRESSION AS A FUNCTION OF SIZE
# There are four versions of this analysis:
# 1) Definition of tumor Progression = Stage III+; Data Source = raw (unsmoothed PBSO data)
# 2) Definition of tumor Progression = Stage III+; Data Source = smoothed PBSO data
# 3) Definition of tumor Progression = Stage II+; Data Source = raw (unsmoothed PBSO data)
# 4) Definition of tumor Progression = Stage II+; Data Source = smoothed PBSO data
#First analyze sensitivity in detection of cancers before they progress to Stage II or beyond. The earliertumorsizes vector represents the reported sizes of Stage CIS or I serous tumors discovered by PBSO in BRCA1 women, in the publications evaluated in our study of the natural history of serous ovarian cancer.
# Generate 1000 model tumor histories
# Model Features/Assumptions:
# A) Time of progression for each tumor is corrected to account for the fact that tumors progressed to their stage at detection at some time prior to the time of detection/diagnosis.
# B) tumors are assumed to all follow the same exponential growth pattern, with different rates for early stage (CIS,I,II) and late stage (III,IV). The early and late curve equation constants are determined below.
# (1) It 'corrects' the time at progression to late stage in order to account for the fact that the tumors progressed to their stage at discovery at some unknown time before they were discovered. The correction is based on the assumption that tumors spent an amount of time normally distributed around the typical stage duration before either progressing or becoming clinically apparent - ie. stage II tumors progressed from stage I at times normally distributed around 'stage2duration'; stage III and IV tumors progressed (from stage II) at times normally distributed around "Stage34duration" years before clinical diagnosis etc.
# (2) It assumes that "early-stage" tumors grow exponentially at one rate defined by a growth rate and intercept, and that "late-stage" tumors grow exponentially at another rate. The intercept essentially specifies the size at which tumors are detectable by histopathological examination of PBSO specimens. The early growth model also has an intercept "earlyintercept" that corresponds to the size at which tumors first become detectable by meticulous histopathological examination of PBSO specimens. The late growth model doesn't have an intercept, since this phase begins at whatever size the tumor progresses from "early" to "late". The growth rate estimates for "early-stage" (CIS, Stage I and Stage II), and "late stage" tumors (Stage III and IV) come from growth model optimizations (see below). Since the growth rates ("earlygrowthrate" and "lategrowthrate", respectively)are needed to generate the KM curves, which in turn are used later in the growth model optimizations, this is an recursive process, but it converges quickly. Note that these growth rates use base 10. The model is: size (in mm)=10^(ta+b) where t is time in years, a is the growth rate and b is the intercept. (b corresponds to log10 of the smallest tumor diameter detectable by meticulous examination of a PBSO)
# Calculate stage2 & stage34 intervals to be used in correcting for progression prior to time of diagnosis.
# One row per PBSO tumor (37), one column per bootstrap replicate (1000).
stage34interval <- matrix(nrow = 37, ncol = 1000)
stage2interval  <- matrix(nrow = 37, ncol = 1000)
uniform1000     <- matrix(nrow = 37, ncol = 1000)
for (tumor in seq_len(37)) {
  # Uniform(0, 1) fraction of the stage already elapsed at discovery.
  uniform1000[tumor, ] <- runif(1000, min = 0, max = 1)
  # Stage durations (years), log-normal around the point estimates
  # (sd = 0.05 on the log10 scale).
  stage34interval[tumor, ] <- 10^rnorm(1000, mean = log10(Stage34duration), sd = 0.05)
  stage2interval[tumor, ]  <- 10^rnorm(1000, mean = log10(Stage2duration), sd = 0.05)
}
#In this model, I'm assuming that "early-stage" tumors grow exponentially at one rate defined by a growth rate and intercept the intercept essentially specifies the size at which tumors are detectable by histopathological examination of PBSO specimesn, and "late-stage" tumors grow exponentially at another rate. The growth rate estimates for "early-stage" (CIS, Stage I and Stage II), and "late stage" tumors (Stage III and IV) come from growth model optimizations (see below). Since the growth rates ("earlygrowthrate" and "lategrowthrate", respectively)are needed to generate the KM curves, which in turn are used later in the growth model optimizations, this is an recursive process, but it converges quickly. The early growth model also has an intercept "earlyintercept" that corresponds to the size at which tumors first become detectable by meticulous histopathological examination of PBSO specimens. The late growth model doesn't have an intercept, since this phase begins at whatever size the tumor progresses from "early" to "late". Note that these growth rates use base 10. The model is: size (in mm)=10^(ta+b) where t is time in years, a is the growth rate and b is the intercept. (b corresponds to log10 of the smallest tumor diameter detectable by meticulous examination of a PBSO)
# Start with growth rates from previous rounds of modeling
# Base-10 growth-rate constants (per year) carried over from the previous
# round of modeling: late stage (III/IV) and early stage (CIS/I/II).
lategrowthrate  <- 0.55
earlygrowthrate <- 0.37
# -----------------------------------------------------------------------------
#KM module:
#Although a KM curve produced using sizes smoothed with a normal kernel will be used for the main calculations of sensitivity and changes in survival, it's useful to look at the KM curve for the unsmoothed size data (still corrected and implicitly "smoothed" a bit for the delay between progression and discovery). Here I generate the unsmoothed KM curve, correcting for the delay between progression of the tumors to Stage II+ and their discovery as Stage II, III or IV tumors by PBSO (average of 0.45 years => size correction factor =10^(some random time averaging 0.45 years)*lategrowthrate); based on the first round of modeling, which gave a base10 growth rate constant of "lategrowthrate" for StageIII and IV tumors combined).
# Create a table of 1000 bootstrap samples of stages: "bootstage", and sizes: "bootsizeCIS1" (CIS and Stage I), "bootsizeCIS12" (CIS, Stage I and II), "bootsize2" (Stage II), and "bootsize34" (Stage III and IV), respectively, by sampling with replacement. "bootsizesm" is allocated here and later filled with the smoothed version of the combined size table "bootsize".
# Bootstrap tables: 37 rows (one per PBSO tumor) by 1000 columns (one per
# bootstrap replicate). Stage tables first, then size tables.
bootstage     <- matrix(nrow = 37, ncol = 1000)
bootstage2    <- matrix(nrow = 37, ncol = 1000)
bootstage234  <- matrix(nrow = 37, ncol = 1000)
bootstage34   <- matrix(nrow = 37, ncol = 1000)
# Constant matrix of ones, used to form the complement of an indicator matrix.
bootone       <- matrix(1, nrow = 37, ncol = 1000)
bootsizeCIS1  <- matrix(nrow = 37, ncol = 1000)
bootsize2     <- matrix(nrow = 37, ncol = 1000)
bootsizeCIS12 <- matrix(nrow = 37, ncol = 1000)
bootsize34    <- matrix(nrow = 37, ncol = 1000)
bootsize234   <- matrix(nrow = 37, ncol = 1000)
bootsizesm    <- matrix(nrow = 37, ncol = 1000)
# Fill each replicate by sampling with replacement from the observed stages
# and from the stage-specific size pools.
for (i in seq_len(1000)) {
  bootstage[, i]    <- sample(stages, 37, replace = TRUE)
  bootsizeCIS1[, i] <- sample(earliertumorsizes, 37, replace = TRUE)
  bootsize2[, i]    <- sample(stage2sizes, 37, replace = TRUE)
  bootsize34[, i]   <- sample(latetumorsizes, 37, replace = TRUE)
}
# Correct tumor sizes for the interval between progression and detection by PBSO using stage34 interval (above) and the late growth rates from the model. We assume for this correction that tumors grow at a rate of "earlygrowthrate" at StageII and "lategrowthrate" at Stages III and IV.
# Shrink the Stage III/IV sizes back to the (randomly placed) moment of
# progression: a uniform fraction of the stage III/IV interval at the late
# growth rate.
bootsize34 <- bootsize34 * 10^(-lategrowthrate * (uniform1000 * stage34interval))
# 0/1 indicator matrices derived from the bootstrapped stage codes:
#   bootstage2   - Stage II only
#   bootstage34  - Stage III or IV
#   bootstage234 - Stage II, III or IV
bootstage2   <- (bootstage == 2) * 1
bootstage34  <- (bootstage >= 3) * 1
bootstage234 <- (bootstage >= 2) * 1
# Pick each tumor's size from the pool matching its stage: the CIS/I pool when
# not Stage II+, the Stage II pool, or the corrected Stage III/IV pool.
bootsize <- ((bootone - bootstage234) * bootsizeCIS1) +
  (bootstage2 * bootsize2) +
  (bootstage34 * bootsize34)
# Need "survival" library for the KM analysis.
library(survival)
# KM Version 1: Modeling tumor progression as a function of tumor size: Progression definition= Stage III,IV; Data = raw (unsmoothed) PBSO sizes. # This analysis was used for Figure 2 panel B.
# KMbootstrap34raw is a matrix in which the columns are 1000 Kaplan-Meier "survival" (i.e. fraction of tumors not yet progressed to Stage III+) estimates as a function of size in mm (row number) derived from the 1000 sets of size/stage bootstrap samples of the PBSO data. Each row represents a tumor diameter, in mm (i.e., row 1 represents 1 mm). It has 251 rows to ensure that even unreasonably large sizes can be accommodated.
# KM v1 (progression = Stage III+, raw sizes): one Kaplan-Meier curve per
# bootstrap replicate, evaluated on the size grid 0..250 mm.
# Rows beyond the last size returned by summary() keep the initial value 0.
# NOTE(review): the grid starts at 0, so row r appears to hold the estimate at
# size r - 1 mm rather than r mm - confirm against the intended axis.
KMbootstrap34raw <- matrix(0, nrow = 251, ncol = 1000)
for (i in seq_len(1000)) {
  # survfit() needs a formula; "~ 1" requests a single overall curve. Passing
  # a bare Surv object is rejected by current releases of the survival package.
  KMi <- survfit(Surv(bootsize[, i], bootstage34[, i]) ~ 1)
  temp <- summary(KMi, times = seq(0, 250, 1))
  L <- length(temp$surv)
  # seq_len() is safe even if summary() returns no rows.
  KMbootstrap34raw[seq_len(L), i] <- temp$surv
}
# Per-size summary across the 1000 curves: columns are the 5th, 25th, 50th,
# 75th and 95th percentiles of the not-yet-progressed fraction.
KMtable34raw <- matrix(nrow = 250, ncol = 5)
for (i in seq_len(250)) {
  KMtable34raw[i, ] <- quantile(KMbootstrap34raw[i, 1:1000],
    probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
}
# Summarize and plot results for KM v1: Modeling tumor progression as a function of tumor size (V1): Progression definition= Stage III,IV; Data = raw (unsmoothed) PBSO sizes.
#KM table is the raw material for the Kaplan-Meier curve summary derived from 1000 KM curves based on 1000 bootstrap samples using the raw, unsmoothed data from PBSOs. Each row number corresponds to a tumor diameter in mm. The columns correspond to the 5, 25, 50, 75, and 95 percentiles (as computed by the quantile() calls above) for the fraction not yet advanced to Stage III or IV at each tumor diameter. In other words, the KM curves will represent the percent of tumors not-yet-progressed as a function of tumor diameter, the "3" column represents the median KM curve from the 1000 simulations and the (1,5) and (2,4) columns represent the 90% and the 50% (IQR) intervals respectively.
# Print the KM v1 percentile table as percentages.
print(100 * KMtable34raw[1:250, ], digits = 1)
# Plot the median, 25%ile and 75%ile KM curves.
# dev.new() opens the platform's default device, so the script is no longer
# tied to the macOS-only quartz() device. The 2x2 layout set here is shared by
# the KM plots that follow.
dev.new(height = 7, width = 7)
par(mfrow = c(2, 2), pin = c(2, 2), cex.main = 0.9, mgp = c(2.2, 0.8, 0))
Color <- c("tomato", "black", "tomato")
width <- c(0.5, 1, 0.5)
plot(KMtable34raw[, 2], type = "l", xlim = c(0, 100), ylim = c(0, 1),
  ylab = "Probability Stage CIS, I or II", xlab = "Tumor Diameter (mm)",
  col = Color[1], lwd = width[1], main = "Summary of Kaplan-Meier Analysis of 1000
Bootstrap Samples of Occult Tumor Sizes
with No Smoothing")
# Overlay the median (column 3) and 75th percentile (column 4). The axis limits
# are fixed by plot(); xlim/ylim are not graphical parameters for lines() and
# only produced warnings, so they are no longer passed.
for (i in 3:4) {
  lines(KMtable34raw[, i], type = "l", col = Color[i - 1], lwd = width[i - 1])
}
#KMbootstrap34sm---------------------------------------------
# KM Version 2: Modeling tumor progression as a function of tumor size: Progression definition= Stage III,IV; Data = SMOOTHED PBSO sizes (produced by adding to the log10 sizes a normal kernel - rnorm(mean=0, sd=0.2), and correcting for delay between progression to Stage III+ and detection by PBSO as above.
# This analysis was used for Figure 2 panel B.
# Normal smoothing kernel on the log10 scale (sd = 0.2): one column of 37
# draws per bootstrap replicate, generated column by column.
smooth <- vapply(
  seq_len(1000),
  function(replicate) rnorm(37, mean = 0, sd = 0.2),
  numeric(37)
)
# Smoothed sizes: jitter log10(size) and transform back to mm.
bootsizesm[] <- 10^(log10(bootsize) + smooth)
# KM v2 (progression = Stage III+, smoothed sizes): one Kaplan-Meier curve per
# bootstrap replicate, evaluated on the size grid 0..250 mm.
# Rows beyond the last size returned by summary() keep the initial value 0.
# NOTE(review): the grid starts at 0, so row r appears to hold the estimate at
# size r - 1 mm rather than r mm - confirm against the intended axis.
KMbootstrap34sm <- matrix(0, nrow = 251, ncol = 1000)
for (i in seq_len(1000)) {
  # survfit() needs a formula; "~ 1" requests a single overall curve. Passing
  # a bare Surv object is rejected by current releases of the survival package.
  KMi <- survfit(Surv(bootsizesm[, i], bootstage34[, i]) ~ 1)
  temp <- summary(KMi, times = seq(0, 250, 1))
  L <- length(temp$surv)
  # seq_len() is safe even if summary() returns no rows.
  KMbootstrap34sm[seq_len(L), i] <- temp$surv
}
# Per-size summary across the 1000 curves: columns are the 5th, 25th, 50th,
# 75th and 95th percentiles of the not-yet-progressed fraction.
KMtable34sm <- matrix(nrow = 250, ncol = 5)
for (i in seq_len(250)) {
  KMtable34sm[i, ] <- quantile(KMbootstrap34sm[i, 1:1000],
    probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
}
# Summarize and plot results for KM v2: Modeling tumor progression as a function of tumor size: Progression definition= Stage III,IV; Data = SMOOTHED PBSO sizes.
# Print the KM v2 percentile table as percentages.
print(100 * KMtable34sm[1:250, ], digits = 1)
# Plot the median, 25%ile, 75%ile KM curves.
Color <- c("tomato", "black", "tomato")
width <- c(0.5, 1, 0.5)
plot(KMtable34sm[, 2], type = "l", xlim = c(0, 100), ylim = c(0, 1),
  ylab = "Probability Stage CIS, I or II", xlab = "Tumor Diameter (mm)",
  col = Color[1], lwd = width[1], main = "Figure 2")
# Overlay the median (column 3) and 75th percentile (column 4). The axis limits
# are fixed by plot(); xlim/ylim are not graphical parameters for lines() and
# only produced warnings, so they are no longer passed.
for (i in 3:4) {
  lines(KMtable34sm[, i], type = "l", col = Color[i - 1], lwd = width[i - 1])
}
# -----------------------------------------------------------
# KM Version 3: Modeling tumor progression as a function of tumor size: Progression definition= Stage II,III,IV; Data = raw (unsmoothed) PBSO sizes
# In this analysis we need to further correct for the delay between progression of the tumors to Stage II+ and their discovery as Stage II tumors by PBSO. The early growth rate and late growth rate are from an early round of modeling (also set above). We assume for this correction that tumors grow at a rate of "earlygrowthrate" at StageII and "lategrowthrate" at Stages III and IV.
# This analysis was used for Supplemental Figure S2 panel A.
# Draw fresh bootstrap size samples for the Stage II+ analysis (the stage
# table "bootstage" and its indicator matrices are reused unchanged).
for (i in seq_len(1000)) {
  bootsizeCIS1[, i] <- sample(earliertumorsizes, 37, replace = TRUE)
  bootsize2[, i] <- sample(stage2sizes, 37, replace = TRUE)
  bootsize34[, i] <- sample(latetumorsizes, 37, replace = TRUE)
}
# Correct tumor sizes back to the moment of progression to Stage II.
# Stage III/IV tumors: undo growth at "lategrowthrate" over a uniform fraction
# of the stage III/IV interval, plus growth at "earlygrowthrate" over the whole
# stage II interval. The two log10 shrinkages are summed; previously a
# misplaced parenthesis multiplied the stage II term by lategrowthrate as well,
# contradicting the stated assumption that Stage II grows at earlygrowthrate.
bootsize34 <- bootsize34 *
  10^(-(lategrowthrate * uniform1000 * stage34interval +
    earlygrowthrate * stage2interval))
# Stage II tumors: undo growth at "earlygrowthrate" over a uniform fraction of
# the stage II interval.
bootsize2 <- bootsize2 * 10^(-earlygrowthrate * (uniform1000 * stage2interval))
# Pick each tumor's size from the pool matching its bootstrapped stage.
bootsize <- ((bootone - bootstage234) * bootsizeCIS1) +
  (bootstage34 * bootsize34) +
  (bootstage2 * bootsize2)
# KM v3 (progression = Stage II+, raw sizes): one Kaplan-Meier curve per
# bootstrap replicate, evaluated on the size grid 0..250 mm.
# Rows beyond the last size returned by summary() keep the initial value 0.
# NOTE(review): the grid starts at 0, so row r appears to hold the estimate at
# size r - 1 mm rather than r mm - confirm against the intended axis.
KMbootstrap234raw <- matrix(0, nrow = 251, ncol = 1000)
for (i in seq_len(1000)) {
  # survfit() needs a formula; "~ 1" requests a single overall curve. Passing
  # a bare Surv object is rejected by current releases of the survival package.
  KMi <- survfit(Surv(bootsize[, i], bootstage234[, i]) ~ 1)
  temp <- summary(KMi, times = seq(0, 250, 1))
  L <- length(temp$surv)
  # seq_len() is safe even if summary() returns no rows.
  KMbootstrap234raw[seq_len(L), i] <- temp$surv
}
# Per-size summary across the 1000 curves: columns are the 5th, 25th, 50th,
# 75th and 95th percentiles of the not-yet-progressed fraction.
KMtable234raw <- matrix(nrow = 250, ncol = 5)
for (i in seq_len(250)) {
  KMtable234raw[i, ] <- quantile(KMbootstrap234raw[i, 1:1000],
    probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
}
# Summarize and plot results for KM v3: Modeling tumor progression as a function of tumor size: Progression definition= Stage II,III,IV; Data = raw (unsmoothed) PBSO sizes.
# Print the KM v3 percentile table as percentages.
print(100 * KMtable234raw[1:250, ], digits = 1)
# Plot the median, 25%ile, 75%ile KM curves.
Color <- c("tomato", "black", "tomato")
width <- c(0.5, 1, 0.5)
plot(KMtable234raw[, 2], type = "l", xlim = c(0, 100), ylim = c(0, 1),
  ylab = "Probability CIS or Stage I", xlab = "Tumor Diameter (mm)",
  col = Color[1], lwd = width[1], main = "Summary of Kaplan-Meier Analysis of 1000
Bootstrap Samples of Occult Tumor Sizes
with No Smoothing")
# Overlay the median (column 3) and 75th percentile (column 4). The axis limits
# are fixed by plot(); xlim/ylim are not graphical parameters for lines() and
# only produced warnings, so they are no longer passed.
for (i in 3:4) {
  lines(KMtable234raw[, i], type = "l", col = Color[i - 1], lwd = width[i - 1])
}
# --------------------------------------------------------------------
# KM Version 4: Modeling tumor progression as a function of tumor size: Progression definition= Stage II,III,IV; Data = SMOOTHED PBSO sizes
# This analysis was used for Supplemental Figure S2 panel B.
# Smooth the Stage II+ corrected sizes with the same log10-scale kernel
# ("smooth") that was used for the Stage III+ analysis.
bootsizesm[, ] <- 10^(log10(bootsize[, ]) + smooth[, ])
# KM v4 (progression = Stage II+, smoothed sizes): one Kaplan-Meier curve per
# bootstrap replicate, evaluated on the size grid 0..250 mm.
# Rows beyond the last size returned by summary() keep the initial value 0.
# NOTE(review): the grid starts at 0, so row r appears to hold the estimate at
# size r - 1 mm rather than r mm - confirm against the intended axis.
KMbootstrap234sm <- matrix(0, nrow = 251, ncol = 1000)
for (i in seq_len(1000)) {
  # survfit() needs a formula; "~ 1" requests a single overall curve. Passing
  # a bare Surv object is rejected by current releases of the survival package.
  KMi <- survfit(Surv(bootsizesm[, i], bootstage234[, i]) ~ 1)
  temp <- summary(KMi, times = seq(0, 250, 1))
  L <- length(temp$surv)
  # seq_len() is safe even if summary() returns no rows.
  KMbootstrap234sm[seq_len(L), i] <- temp$surv
}
# Per-size summary across the 1000 curves: columns are the 5th, 25th, 50th,
# 75th and 95th percentiles of the not-yet-progressed fraction.
KMtable234sm <- matrix(nrow = 250, ncol = 5)
for (i in seq_len(250)) {
  KMtable234sm[i, ] <- quantile(KMbootstrap234sm[i, 1:1000],
    probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
}
# Summarize and plot results for KM v4: Modeling tumor progression as a function of tumor size: Progression definition= Stage II,III,IV; Data = SMOOTHED PBSO sizes. (Supplemental Figure S2)
# The plot has the same layout as the previous KM plots.
# Print the KM v4 percentile table as percentages.
print(100 * KMtable234sm[1:250, ], digits = 1)
# Plot the median, 25%ile, 75%ile KM curves.
Color <- c("tomato", "black", "tomato")
width <- c(0.5, 1, 0.5)
plot(KMtable234sm[, 2], type = "l", xlim = c(0, 100), ylim = c(0, 1),
  ylab = "Probability CIS or Stage I", xlab = "Tumor Diameter (mm)",
  col = Color[1], lwd = width[1], main = "Figure S3")
# Overlay the median (column 3) and 75th percentile (column 4). The axis limits
# are fixed by plot(); xlim/ylim are not graphical parameters for lines() and
# only produced warnings, so they are no longer passed.
for (i in 3:4) {
  lines(KMtable234sm[, i], type = "l", col = Color[i - 1], lwd = width[i - 1])
}
##############################################################################################
# GROWTH CURVE OPTIMIZATION
# EARLY STAGE GROWTH CURVE (CIS, I, II)
# The goal of this section is to derive slope and intercept parameters for a first-order exponential growth model for the tumors during the time they are either CIS, stage I or stage II. We assume that the growth of each tumor follows the same pattern during these early stages.
#growthcurve optimization. The next step is to derive slope and intercept parameters for a first-order exponential growth model for the tumors during the time they are either CIS, stage I or stage II. We make the assumption that the growth of each tumor follows the same first-order model with a constant growth rate during these early stages.
# Analysis Overview:
# 1) Determine the sizes at progression for 1000 model tumors.
# 2) Determine the optimal pair of growth rate and intercept for the early growth curve based on agreement with observed PBSO data.
# Note: The size at progression is determined both for progression to Stage III+ (Version1) and to Stage II+ (Version2)
#Start by generating the "progressionpt" matrix, which uses the results from the 1000 KM analyses (of 1000 smoothed bootstrap samples of the actual size data) as the source for generating the sizes at progression for 1000 tumors (by assigning one patient to each of the 1000 KM curves derived above, and using a random number between 0-1 to choose where that patient falls in the percentile scale of size at progression (i.e. a value of 0.01 means the patient is in the first percentile, meaning that her tumor was in the first 1% to progress). The corresponding size is identified from the patients' assigned KMcurve and this is the value assigned to that row in the 2nd column of the progressionpt matrix. The value in the first column corresponds to the "age" of the tumor at the time of progression, with age zero corresponding to the earliest time at which the tumor is detectable by microscopic examination of a PBSO specimen, and the age corresponding to that tumor size derived from the tumor growth model parameters (see below).
# Size at progression for 1000 model tumors - Version 1: Progression = Stage III+; Data source = smoothed PBSO data.
# Each of the 1000 KM curves derived above (KM version 2) is used to determine the size at progression for a model tumor/patient. Use a random number between 0-1 to choose where that patient falls in the percentile scale of size at progression (i.e. a value of 0.01 means the patient is in the first percentile, meaning that her tumor was in the first 1% to progress). The corresponding size is identified from the patients' assigned KMcurve and stored in the 2nd column of the progressionpt matrix.
# "run" is a matrix filled with numbers randomly sampled from a uniform distribution between 0 and 1.
# "run" holds the Uniform(0, 1) draws compared against the conditional
# progression fraction at each row of a tumor's KM curve.
run <- matrix(nrow = 251, ncol = 1000)
# sizeprog[k] is the last KM row index that model tumor k passed without
# progressing; it stays at 1 when the tumor progresses at the first row.
sizeprog <- rep(1, 1000)
for (i in seq_len(1000)) {
  run[, i] <- runif(251, min = 0, max = 1)
  km_curve <- KMbootstrap34sm[, i]
  j <- 1
  while (j <= 250) {
    # Fraction of still-unprogressed tumors that progress between row j and
    # row j + 1 of this tumor's KM curve.
    step_fraction <- (km_curve[j] - km_curve[j + 1]) / km_curve[j]
    if (step_fraction > run[j, i]) {
      break
    }
    sizeprog[i] <- j
    j <- j + 1
  }
}
summary(sizeprog, digits = 2)
# Column 2 of progressionpt stores the size at progression; column 1 is left
# unfilled here.
progressionpt <- matrix(nrow = 1000, ncol = 2)
progressionpt[, 2] <- sizeprog
# Determine the growth rate and intercept for the early growth curve.
# Modeling Approach:
# 1) Find the top 10 parameter pairs - growth rate ("earlygrowthrate") and intercept ("earlyintercept") - that minimize the RMS difference between observed sizes of early stage PBSO tumors and the model's predicted sizes of early stage tumors
# 2) Select from among the top 10 models from step 1 - choose the one that predicts an average duration of the early stage closest to 4.3 years (the estimate from the prevalence and incidence data).
# Model features/assumptions:
# tumor size at clinical diagnosis are modeled based on van Nagell (2007). For cancers detected upon screening in the study, we used the reported tumor size. Unfortunately, sizes of cancers detected between screens were not reported, so we used mean = 8 for these tumors. The distribution was smoothed as described above.
# Note that the starting values for the optimization of both the growth rate and intercept were chosen based on a preliminary round of optimization using the same strategy. This is not encoded here as the iteration converges in a couple of rounds.
# Work arrays and starting values for the early-stage growth-curve grid search.

# log10 of the size at which each model tumor either progresses to Stage III+
# or is diagnosed, whichever comes first (either event ends the early occult
# period). pmin() takes the element-wise minimum across the 1000 tumors.
endpoints <- log10(pmin(progressionpt[, 2], dxv))
# Time in years during which a tumor is detectable by PBSO and still early stage.
progyear <- vector("numeric", length = 1000)
# Pooled log10(sizes) of all model tumors at every pre-progression time step
# (1000 tumors x at most 200 time steps).
earlysizedistribution <- vector("numeric", length = 200000)
# One row per (growth rate, intercept) pair: 24 x 24 = 576 combinations.
# NOTE(review): this name masks stats::optimize() for the rest of the script.
optimize <- matrix(nrow = 576, ncol = 4)
# Model growth curve: log10(size in mm) as a function of time in units of
# 1/20 year, pre-filled with the sentinel -1. The previous
# "Egrowthcurve <--1" replaced the preallocated vector with the scalar -1
# instead of filling it.
Egrowthcurve <- rep(-1, 200)
# 1000 log10(tumor sizes) sampled with replacement from the early-stage PBSO
# sizes ("earlytumorsizes"), sorted from small to large.
earlysizes <- sort(log10(sample(earlytumorsizes, 1000, replace = TRUE)), decreasing = FALSE)
earlytest <- vector(mode = "numeric", length = 1000)
t2 <- 0 # t2 counts number of cycles of optimization of (growthrate,intercept)
# a is the parameter for incrementing the growthrate (by a*0.0025 per cycle with a starting value of astart).
astart <- 0.32
alast <- 24
for (a in 1:alast)
{
growthrate<-astart+a*0.0025
#b is the parameter for incrementing the intercept (by 0.005 per cycle with a starting value of bstart).
bstart<--0.34
blast<-24
for (b in 1:blast)
{
intercept<-bstart+b*0.005
t2<-t2+1
#j counts time in 1/20 year increments
maxj<-0
earlysizedistribution[]<-0
# Populate 'working' Egrowthcurve - the model earlygrowth curve based on the current growth parameters (a,b). The Egrowthcurve[i] is the size of the tumor at i/20 years.
# Variable j counts time in 1/20 year increments and stops counting at either i) 200 or ii) when the model calculates size at that age to be >250 (mm diameter)
for (j in 1:200)
{
Egrowthcurve[j]<-(growthrate*j/20)+intercept
if (10^Egrowthcurve[j]>250) break
maxj<-max(maxj,j)
}
# Populate 'working' earlysizedistribution - sizes of the 1000 model early tumors prior to progression (to Stage III+) based on the current model.
# "total" keeps track of number of pre-progression timepoints (sum across all tumors)
#Egrowthcurve is the model growth curve for slope=astart+a*0.0025 and intercept=bstart+b*0.005
#i increments individual tumor growthcurves/progression simulations. earlysizedistribution contains the log10(sizes) for each time "j" (1/20th of a year increments), for each tumor "i", up till that tumor progresses or is diagnosed.
total<-0
for (i in 1:1000)
{
progyear[i]<-(endpoints[i]-intercept)/growthrate
if (round(20*progyear[i])<1) next#avoid values of zero.
if (round(20*progyear[i])>maxj) next#don't waste cycles on times corresponding to unreasonable sizes.
for (time in 1: round(20*progyear[i]))
{
earlysizedistribution[total+1]<-Egrowthcurve[time]
total<-total+ 1
}
}
# For each set of parameters (a,b), calculate the RMS difference between the distribution of pre-progression sizes of the model tumors and those of early stage PBSO tumors.
earlytest<-sort(sample(earlysizedistribution[1:total],1000,replace=TRUE),decreasing=FALSE)
sumofsquares<-0
for (s in 1:1000)
sumofsquares<-sumofsquares+(earlytest[s]-earlysizes[s])^2
optimize[t2,1]<-growthrate
optimize[t2,2]<-intercept
optimize[t2,3]<-sumofsquares
optimize[t2,4]<-total/20000#since total counts time before diagnosis in increments of 1/20 year for 1000 model tumors, divide by 20,000 to get the average number of years a tumor spends as an occult tumor (detectable by histopathology, but not clinically apparent).
}
}
# Rank the candidate models by column 3 of `optimize` (smallest squared difference first) and
# print growth rate, intercept, squared difference and average early-phase duration for each.
best <- order(optimize[, 3])
optimized <- optimize[best, ]
print(optimized, digits = 2)
# Among the 10 best-fitting models, pick the one whose predicted early-phase duration
# (column 4) is closest to Earlystageduration.
durationgap <- abs(optimized[1:10, 4] - Earlystageduration)
best1 <- order(durationgap)
optimized1 <- optimized[best1, ]
# The first row of optimized1 is the selected model.
earlygrowthrate <- optimized1[1, 1]
earlyintercept <- optimized1[1, 2]
print(earlygrowthrate, digits = 2)
print(earlyintercept, digits = 2)
# Rebuild the pool of pre-progression sizes using the selected parameters
# (earlygrowthrate, earlyintercept) and plot it against the observed PBSO sizes.
earlysizedistribution[] <- 0
# Final Egrowthcurve: the value at index j is log10(size) at age j/20 years, j = 1..200.
steps <- 1:200
Egrowthcurve[steps] <- (earlygrowthrate * steps / 20) + earlyintercept
# Same inclusion rule as the optimization loop: only time steps at or below 250 mm are usable.
# Without this guard a tumor needing more than 200 steps reads past the end of Egrowthcurve,
# which yields NA; sort() then drops the NA and the plot below fails on a length mismatch.
toolarge <- which(10^Egrowthcurve[steps] > 250)
maxj <- if (length(toolarge) > 0) toolarge[1] - 1 else 200
# total keeps track of the number of pre-progression time points (summed across all tumors).
total <- 0
for (i in 1:1000) {
  progyear[i] <- (endpoints[i] - earlyintercept) / earlygrowthrate
  nsteps <- round(20 * progyear[i])
  if (nsteps < 1) next      # tumor ends its early phase before the first time step
  if (nsteps > maxj) next   # tumor would need curve values beyond the usable range
  # Append this tumor's sizes at steps 1..nsteps.
  earlysizedistribution[total + seq_len(nsteps)] <- Egrowthcurve[seq_len(nsteps)]
  total <- total + nsteps
}
# 1000 sorted draws from the model pool, to compare with earlysizes (the sorted PBSO sample).
earlytest <- sort(sample(earlysizedistribution[1:total], 1000, replace = TRUE), decreasing = FALSE)
quartz(height = 9, width = 5)
par(mfrow = c(2, 1), pin = c(3, 3))
# Plot the distribution of sizes of early tumors predicted by the model (red curve) versus those observed in PBSOs (blue points)
plot(1:1000,earlytest,type="l",col="red",lwd=1,xlab="sizes ranked",ylab="log10(diameter in mm)",main="model (red) vs observed (blue)
Stage CIS,I & II tumor sizes")
points(1:1000, earlysizes, col = "blue", cex = 1, bg = "blue", pch = 21)
# ########################################################################################
# LATE GROWTH CURVE OPTIMIZATION
# We assume that all late stage tumors (Stage III,IV) follow the same exponential growth curve.
# Approach:
# 1) Determine which tumors were clinically diagnosed before progression. There are two version of this analysis:
# Version 1: High risk (frequently monitored population)
# 2) Find the late growth rate parameter that gives the best match to the observed interval of 0.9 months average between progression and clinical detection. This part is only done for the high-risk population.
#Now figure out growth curve for late tumors. The idea here is to take the distribution of sizes at which tumors progress (from the KM analysis) and by successive approximation find values for the rate constant, in an exponential model relating the tumor size to time since progression, that gives the best match to the observed interval of 0.9 months average between progression and clinical detection.
# late growth rate is calculated as the average (log10(fold growth as an occult Stage III or IV tumor)/time tumors spend as occult Stage III or IV tumors) for 1000 tumor "late occult growth histories" generated by bootstrap sampling from the van Nagell size at diagnosis data and deriving size at progression from the 1000 model tumor histories based on the bootstrap KM analysis.
# Late growth rate: mean over tumors of log10(size at diagnosis / size at progression) / Stage34duration.
# Only tumors that progress at or below their size at diagnosis have a late occult phase;
# the others are skipped.
sumslope <- 0
nlate <- 0  # number of tumors that actually contribute a slope
#i increments successive individual tumor "late occult growth histories"
for (i in 1:1000) {
  if (progressionpt[i, 2] > dxv[i]) next
  sumslope <- sumslope + (log10(dxv[i] / progressionpt[i, 2])) / Stage34duration
  nlate <- nlate + 1
}
# Average over the tumors that contributed. The previous code divided by 1000 even though
# skipped tumors added nothing to the sum, which pulled the mean towards zero.
# NOTE(review): this changes the numeric value of Lategrowthrate - confirm against the intended model.
Lategrowthrate <- if (nlate > 0) sumslope / nlate else NA_real_
print(Lategrowthrate, digits = 2)
# Now generate a plot comparing the distribution of sizes of Stage III and IV occult cancers predicted by the model with the distribution of sizes of StageIII and IV tumors found at PBSO.
# Populate Lgrowthcurve such that the value at index i, week is the size of tumor i at week "week".
# Lgrowthcurve[week, i] is the size of model tumor i at `week` weeks after progression,
# or -1 for weeks at which the tumor has already reached its (randomly drawn) size at diagnosis.
Lgrowthcurve <- matrix(-1, nrow = 200, ncol = 1000)
for (i in 1:1000) {
  # Size at diagnosis for this late history, drawn at random from dxv.
  dxsize <- sample(dxv, 1, replace = TRUE)
  for (week in 1:200) {
    # Exponential growth from the size at progression: size0 * 10^(rate * years).
    # The exponent must be parenthesised: `10^Lategrowthrate*week/52` evaluates as
    # (10^Lategrowthrate) * week / 52, which is linear in time and starts far below size0.
    latesize <- progressionpt[i, 2] * 10^(Lategrowthrate * week / 52)
    if (latesize > dxsize) break
    Lgrowthcurve[week, i] <- latesize
  }
}
#Generate vector latedist containing all the sizes for each week after progression for each tumor
# latedist pools, tumor by tumor, the weekly post-progression sizes stored in Lgrowthcurve.
# For each tumor only the leading run of non-negative entries is taken (collection stops at
# the first negative entry); total counts how many values have been stored.
latedist <- numeric(200000)
total <- 0
for (i in 1:1000) {
  firstnegative <- match(TRUE, Lgrowthcurve[1:200, i] < 0)
  nweeks <- if (is.na(firstnegative)) 200 else firstnegative - 1
  kept <- seq_len(nweeks)
  latedist[total + kept] <- Lgrowthcurve[kept, i]
  total <- total + nweeks
}
# Plot the predicted (red curve) and observed (blue points) distribution of sizes of tumors that are late stage but pre-diagnosis (diagnosis in a frequently monitored population)
# Observed sizes: each value of latetumorsizes is repeated 100 times, sorted, and plotted on a
# log10 scale against its rank rescaled to (0, 1].
nobserved <- 100 * length(latetumorsizes)
observedlog <- log10(sort(rep(latetumorsizes, 100), decreasing = FALSE))
plot(seq(1, nobserved) / nobserved, observedlog,
     type = "p", col = "blue", cex = 1, pch = 19,
     xlab = "sizes ranked", ylab = "log10(diameter in mm)",
     main = "model (red) vs observed (blue) Stage III & IV tumor sizes")
# Model sizes: the first `total` entries of latedist, sorted and plotted the same way.
modellog <- log10(sort(latedist[1:total], decreasing = FALSE))
lines(seq(1, length(modellog)) / length(modellog), modellog, type = "l", col = "red", lwd = 1)
# Summary of late growth results:
# The results indicate that a first order growth rate constant (base 10) of about 0.5 gives a reasonable fit to the observed duration of 0.9 years.
# Since the growth rate parameter for advanced-stage tumors is about 0.5 per year (base 10) and 10^0.5 is approximately 3, this implies that after progression to Stage III or IV, tumors grow about 3-fold per year in diameter, or about 31-fold in volume (about 2-fold in diameter and 6-fold in volume per 6 months).
# Print the fraction of tumors diagnosed before progression to stage III or IV (in high-risk population) as predicted by the model.
# Determine the estimate from the early growth model for the fraction of tumors that progress to Stage III or IV before diagnosis, using the size distribution at clinical diagnosis estimated for closely-monitored patients (i.e. from the 2007 van Nagell paper) "prediagnosisVN", and for a normal risk population "prediagnosisNR"
# Populate 'prediagnosis': start with 0 and change to 1 if size at progression > size at dx.
# Flag (1/0) for each of the 1000 model tumors: 1 when the size at progression to Stage III+
# (progressionpt[,2]) is larger than the size at diagnosis, i.e. the tumor is diagnosed first.
#   prediagnosisVN uses dxv, prediagnosisNR uses dx as the size at diagnosis.
progsize <- progressionpt[1:1000, 2]
prediagnosisVN <- numeric(1000)
prediagnosisNR <- numeric(1000)
prediagnosisNR[which(progsize > dx[1:1000])] <- 1
prediagnosisVN[which(progsize > dxv[1:1000])] <- 1
# Fraction of tumors flagged when dxv is the size at diagnosis.
print(mean(prediagnosisVN), digits = 2)
# Fraction of tumors flagged when dx is the size at diagnosis.
print(mean(prediagnosisNR), digits = 2)
# ###################################################################################################
# PLOTTING MODEL tumor NATURAL HISTORIES (GROWTH, PROGRESSION, DIAGNOSIS)
# Using the early and late growth curve parameters optimized above, we can plot model tumor natural histories. Specifically, we will plot tumor size as a function of time and indicate points of progression and clinical detection. Since clinical detection depends on whether or not the pop. is closely monitored we consider two scenarios:
# 1) High-risk (closely monitored) population: sizes at diagnosis derived from van Nagell 2007.
# 2) Normal-risk population: sizes at diagnosis log-normally distributed about 8cm.
#diagnosis is a matrix of the sizes at diagnosis of serous cancers in the 2007 Van Nagell paper (column 2), and the duration (in years) of the occult period (column 1), derived from the sizes at diagnosis using the growth curve models.
# diagnosis: column 2 = size at diagnosis (dxv), column 1 = the age in years at which the
# early growth model reaches that size.
diagnosis <- matrix(nrow = 1000, ncol = 2)
diagnosis[, 2] <- dxv[]
diagnosis[, 1] <- (log10(diagnosis[, 2]) - earlyintercept) / earlygrowthrate
# Recompute the age at progression (column 1 of progressionpt) from the size at progression
# (column 2) using the optimized growth rate and intercept.
tumors <- 1:1000
progressionpt[tumors, 1] <- (log10(progressionpt[tumors, 2]) - earlyintercept) / earlygrowthrate
# Generate and plot natural histories of 50 model tumors - Version 1: high-risk/frequently monitored population.
# size is a matrix of 100 simulated growth curves, with each row representing a month, with values corresponding to the diameters of the tumors, the 50 "model" tumors are represented by the successive curves starting at monthly intervals (i.e. tumor 1 (column 1) starts (i.e. begins being potentially detectable by PBSO) at month 1, while tumor 50 (column 50) starts at month 50).
# NOTE(review): this section is incomplete as it appears here. The `if` line below is not
# valid R (`(month-curve)` is followed directly by `diagnosis[...]` with no operator), the
# loop that defines `i` is not visible, and the braces opened by the two `for` loops are not
# all closed. Text appears to have been lost; restore it from an intact copy before running.
size<-matrix(nrow=200,ncol=100)
for (curve in 1:100)
{
for (month in 1:200)
{
# Visible remainder: when the (lost) condition holds, a limegreen filled circle is drawn at
# x = 2*i/12 + diagnosis[2*i,1], y = diagnosis[2*i,2]; otherwise a red filled circle is drawn
# at the same diagnosis point and a small blue triangle at the progression point of tumor 2*i.
if ((month-curve)diagnosis[2*i,2])points((2*i/12)+diagnosis[2*i,1],diagnosis[2*i,2],type="p",col="limegreen", cex=1,pch=19)else
{
points((2*i/12)+diagnosis[2*i,1],diagnosis[2*i,2],type="p",col="red",cex=1,pch=19)
points((2*i/12)+progressionpt[2*i,1],progressionpt[2*i,2],col="blue",cex=0.4, bg="blue",pch=24)
}
# Generate and plot natural histories of 50 model tumors - Version 2: normal risk/unmonitored population. The key is the same as above.
# NOTE(review): this section is incomplete as it appears here, in the same way as Version 1:
# the `if` line is not valid R, the loop defining `i` is not visible, and the braces are
# unbalanced. Restore the lost text from an intact copy before running.
# diagnosis is rebuilt with dx as the size at diagnosis (column 2). No visible line fills
# column 1 - presumably that happened in the lost text; verify.
diagnosis<-matrix(nrow=1000,ncol=2)
diagnosis[,2]<-dx[]
size<-matrix(nrow=200,ncol=100)
for (curve in 1:100)
{
for (month in 1:200)
{
# Visible remainder: same markers as Version 1, but for the odd-numbered tumors (2*i-1).
if ((month-curve)diagnosis[(2*i-1),2])points(((2*i-1)/12)+diagnosis[(2*i-1),1],diagnosis[(2*i-1),2],type="p",col="limegreen", cex=1,pch=19)else {
points(((2*i-1)/12)+diagnosis[(2*i-1),1],diagnosis[(2*i-1),2],type="p",col="red",cex=1,pch=19)
points(((2*i-1)/12)+progressionpt[(2*i-1),1],progressionpt[(2*i-1),2],col="blue",cex=0.4, bg="blue",pch=24)
}
###########################################################################################
# EARLY DETECTION TEST PERFORMANCE MODELING
# Sensitivity Analysis Overview:
# This section calculates the sensitivity of hypothetical early detection testing scenarios. ED test sensitivity is defined as the fraction of tumors that are detected before they either i) progress to 'late' stage or ii) are clinically detectable. Only tumors that would otherwise have been detected clinically at a 'late' stage are considered as candidates for early detection.
# The three variables considered are:
# 1) Limit of detection (tumor diameter) of the test
# 2) Frequency of the test
# 3) Alternative clinical diagnosis scenario to which the test scenario is being compared. We consider two diagnosis scenarios which the ED test must 'beat':
# A) Clinical diagnosis is modeled a frequently monitored (high-risk) population
# B) Clinical diagnosis is modeled in a normal risk (unscreened) population.
# Algorithm overview:
# 1) Determine the 'age' at progression for the 1000 model tumors (from KM analysis above)
# 2) Start the 1000 model tumors growing at random times over a period of 24 months so that the "phasing" of the tumors at the times of the screens, relative to their "start-time" is uniformly randomly distributed.
# 3) Implement the hypothetical screen at defined intervals.
# 4) Calculate overall test sensitivity - the fraction of model tumors that at any screening event satisfy all three of the following: i) pre-progression, ii) pre-clinical diagnosis and iii) larger than the limits of detection of the test.
#Sensitivity module. For the analysis of sensitivity vs. the size threshold and frequency of a hypothetical screen, we'll consider both a screened population that's already closely monitored (i.e. a high-risk population), modeling the tumor size at clinical diagnosis in this population on the van Nagell 2007 data, and a normal-risk population for which we model the sizes at clinical diagnosis as a log normal distribution with mean=8cm.
#First the "high-risk" population... We generate a set of 1000 model "natural histories" (growth curves, time of progression and diagnosis), deriving the sizes at progression from early to late stage from the KM curve and using the growth rate model parameters derived from the simulation and the sizes at diagnosis from the Van Nagell 2007 data (and assuming that the tumors that were not picked up by screening but were diagnosed based on symptoms were a geometric mean of 8 cm in size (roughly what the Toronto data and literature suggest for size at diagnosis due to symptomatic presentation). Here we assume that if not detected by a screen, the tumors are detected based on signs or symptoms. Based on 1000 model early tumors whose growth and time/size at progression are simulated here, estimate the fraction of tumors that are detected before they progress (i.e. the sensitivity of detection) as a function of the threshold size at which a hypothetical screening assay can detect a tumor. The tumor growth curves are started at random over a period of 24 months, so that the "phasing" of the tumors at the times of the screens, relative to their "start-time" is uniformly randomly distributed. Hypothetical screening intervals of 3, 6, 12 or 24 months are considered. The question is what fraction of tumors are simultaneously above the threshold for detection, and not yet advanced to stage III or beyond, at the time of a screen. Note that in evaluating "sensitivity" we only want to take credit for early detection of tumors that wouldn't otherwise have been detected clinically at an early stage.
# Sensitivity Analysis Version 1: ED test vs High Risk/frequent monitoring Clinical Dx (Progression = Stage III+)
#badactorsHR counts the tumors that would present at advanced stages if not detected by a screen (in carefully-monitored and general populations, respectively).
# I) Age (in years) at which each model tumor's early phase ends: the early growth model is
#    inverted at the smaller of the size at progression (progressionpt[,2]) and the size at
#    diagnosis (dxv), using the optimized earlygrowthrate and earlyintercept.
tumors <- 1:1000
earlyendsize <- pmin(progressionpt[tumors, 2], dxv[tumors])
progressionpt[tumors, 1] <- (log10(earlyendsize) - earlyintercept) / earlygrowthrate
# II) badactorsHR counts the tumors whose size at diagnosis (dxv, modeled on the van Nagell 2007
#     report) is at least their size at progression, i.e. tumors that progress before diagnosis.
badactorsHR <- as.numeric(sum(dxv[tumors] >= progressionpt[tumors, 2]))
# III) Start the clock ticking for the 1000 tumors out at random times over a period of 24 months, so that they are distributed randomly in their growth phases at the time of observations at intervals of up to 24 months.
# Populate 'GrowthByMonth' in which the value at row j, column i, is the size of tumor i at month j in that tumor's life.
# GrowthByMonth[j, i] is the size of model tumor i at calendar month j while it is still
# pre-progression / pre-diagnosis, and -1 otherwise.
GrowthByMonth <- matrix(-1, nrow = 200, ncol = 1000)
# progmonth[i]: calendar month (rounded) at which tumor i's early phase ends.
progmonth <- numeric(1000)
for (i in 1:1000) {
  # Each tumor starts at a random time within the first 24 months.
  startmonth <- 24 * runif(1, min = 0, max = 1)
  progmonth[i] <- round(startmonth + (12 * progressionpt[i, 1]))
  # Months 1..progmonth[i], clamped to the rows of the matrix. `1:progmonth[i]` would count
  # backwards when progmonth[i] <= 0 (a negative index then overwrites almost the whole
  # column) and would run past row 200 when progmonth[i] > 200.
  months <- seq_len(max(0, min(progmonth[i], nrow(GrowthByMonth))))
  GrowthByMonth[months, i] <- 10^((earlygrowthrate * (months - startmonth) / 12) + earlyintercept)
}
# IV) Apply the hypothetical ED tests to the 1000 model tumors at the specified screening interval. At each test point, calculate the fraction of 'eligible' tumors are simultaneously: i) pre-progression ii) pre-clinical diagnosis, and iii) larger than the limit of detection. 'Eligible' tumors are those that would otherwise have progressed to late stage prior to clinical diagnosis.
# Figure out what fraction of the occult-tumor simulation/time points (i.e. pre-clinical diagnosis timepoints) are pre-progression, pre-clinical diagnosis, and larger than the limit of detection.
# Evaluate screening intervals of 3,6,12,24 months. Clinical Dx model = High-risk/frequently monitored; Post-progression = Stage III+
# Screening interval = 3 months
#sensitivity as a function of threshold
# NOTE(review): this section is incomplete as it appears here. The last `if` line below is not
# the original statement: it runs from a test on GrowthByMonth straight into the badactorsNR
# count, which belongs to the normal-risk analysis. The code that fills sensitivityHR for the
# 3, 6, 12 and 24 month intervals is not visible. Restore it from an intact copy.
# total: one counter per size threshold (1..100); sensitivityHR: 100 thresholds x 4 screening intervals.
total<-vector("numeric",length=100)
sensitivityHR<-matrix(nrow=100,ncol=4)
total[]<-0
for (threshold in 1:100)
{
for (sim in 1:1000)
{
# Skip tumors whose size at diagnosis is at or below their size at progression.
if (dxv[sim]<=progressionpt[sim,2]) next
{
# Screens at months 3, 6, ..., 96.
for (test in seq(3,96,3))
{
if(GrowthByMonth[test,sim]=progressionpt[i,2]) badactorsNR<-badactorsNR+1
}
# Calculate "GrowthByMonth" matrix as above.
# GrowthByMonth[j, i] is the size of model tumor i at calendar month j while it is still
# pre-progression / pre-diagnosis, and -1 otherwise.
GrowthByMonth <- matrix(-1, nrow = 200, ncol = 1000)
# progmonth[i]: calendar month (rounded) at which tumor i's early phase ends.
progmonth <- numeric(1000)
for (i in 1:1000) {
  # Each tumor starts at a random time within the first 24 months.
  startmonth <- 24 * runif(1, min = 0, max = 1)
  progmonth[i] <- round(startmonth + (12 * progressionpt[i, 1]))
  # Months 1..progmonth[i], clamped to the rows of the matrix. `1:progmonth[i]` would count
  # backwards when progmonth[i] <= 0 (a negative index then overwrites almost the whole
  # column) and would run past row 200 when progmonth[i] > 200.
  months <- seq_len(max(0, min(progmonth[i], nrow(GrowthByMonth))))
  GrowthByMonth[months, i] <- 10^((earlygrowthrate * (months - startmonth) / 12) + earlyintercept)
}
# Evaluate screening intervals of 3,6,12,24 months. Clinical Dx model = Normal-risk/unmonitored; Post-progression = Stage III+
# Screening interval = 3 months
# NOTE(review): this section is incomplete as it appears here. The last `if` line below runs
# from a test on GrowthByMonth straight into `run[j,i])break`, which belongs to a different
# loop (over `i` and `j`) whose header is not visible. The code that fills sensitivityNR and
# the start of the sizeprog1 computation are missing. Restore them from an intact copy.
# total: one counter per size threshold (1..100); sensitivityNR: 100 thresholds x 4 screening intervals.
total<-vector("numeric",length=100)
sensitivityNR<-matrix(nrow=100,ncol=4)
total[]<-0
for (threshold in 1:100)
{
for (sim in 1:1000)
{
# Skip tumors whose size at diagnosis (dx) is at or below their size at progression.
if (dx[sim]<=progressionpt[sim,2]) next
{
# Screens at months 3, 6, ..., 96.
for (test in seq(3,96,3))
{
if(GrowthByMonth[test,sim]run[j,i])break
}
# sizeprog1[i] records the value of j at which the (partly lost) inner loop stopped.
sizeprog1[i]<-j
}
# Summarise sizeprog1. print() is explicit so the summary also appears when the script is
# run with source(), matching the explicit print() calls used elsewhere in the script.
print(summary(sizeprog1, digits = 2))
# progressionpt1: column 2 = sizeprog1, column 1 = the age in years at which the early
# growth model reaches that size.
progressionpt1 <- matrix(nrow = 1000, ncol = 2)
progressionpt1[, 2] <- sizeprog1
# Fill column 1 for every tumor. The previous statement indexed with the leftover scalar `i`,
# so only a single row was computed and all others stayed NA.
progressionpt1[, 1] <- (log10(progressionpt1[, 2]) - earlyintercept) / earlygrowthrate
#-----------------------------------------------------------------------
# Infer stage at diagnosis for model tumors based on whether or not progression to Stage 2 and Stage 3 happened at size > or < size at diagnosis.
# Stage at diagnosis for the 1000 model tumors. A tumor whose size at diagnosis is at least
# sizeprog1 is set to 2, and one whose size at diagnosis is at least sizeprog is then set to 3
# (the second assignment overrides the first). Other entries keep whatever value they had.
#   Stagedxv uses dxv as the size at diagnosis, Stagedx uses dx.
sims <- 1:1000
Stagedxv[sims[dxv[sims] >= sizeprog1[sims]]] <- 2
Stagedxv[sims[dxv[sims] >= sizeprog[sims]]] <- 3
Stagedx[sims[dx[sims] >= sizeprog1[sims]]] <- 2
Stagedx[sims[dx[sims] >= sizeprog[sims]]] <- 3
# Stage capped at 2 (twov, two) and at 3 (threev, three).
twov[sims] <- pmin(2, Stagedxv[sims])
two[sims] <- pmin(2, Stagedx[sims])
threev[sims] <- pmin(3, Stagedxv[sims])
three[sims] <- pmin(3, Stagedx[sims])
#GrowthByMonth is populated for 1000 model tumors in the same way as was previously done for the sensitivity analysis.
# Fill all 200 months of GrowthByMonth for each model tumor from the early growth model.
# Each tumor gets its own random start time within the first 24 months.
allmonths <- 1:200
for (sim in 1:1000) {
  startmonth <- 24 * runif(1, min = 0, max = 1)
  GrowthByMonth[allmonths, sim] <- 10^((earlygrowthrate * (allmonths - startmonth) / 12) + earlyintercept)
}
# ------------------------------------------
# Mortality Analysis Version 1: ED Test Vs High Risk (frequently monitored) model of Clinical Diagnosis
# For each screening interval X. months, populate matrix ScreenX of size 100 rows x 1000 colums.
# ScreenX (eg Screen3) is constructed such that the value at row i, column j, indicates the stage at detection (where a value of 1 means CIS or Stage I, 2 means Stage II and 3 means Stage III) of model tumor j with a test with limit of detection i (tumor diameter).
# Screening interval = 3 months
# NOTE(review): this section is incomplete as it appears here. Every line of the form
# `if (GrowthByMonth[test,sim]=...` or `if (GrowthByMonth[test,sim]sizeprog1...` is the start
# of one statement fused with the tail of a later one; the text in between has been lost.
# This includes the assignments that record detection, the loop closings, and the headers of
# the 6-, 12- and 24-month loops and of the normal-risk (dx) versions. The braces are
# unbalanced. Restore this section from an intact copy before running.
# Visible intent of the first loop: Screen3[threshold,sim] starts as threev[sim]; tumors whose
# size at diagnosis or size at progression is at or below the threshold are skipped; screens
# run at months 3, 6, ..., 96 and stop once the modeled size exceeds sizeprog[sim].
for (threshold in 1:100)
{
for (sim in 1:1000)
{
Screen3[threshold,sim]<-threev[sim]
if (threshold>=dxv[sim])next
if (threshold>=sizeprog[sim])next
for (test in seq(3,96,3))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
if (GrowthByMonth[test,sim]=dxv[sim])next
if (threshold>=sizeprog[sim])next
# Screens every 6 months.
for (test in seq(6,96,6))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
if (GrowthByMonth[test,sim]=dxv[sim])next
if (threshold>=sizeprog[sim])next
# Screens every 12 months.
for (test in seq(12,96,12))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
if (GrowthByMonth[test,sim]=dxv[sim])next
if (threshold>=sizeprog[sim])next
# Screens every 24 months.
for (test in seq(24,96,24))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
if (GrowthByMonth[test,sim]=dx[sim])next
if (threshold>=sizeprog[sim])next
# From here the size at diagnosis compared against the threshold is dx rather than dxv.
for (test in seq(3,96,3))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
if (GrowthByMonth[test,sim]=dx[sim])next
if (threshold>=sizeprog[sim])next
for (test in seq(6,96,6))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
if (GrowthByMonth[test,sim]=dx[sim])next
if (threshold>=sizeprog[sim])next
for (test in seq(12,96,12))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
if (GrowthByMonth[test,sim]=dx[sim])next
if (threshold>=sizeprog[sim])next
for (test in seq(24,96,24))
{
if (GrowthByMonth[test,sim]>sizeprog[sim])break
# The tail of this line increments Stagedistribution[size, three[sim]], a count per
# detection size and stage; the conditions guarding it are only partly visible.
if (GrowthByMonth[test,sim]sizeprog1[sim]) if (sizesizeprog[sim]) Stagedistribution[size,three[sim]]<-Stagedistribution[size,three[sim]]+1
}
}
# Print and plot the stage distribution (columns 1..3 of Stagedistribution, divided by 10 to
# give percentages) as a function of tumor diameter at screen detection.
print(Stagedistribution / 10, digits = 1)
# Plot stage distribution as a function of detection size.
quartz()
Color <- c("blue", "purple", "red")
# Set up the axes once with the first stage. Calling plot() inside a loop started a fresh
# plot on every pass, so only the last one survived.
plot(Stagedistribution[,1]/10,type="l",xlim=c(0,100),ylim=c(0,100),ylab="Percentage at each stage",xlab="Diameter (mm) at screen detection",col=Color[1],lwd=1.0,main="Stage Distribution as a Function of
Tumor Diameter at Detection in a
Normal-Risk Population")
# Overlay the remaining stages. xlim/ylim are fixed by plot(); they are not graphical
# parameters for lines() and only produced warnings there.
for (i in 2:3) {
  lines(Stagedistribution[, i] / 10, type = "l", col = Color[i], lwd = 1.0)
}
# Calculate improvements in number of 5-year survivors in the US as a function of limit of detection and screening interval. Use "Annualcases" (based on ACS Cancer Facts and Figures 2008: 21650 new cases of invasive ovarian cancer, and Seidman, JD, et al. Int J. Gyne Path 2007: 67.8% serous (using a conservative definition of serous cancer; with a more inclusive definition, the value was 78%.) to represent the overall number of patients per year who can benefit. First assume clinical diagnosis at sizes corresponding to van Nagell 2007 distribution, then assuming diagnosis at mean diameter 9 cm, SD=1cm.
# Additional 5-year survivors per year: Annualcases times the gain in percent survival with
# screening (Survivalvthreshold*) over percent survival without screening (Survival*),
# divided by 100. HR and NR are the two clinical-diagnosis scenarios.
survivalgainHR <- SurvivalvthresholdHR - SurvivalHR
survivalgainNR <- SurvivalvthresholdNR - SurvivalNR
SurvivorsvthresholdHR <- survivalgainHR * Annualcases / 100
SurvivorsvthresholdNR <- survivalgainNR * Annualcases / 100
print(SurvivorsvthresholdHR, digits = 1)
print(SurvivorsvthresholdNR, digits = 1)
# Plot increase in number of survivors as a function of minimum detectable size for 4 screening intervals, assuming clinical diagnosis at size distribution derived from van Nagell 2007.
# Two side-by-side panels: additional 5-year survivors versus minimum detectable tumor
# diameter, one curve per column of the Survivorsvthreshold* matrices (drawn in the four
# colors of Color with the line widths in width).
quartz(height = 6, width = 8)
par(mfrow = c(1, 2), pin = c(2.5, 2.5), mgp = c(2.2, 0.8, 0))
Color <- c("blue", "springgreen4", "purple", "red")
width <- c(0.5, 0.5, 2, 0.5)
# Left panel: SurvivorsvthresholdHR.
plot(SurvivorsvthresholdHR[,1],type="l",xlim=c(0,100),ylim=c(0,7000),ylab="Additional 5-year Survivors",xlab="Detectable Tumor Diameter (mm)",col=Color[1],lwd=width[1],cex.main=0.8,main="Additional 5-Year Survivors (US) versus
Minimum Detectable Tumor Diameter in a
Closely-Monitored Population")
# Axis limits are fixed by plot(); xlim/ylim are not graphical parameters for lines() and
# only produced warnings there, so they are not passed.
for (i in 2:4) {
  lines(SurvivorsvthresholdHR[, i], type = "l", col = Color[i], lwd = width[i])
}
# Right panel: SurvivorsvthresholdNR.
plot(SurvivorsvthresholdNR[,1],type="l",xlim=c(0,100),ylim=c(0,7000),ylab="Additional 5-year Survivors",xlab="Detectable Tumor Diameter (mm)",col=Color[1],lwd=width[1],cex.main=0.8,main="Additional 5-Year Survivors (US)
versus Minimum Detectable Tumor Diameter
in a Normal-Risk Population")
for (i in 2:4) {
  lines(SurvivorsvthresholdNR[, i], type = "l", col = Color[i], lwd = width[i])
}
# Color key: four filled squares, one per entry of Color, labelled 3, 6, 12 and 24 on the x axis.
quartz(width = 3, height = 3)
par(pin = c(1.45, 0.6), bty = "n")
# Empty frame with both axes suppressed; the x axis is added by hand below.
# NOTE(review): cex.axis is passed twice (1.5, then 0.5) and pin is passed to plot() as well
# as to par(); both are kept exactly as they were - confirm which values are intended.
plot(1:4, rep(1, 4),
     ylab = "", xaxt = "n", yaxt = "n",
     cex.main = 1.5, main = "Color Key",
     cex.axis = 1.5,
     xlab="screening interval
(months)",
     xlim = c(0.5, 4.5), xaxp = c(-0.5, 4.5, 1),
     cex.axis = 0.5, pin = c(1.45, 0.6), mgp = c(3, 0, 0))
axis(side = 1, col = "white", at = 1:4, labels = c("3", "6", "12", "24"))
# One square per color at x = 1..4, y = 1.
points(1:4, rep(1, 4), col = Color[1:4], pch = 22, bg = Color[1:4], xaxt = "n", yaxt = "n", cex = 6)