## How to make epidemiological training infectious.
## Bellan et al. 2012 PLoS Biology
##
## NOTE: This file can be converted to an R script by replacing the .txt suffix with a .R
## suffix. Individual R files, additional exercises, and future code updates are available
## for download from the MMED website at:
## http://lalashan.mcmaster.ca/theobio/mmed/index.php/MMF
##
##
######################################################################
## Figure 1 - Epidemic curve showing S, I, R_effective over time
######################################################################
## Set working directory
## Change below to the directory where you saved Datasets S1-S4:
setwd("~/Documents/MMF/MMFsupinfo/")
## Load epidemic data
epidat <- read.csv("Dataset_S1_MMFlinelist2010.csv")
## Make sure that data is in POSIXct format for R to understand time
epidat$begin.inf <- as.POSIXct(epidat$begin.inf)
epidat$end.inf <- as.POSIXct(epidat$end.inf)
## For individuals who did not return their forms we do not know when
## they recovered. We assume that they recovered 2 days after
## infection (as this was the instructed recovery time)
epidat$end.inf[is.na(epidat$end.inf)] <- epidat$begin.inf[is.na(epidat$end.inf)]+3600*48
## Create vector of times at which any event occurred (someone became
## infected or recovered)
event.times <- c(epidat$begin.inf, epidat$end.inf)
event.times <- na.omit(unique(event.times)) # remove duplicates and NAs
event.times <- event.times[order(event.times)]
## Create a vector of time ticks from the first to the last day for
## plotting the time series
time.ticks <- seq(as.POSIXct("2010-05-24 12:00"),as.POSIXct("2010-06-01 12:00"), by = 3600*24)
event.times <- c(min(time.ticks), event.times) # add a time 12 hours earlier than the epidemic
## Initialize a data frame to create an epidemic time series where
## rows relate to specific events (infections or recoveries). This
## allows calculation of how many susceptible, infected & recovered
## individuals were in the population at any given time since we know
## the true population size (all participants)
dat <- data.frame(tt = event.times, sus = NA, inf = NA, rec = NA)
## All 46 people at MMED 2010 started out as susceptible
dat[1,2:4] <- c(46, 0 , 0)
## For each time at which an event happened
for(ii in 2:nrow(dat))
{
tt <- dat$tt[ii]
## Figure out how many new infections were during that event
new.inf <- sum(epidat$begin.inf == tt, na.rm = T)
## Figure out how many new recoveries were during that event
new.rec <- sum(epidat$end.inf == tt, na.rm = T)
## Calculate new state variables
dat$sus[ii] <- dat$sus[ii-1]-new.inf
dat$inf[ii] <- dat$inf[ii-1]+new.inf-new.rec
dat$rec[ii] <- dat$rec[ii-1]+new.rec
}
# add a last point at the end of the time.ticks to show the flatline
# at the end of the epidemic
dat <- rbind(dat, dat[nrow(dat),])
dat$tt[nrow(dat)] <- max(time.ticks)
## Find R0 from the empirical infectious contact distribution. The
## below file has a row for every infectious contact showing the
## individual id of the source and recipient individuals as well as if
## the infectious contact resulted in an infection (i.e. if the
## recipient was susceptible)
cont <- read.csv("Dataset_S2_MMFcontacts2010.csv")
## Tabulate how many times each ID came up as a source of infection
## from the contact tracing data
cont.dist <- table(factor(cont$source, levels = epidat$id))
## Average the number of infectious contacts made by each infected
## individual to estimate R0. Note this is more representative of the
## actual R0 than the instructed value in the outbreak because it
## takes into the account that not everyone followed directions.
r0est <- mean(cont.dist)
## Calculate susceptible proportion and Reffective at each time
dat$susprop <- dat$sus / rowSums(dat[,2:4])
dat$reff <- r0est*dat$susprop
## Initialize pdf
pdf("Bellan et al - Figure 1.pdf", width = 5.5, height = 4)
layout(matrix(c(1:4),2,2)) ### Layout plot panels
par(mar=c(1,4,1,2)) ### Set margins
## Plot infecteds over time
plot(dat$tt, dat$inf, type="s", col="red", ylim = c(0,60), lwd = 1, bty="n", lty = 1,
xlab = "", ylab = "# people", axes = F)
axis(2, at = seq(0,60, by = 20))
axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = FALSE)
lines(dat$tt, dat$sus, type = "s", col = "black", lwd = 1, lty = 2) #add S
lines(dat$tt, dat$rec, type = "s", col = "blue", lwd = 1, lty = 3) #add R
par(mar=c(4,4,1,2)) ## Change margins
## Plot Reff over time
plot(dat$tt, dat$reff, type = "s", col = "purple", lty = 1, lwd = 1,
xlab = "", ylab = expression(R[eff]), yaxt="n", xaxt = "n",
ylim = c(.5,2), bty="n")
axis.POSIXct(1, at = time.ticks, las = 2, format = "%a")
axis(2, seq(.5,2,by=.5))
## Draw a dashed line at the threshold Reffective=1
abline(h = 1, lty = 2)
## Draw an arrow pointing out where R0 is
arrows(time.ticks[4], r0est, time.ticks[1], r0est, length = .2)
text(time.ticks[4], r0est, pos = 4, expression(R[0]))
######################################################################
## Repeat the above for the 2011 outbreak
epidat <- read.csv("Dataset_S3_MMFlinelist2011.csv")
epidat$begin.inf <- as.POSIXct(epidat$begin.inf)
epidat$end.inf <- as.POSIXct(epidat$end.inf)
epidat$end.inf[is.na(epidat$end.inf)] <- epidat$begin.inf[is.na(epidat$end.inf)]+3600*48
event.times <- c(epidat$begin.inf, epidat$end.inf)
event.times <- na.omit(unique(event.times))
event.times <- event.times[order(event.times)]
time.ticks <- seq(as.POSIXct("2011-05-29 12:00"),as.POSIXct("2011-06-07 12:00"), by = 3600*24)
dat <- data.frame(tt = event.times, sus = NA, inf = NA, rec = NA)
## 4 people were exposed by the instructor but at two different
## times so we only have 2 infecteds at the first event time
dat[1,2:4] <- c(57-14-2, 2 , 14)
for(ii in 2:nrow(dat))
{
tt <- dat$tt[ii]
new.inf <- sum(epidat$begin.inf == tt, na.rm = T)
new.rec <- sum(epidat$end.inf == tt, na.rm = T)
dat$sus[ii] <- dat$sus[ii-1]-new.inf
dat$inf[ii] <- dat$inf[ii-1]+new.inf-new.rec
dat$rec[ii] <- dat$rec[ii-1]+new.rec
}
dat <- na.omit(dat)
dat <- rbind(dat, dat[nrow(dat),])
dat$tt[nrow(dat)] <- max(time.ticks)
## R_effective panel
cont11 <- read.csv("Dataset_S4_MMFcontacts2011.csv")
num.inf <- as.vector(na.omit(cont11$source[!is.na(cont11$recipient)]))
num.inf <- factor(num.inf, levels = as.vector(na.omit(unique(cont11[,1]))))
tab <- table(num.inf)
r0est <- mean(tab)
dat$susprop <- dat$sus / rowSums(dat[,2:4])
dat$reff <- r0est*dat$susprop
par(mar=c(1,4,1,2))
plot(dat$tt, dat$inf, type="s", col="red", ylim = c(0,60), lwd = 1, bty="n",
xlab = "time (hours)", ylab = "", axes = F)
axis(2, at = seq(0,60, by = 20), labels = FALSE)
lines(dat$tt, dat$sus, type = "s", col = "black", lwd = 1, lty = 2)
lines(dat$tt, dat$rec, type = "s", col = "blue", lwd = 1, lty = 3)
axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = FALSE)
legend("topright", c("Susceptible","Infected","Recovered/Immune"),
lty = c(2,1,3), col = c("black","red","blue"), lwd = 1, bty = "n", cex = .8)
par(mar=c(4,4,1,2))
plot(dat$tt, dat$reff, type = "s", col = "purple", lty = 1, lwd = 1,
xlab = "", ylab = "", axes = F,
ylim = c(0.5,2), bty="n")
axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = TRUE)
axis(2, at = seq(.5,2, by = .5), labels = FALSE)
abline(h = 1, lty = 2)
text(time.ticks[5], r0est, pos = 4, expression(R[0]))
arrows(time.ticks[5], r0est, time.ticks[2], r0est, length = .2)
dev.off()
## Note: Minor formatting changes were made to the figure in Adobe
## Illustrator after production in R.
######################################################################
## Figure 2 - Odds Ratios & CIs for MMF 2011 outbreak, illustrating
## confounding bias and information bias.
######################################################################
## Change below to the directory where you saved Dataset S5:
setwd("~/Documents/R files/MMF/MMFsupinfo/")
## load data
dat <- read.csv("Dataset_S5_MMFRF2011.csv")
## examine the first few rows of the data
head(dat)
## The below is a function which will do a multiple logistic
## regression on a data set given the outcome (dependent variable) and
## the exposure(s) (independent variable) names which refer to columns
## in the data frame. The odds ratios and profile likelihood
## confidence intervals are then given in table or data frame format
## to be used in the figure-making code below. The subset option
## allows us to only use a particular subset of the data frame rows
## (for example, to look at how selection bias can change the results)
conf.string <- function(outcome = "sero",
exposure = "lastyr",
subset = NULL,
format.tab=F)
{
## create a character string of the form "output ~ exposure1 + exposure2
mod <- paste(outcome, "~", paste(exposure, collapse = " + "))
## do a logistic regression
out <- glm(mod, dat, family = binomial, subset = subset)
## exponentiate regression coefficients to get maximum likelihood
## estimates of odds ratios
or <- exp(out$coeff)
## use confint() to get profile likelihood confidence intervals
## around coefficients & exponentiate to make them on odds ratio
## scale
cis <- exp(confint(out))
## create a data frame with results
frame <- data.frame(or, lower = cis[,1], upper = cis[,2])[exposure,]
## option to format the data as OR (CI) to produce a CSV type
## table. Leave False if making a figure instead
if(format.tab==T){
## only show two significant figures in CSV
frame <- signif(frame, 2)
frame <- data.frame(or = frame[,1],
ci = paste("(",frame[,2], ", ", frame[,3],")", sep=""))
}
return(frame)
}
## Initialize pdf
pdf("Bellan et al - Figure 2.pdf", width = 4.75, height = 4)
## first, setting graphical parameters:
par(mar=c(1,4,0,1)) # margins
lwd <- 2 # line width
pch <- 15 # point type (square)
cex <- 1.5 # point size
offset <- .4 # offset for arrows showing bias
len <- .1 # length of arrow tips
## Initialize the plot, (not actually plotting any points due to
## type="n"). Note that y-axis is on a log scale since we are showing
## odds ratios.
plot(0,.1,type="n", bty="n", log="y", ylim =c(.002,500), yaxt="n",
xlim=c(0,12), xaxt="n",xlab="", ylab="Odds Ratio")
ticks <- c(.01,.1, 1,10, 100) # y axis ticks
axis(2, at = c(1/seq(10,100, by=10),1/1:10,2:10, seq(10,100,by=10)), labels = NA)
axis(2, at = ticks, las = 2, labels = as.character(ticks)) # make y-axis
abline(h=1, col ="black", lty=3, lwd=lwd) # put dotted line at null hypothesis (OR=1)
xes <- c(1,5,7,11) # locations of points to plot on x-axis
## Use the function conf.string() created above to do a multiple
## logistic regression with lastyr, gender, and movie in the analysis
## even though only the former actually affects the outcome. The
## outcome (case definition) we use here is "sero" - short for serological,
## referring to the fact that we know everyone's true infection status even
## if they were asymptomatic. This type of information could be achieved in
## practice if a serological assay could identify recent infections (e.g.,
## through detection of IgM antibodies).
serall <- conf.string("sero",c("lastyr","gend","mov"))
points(xes[1],serall$or[1], pch =pch, col ="black", cex=cex)
arrows(xes[1],serall$lower[1],xes[1],serall$upper[1], col="black",angle=90, code=3, length=len, lwd=lwd)
## Information (Nondifferential misclassification) bias - Panel A
## Same as above but now the outcome is "rep" short for reported,
## indicating our case definition now only includes symptomatic
## (reported) cases.
repall <- conf.string("rep",c("lastyr","gend","mov"))
points(xes[2],repall$or[1], pch =pch, col ="dark gray", cex=cex)
arrows(xes[2],repall$lower[1],xes[2],repall$upper[1], col="dark gray",angle=90, code=3, length=len, lwd=lwd)
arrows(xes[1] + offset, serall$or[1], xes[2]-offset, repall$or[1], length =.07, col = "blue", lwd=lwd)
## Confounding - Panel B
## Now we focus on whether participants who arrived a day or
## more early to the site of the workshop were at a greater risk of
## infection. We first plot the analysis with lastyr (and other
## variables) included and find no significant effect of arriving
## early. We then do the same analysis but as a univariate regression
## and show that arriving early does significantly increase your risk
## of arriving early. This is because individuals who had attended
## the clinic last year (and were therefore immune) were less likely
## to arrive early. This is an example of confounding bias, with early
## not really having an effect but appearing to when not adjusted for,
## due to the relationship between lastyr and early as well as lastyr
## and sero.
ser.arrive.multi <- conf.string("sero",c("early","lastyr","gend","mov"))
points(xes[3],ser.arrive.multi$or[1], pch =pch, col ="black", cex=cex)
arrows(xes[3],ser.arrive.multi$lower[1],xes[3],ser.arrive.multi$upper[1], col="black",angle=90, code=3, length=len, lwd=lwd)
ser.arrive <- conf.string("sero",c("early"))
points(xes[4],ser.arrive$or[1], pch =pch, col ="dark gray", cex=cex)
arrows(xes[4],ser.arrive$lower[1],xes[4],ser.arrive$upper[1], col="dark gray",angle=90, code=3, length=len, lwd=lwd)
## Arrows showing bias
arrows(xes[3] + offset, ser.arrive.multi$or[1], xes[4] - offset, ser.arrive$or[1], length =.07, col = "blue", lwd=lwd)
dev.off()
## Note: Minor formatting changes were made to the figure in Adobe
## Illustrator after production in R.
######################################################################
## Figure 3 - Stochastic simulation of MMF outbreaks for varying
## proportions of individuals with immunity (e.g., vaccinated) at the
## start of the outbreak.
######################################################################
## Set local working directory
## You may want to change to the directory below:
setwd("~/Documents/R files/MMF/MMFsupinfo/")
pdf("Bellan et al - Figure 3.pdf", width = 5.5, height = 3)
par(mar=c(4,4,1,1), mfrow=c(1,2))
## This for() loop loops through the two panels of the plot. We write it this way since the code for each panel is basically identical except panel A only does 5 time series and B does 1000 for each of 20 vaccination proportions. When panelA is TRUE (the first time around) we get panel A, and when FALSE we get panel B.
for(panelA in c(T,F))
{
## Make a sequence of vaccinated proportions to simulate outbreaks at
vacc.prop.seq <- seq(0,1,by = .05)
num.sims <- 1000 # number of simulations at each vaccinated proportion
if(panelA)
{ # if panel A just do 5 sims at one vaccinated proportion
vacc.prop.seq <- 0.25
num.sims <- 5
}
init.infect <- 1 # initial number of infected people in the population
## Set up matrix to fill in final epidemic size for each simulation
## for each vaccination proportion
N <- 57 # number participants
R0 <- 1.82 # R0 for 2011 outbreak as calculated in Figure 1 code
time.ticks <- seq(as.POSIXct("2011-05-29 12:00"),as.POSIXct("2011-06-07 12:00"), by = 3600*24)
infper <- 1 # infectious period in days
beta <- R0/(N*infper) # force of infection
## cumulative incidence matrix, fill in for each simulation. Each
## row is a simulation, each column corresponds to a different
## vaccinated proportion
cum.inc.matrix <- matrix(0, num.sims, length(vacc.prop.seq))
## for each vaccinated proportion
for(vv in 1:length(vacc.prop.seq))
{
vacc.prop <- vacc.prop.seq[vv] # set immune proportion at the beginning
## Give us an update on progress so we know about how long its taking.
print(paste("Starting simulations for a vaccinated proportion of",vacc.prop))
# initial immune individuals, must be rounded since individuals are discrete
init.immune <- round(vacc.prop*(N-init.infect))
## Panel A
if(panelA) # if plotting time series
{
## initialize empty plot
plot(0,0, xlim = c(min(time.ticks), max(time.ticks)), ylim = c(0,N/2),
type = "n", xaxt="n", ylab="# infected", xlab = "", bty = "n")
## make time axis, label by weekday
axis.POSIXct(1, at = time.ticks, las = 2, format = "%a", label = TRUE)
}
for(ss in 1:num.sims) # do num.sims number of outbreaks at that vacc.prop
{
## initialize dataframe to add rows to
sim <- data.frame(tt = min(time.ticks), S = N - init.infect - init.immune,
I = init.infect, R = init.immune)
while(sim$I[nrow(sim)] > 0) # while there are still infected individuals
{
lr <- nrow(sim) #last row
## This simulation is done using the Gillespie algorithm - for more
## details on how this algorithm works, visit the MMED website's
## Tutorials page
## Total event rate
event.rate <- beta*sim$I[lr]*sim$S[lr] + 1/infper*sim$I[lr]
## new time
new.tt <- sim$tt[lr] + 3600*24*rexp(1, event.rate)
## if the event is a new recovery
if(runif(1) < ( (1/infper*sim$I[lr]) / event.rate))
{
new.ss <- sim$S[lr]
new.ii <- sim$I[lr] - 1
new.rr <- sim$R[lr] + 1
cum.inc.matrix[ss,vv] <- cum.inc.matrix[ss,vv] + 1
}else{ # the event is a new infection
new.ss <- sim$S[lr] - 1
new.ii <- sim$I[lr] + 1
new.rr <- sim$R[lr]
}
## add new empty row and fill it in with the new time and new state variables
sim <- rbind(sim, rep(NA,4))
sim$tt[lr+1] <- new.tt
sim[lr+1,2:4] <- c(new.ss, new.ii, new.rr)
} # end while loop
## if plotting time series, add it ot the plot
if(panelA) lines(sim$tt, sim$I, type = "s", col = rainbow(num.sims)[ss])
} # end simulation loop
} # vaccine prop loop
}
## Panel B
## Plot results of the simulations
par(mar=c(4,4,1,1))
plot(0,0, xlab = "initial proportion immune",
ylab = "outbreak size", xlim = c(0,1), ylim = c(0,60),
axes = F,
type="n", bty = "n")
axis(2, at = seq(0,60, by = 20))
axis(1, at = seq(0,1, by = .5))
## calculate medians
meds <- apply(cum.inc.matrix,2, quantile, prob = .5)
## calculate quantiles frome 0-100% by 10%
quants <- apply(cum.inc.matrix,2, quantile, prob = seq(0,1, by = .1))
# for each pair of symmetric quantiles (0,100), (10,90), (20,80), (30,70), (40,60)
## create a polygon, with darker shades for simulation results closer to the median
for(ii in 0:4)
{
polygon(c(vacc.prop.seq, rev(vacc.prop.seq)), c(quants[ii+1,], rev(quants[11-ii,])),
col = gray(.7-ii/10), lty = 0)
}
## Create a black line for the median
lines(vacc.prop.seq, meds, lwd = 1)
dev.off()
## Note that the additional labeling of the quantiles and other formatting was performed in
## Adobe Illustrator outside of R.