## How to make epidemiological training infectious.
## Bellan et al. 2012 PLoS Biology
##
## NOTE: This file can be converted to an R script by replacing the .txt suffix with a .R
## suffix. Individual R files, additional exercises, and future code updates are available
## for download from the MMED website at:
## http://lalashan.mcmaster.ca/theobio/mmed/index.php/MMF
##
##
#####################################################################
## This code gives all the necessary scripts for the below
## described exercises. Please feel free to modify and distribute
## this script as long as you retain the above citation
## in the header.
##
## 1 - Computer simulations to determine R0 of the outbreak given the
## number of course participants (to be performed by the organizers)
## 2 - Case Control Study
## 3 - Estimation of latent and infectious periods
## 4 - Stochastic simulation of outbreaks using fitted parameters
######################################################################
######################################################################
## 1 - Computer simulations to determine R0 of the outbreak given the
## number of course participants (to be performed by the organizers)
######################################################################
## By manipulating N, R0, and infectious period quantities, organizers
## can experiment with how they think their outbreak may proceed. We
## assume a latent period of 0 as participants can infect individuals
## as soon as they are infected as we performed the exercise but we
## leave it up to others to add latent periods.
## Note that the following stochastic simulation uses a chain binomial
## algorithm to simulate the epidemics while the model for Figure 3 is
## produced using the Gillespie algorithm. These both are tenable
## ways to model an outbreak and differ primarily in that the former
## considers time in discrete units and the latter considers time to
## be continuous.
par(mfrow=c(1,2))
num.sims <- 10
N <- 40 # number participants
vacc.prop <- .35 # immune proportion at the beginning
init.immune <- round(vacc.prop*N) # initial immune individuals
init.infect <- 4
R0 <- 3.5 # R0
infper <- 1 # infectious period in days
## latper <- 0 # latent period in days
### R0 = beta*N/(1/infper) = beta*N*infper
### beta = R0/(N*infper)
beta <- R0/(N*infper) # transmission coefficient
step <- .1 # time steps (in days)
timeseq <- seq(0, 14, by = step) # discrete time intervals to simulate
plot(0,0, type ="n", xlim = c(min(timeseq), max(timeseq)),
ylim = c(0, N+10), bty = "n", xlab = "time (days)", ylab = "# hosts")
cum.incidence <- rep(init.infect,num.sims)
for(ss in 1:num.sims) # do num.sims outbreaks
{
sim <- data.frame(S = N - init.infect - init.immune, I = init.infect, R = init.immune)
for(ii in 2:length(timeseq)) # run through time series
{
p.trans <- 1 - exp(-beta*step*sim$I[ii-1]) # probability of S -> I per unit S
new.inf <- rbinom(1, sim$S[ii-1], p.trans) # number new infections
p.recov <- 1 - exp(-1/infper*step) # probability of I -> R
new.recov <- rbinom(1, sim$I[ii-1], p.recov)
temp.S <- sim$S[ii-1] - new.inf
temp.I <- sim$I[ii-1] + new.inf - new.recov
temp.R <- sim$R[ii-1] + new.recov
sim <- rbind(sim, c(temp.S,temp.I,temp.R))
cum.incidence[ss] <- cum.incidence[ss]+new.inf
}
## lines(timeseq,sim[,"S"],col = "black")
lines(timeseq,sim[,"I"],col = "red")
lines(timeseq,sim[,"R"],col = "blue")
}
legend("topright", names(sim), col=c("black","red","blue"), lwd = 1)
hist(cum.incidence, xlim = c(0,N), breaks = seq(0,N, by=1), col = "red", main = "cumulative incidence", ylim = c(-2,20))
## abline(v = 23, lty = 3, col = "gray", lwd = 5)
arrows(23, -5, 23, 0, code = 2, angle = 45, length = .1, lwd = 2)
######################################################################
## 3 - Case Control Study
######################################################################
######################################################################
## In case control studies, the population is selected by picking
## known cases and appropriate controls. Thus the disease status is
## known from the outset of the study and we cannot calculate
## proportion of diseased or non-diseased individuals since those
## proportions are determined by how many controls we select for each
## case. For that reason, we use odds of exposure to a specific risk
## factor instead, where odds is defined by (# exposed)/(# not
## exposed). And the odds ratio is the ratio between the odds of
## exposure in diseased individual to that in non-diseased individuals.
## Change below to the directory where you saved Dataset S6:
setwd("~/Documents/R files/MMF/MMFsupinfo/")
## 2010
riskdat <- read.csv("Dataset_S6_MMFRF2010.csv")
## writing your own odds ratio & confidence interval calculator
odds.ratio <- function(tab)
{
or.out <- tab[1,1]*tab[2,2] / (tab[1,2] * tab[2,1])
## se(logOR) = sqrt(1/n1 +1/n2 + 1/n3 + 1/n4)
se <- sqrt(sum(1/gend.tab))
upper <- exp(log(or.out) + 1.96*se)
lower <- exp(log(or.out) - 1.96*se)
out <- data.frame(or = or.out, lower95 = lower, upper95 = upper,
ref=paste(rownames(tab)[1],":",rownames(tab)[2],sep = ""))
return(out)
}
names(riskdat)
## What is the OR for being infected based on gender?
gend.tab <- xtabs(~gender + reported.case, data = riskdat)
odds.ratio(gend.tab)
# odds of females being infected is .75 that of males, but this is not
# significant because the confidence intervals contain 1.
## What is the OR for being infected based on accomodation?
xtabs(~where.stay + reported.case, data = riskdat)
## To many different places people stayed, why don't we compare AIMS
## to everywhere else by creating a logical vector stating whether participants were
## housed at aims or not.
aims <- grepl("aims", riskdat$where.stay, ignore.case = TRUE)
odds.ratio(xtabs(~aims + reported.case, data = riskdat))
## odds of being infected if you weren't at AIMS was 2x bigger than if
## you were at AIMS, but this is not significant because the
## confidence intervals contain 1
## What is the OR by age?
riskdat$age.cat
## though it is called age category, it is represented by a number
## which corresponds to increasing age. We can treat this as a
## continuous variable (which makes the assumptioin that
## OR(1:2)=OR(2:3)=OR(3:4)=OR(4:5) and then fit a logistic model:
logit.mod <- glm(reported.case ~ age.cat, data = riskdat, family=binomial)
summary(logit.mod)
coef(logit.mod) # intercept and logOR
confint(logit.mod) # gives the confidence intervals for the logOR
exp(coef(logit.mod)["age.cat"]) # OR
exp(confint(logit.mod)["age.cat",]) # gives the confidence intervals for OR
## So there's a 0.64 OR by age meaning that with each increase in age
## category there is a .64 lower odds of being infected, but since the
## confidence interval overlaps 1 this is not significant.
## We could also have done this with the package epicalc, but its good
## to see the calculations done above.
library(epicalc)
cc(riskdat$reported.case, riskdat$gender, graph = FALSE)
odds.ratio(gend.tab)
exp(confint(glm(reported.case ~ gender, family = binomial, data = riskdat)))
## Note that the confidence intervals are slightly different, this is
## because they use different approximation methods for calculating
## the confidence interval. confint() uses profile likelihood
## confidence intervals which are the best option.
######################################################################
######################################################################
## Retrospective Cohort Study
######################################################################
library(epicalc)
## In a cohort study, the population is chosen without knowing their
## disease status. Thus it is possible to use the population as the
## denominator and calculate incidence ratios and not just odds
## ratios. We do not provide a tutorial here, but encourage you to
## read epicalc's help or learn how to do Poisson regression with
## person-time offsets.
######################################################################
######################################################################
## 3 - Estimation of latent and infectious periods
######################################################################
lat.pers <- rgamma(50, shape = 2, rate = 5) # fake latent period data
hist(lat.pers, freq = T, col = "gray", xlab = "latent period",
main = "histogram of observed latent periods", xlim = c(0,1.5*max(lat.pers)))
## create a function that calculates the negative log-likelihood of a
## pair of gamma distribution parameters. We optimize over the log of
## the shape and rate parameters because then optimize can search
## through all real numbers.
gamnll <- function(pers = lat.pers, params = c(log.shape = NULL, log.rate = NULL))
{
nll <- sum(-dgamma(pers,
shape = exp(params["shape"]), rate = exp(params["rate"]),
log = TRUE))
return(nll)
}
## Try out a few to get an idea of how they compare
hist(lat.pers, freq = F, col = "gray", xlab = "latent period",
main = "histogram of observed latent periods", xlim = c(0,1.5*max(lat.pers)),
ylim = c(0,3))
shape.examples <- c(2,1,5) # example shape parameters to calculate NLL
rate.examples <- c(5,3,3) # example rate parameters to calculate NLL
cols <- rainbow(length(shape.examples))
for(ii in 1:length(shape.examples)) # for each pair of shape & rate parameters
{
curve(dgamma(x,shape.examples[ii],rate.examples[ii]), # draw gamma pdf
add = T, col = cols[ii])
## calculate negative log likelihood and write it on the plot
tempnll <- gamnll(params=c(shape =log(shape.examples)[ii], rate=log(rate.examples)[ii]))
mtext(signif(tempnll,2), side = 3, adj = .55, col = cols[ii], line = -3-ii)
}
## title the negative log likelihoods on the plot, the smallest one
## means the best fit
mtext("-log likelihoods:", side = 3, adj = .55, col = "black", line = -3)
## Now let's optimize gamnll to give the lowest NLL
init.params <- log(c(shape = 1, rate = 1))
opt <- optim(init.params, gamnll, pers = lat.pers)
opt$par
hist(lat.pers, freq = F, col = "gray", xlab = "latent period",
main = "histogram of observed latent periods", xlim = c(0,1.5*max(lat.pers)),
ylim = c(0,3))
curve(dgamma(x,exp(opt$par["shape"]),exp(opt$par["rate"])), # draw gamma pdf
add = T, col = "black")
######################################################################
## Now do it with the real infectious period data from MMF 2011
######################################################################
## Change below to the directory where you saved Dataset S7:
setwd("~/Documents/R files/MMF/MMFsupinfo/")
## 2011
epidat <- read.csv("Dataset_S7_infper.csv")
## Look at the distribution first
hist(epidat$Duration, breaks = 6, col = "gray", xlab = "infectious period in hours",
main = "infectious period distribution of MMF outbreak, 2011")
## There is a clear outlier from someone who never recovered, so we
## remove it.
epidat$Duration[epidat$Duration>100] <- NA
## Redo the histogram
hist(epidat$Duration, breaks = 6, col = "gray", xlab = "infectious period in hours",
main = "infectious period distribution of MMF outbreak, 2011")
## Get rid of NAs to feed our optim function
durs <- as.vector(na.omit(epidat$Duration))
opt <- optim(init.params, gamnll, pers = durs)
## Plot resulting distribution on the data
hist(epidat$Duration, breaks = 6, col = "gray", xlab = "infectious period in hours",
freq = F, main = "infectious period distribution of MMF outbreak, 2011")
curve(dgamma(x,exp(opt$par["shape"]),exp(opt$par["rate"])), # draw gamma pdf
add = T, col = "black")
## Get the average infections period from fitted gamma and from data
exp(opt$par)[1]/exp(opt$par)[2] #mean of the gamma distribution is the shape/rate
mean(epidat$Duration, na.rm=T)
######################################################################
######################################################################
## 4 - Stochastic simulation of outbreaks using fitted parameters
######################################################################
## To do this participants simply need to repeat what the organizers
## do in (1) above with the estimated parameters. It
## should be an interesting exercise for students to compare the
## fitted parameters with to their true values. There are many
## reasons they may differ (individuals incorrectly following
## directions, newly infected individuals overly excited to infect new
## people and doing so right away, etc...)
## Other things that could be examined include the effectiveness of
## vaccination vs quarantine vs treatment.
######################################################################
## This code shows how to look at the distribution of final epidemic
## size vs initial proportion vaccinated. This uses a chain binomial
## algorithm for simulation. For an alternative, continuous time
## algorithm (that is computationally faster for smaller populations)
## see the script to produce Figure 3 in supporting information file
## Text_S3_Rcode_figs.txt
###
vacc.prop.seq <- seq(0,1,by = .05)
init.infect <- 4
num.sims <- 100
## Set up matrix to fill in final epidemic size for each simulation
## for each vaccination proportion
cum.inc.matrix <- matrix(init.infect, num.sims, length(vacc.prop.seq))
N <- 40 # number participants
R0 <- 3.5 # R0
infper <- 1 # infectious period in days
beta <- R0/(N*infper) # force of infection
step <- .1 # time steps (in days)
timeseq <- seq(0, 14, by = step) # discrete time intervals to simulate
for(vv in 1:length(vacc.prop.seq))
{
vacc.prop <- vacc.prop.seq[vv] # immune proportion at the beginning
print(paste("Starting simulations for a vaccinated proportion of",vacc.prop))
init.immune <- round(vacc.prop*N) # initial immune individuals
for(ss in 1:num.sims) # do num.sims outbreaks
{
sim <- data.frame(S = N - init.infect - init.immune, I = init.infect, R = init.immune)
for(ii in 2:length(timeseq)) # run through time series
{
p.trans <- 1 - exp(-beta*step*sim$I[ii-1]) # probability of S -> I per unit S
new.inf <- rbinom(1, sim$S[ii-1], p.trans) # number new infections
p.recov <- 1 - exp(-1/infper*step) # probability of I -> R
new.recov <- rbinom(1, sim$I[ii-1], p.recov)
temp.S <- sim$S[ii-1] - new.inf
temp.I <- sim$I[ii-1] + new.inf - new.recov
temp.R <- sim$R[ii-1] + new.recov
sim <- rbind(sim, c(temp.S,temp.I,temp.R))
cum.inc.matrix[ss,vv] <- cum.inc.matrix[ss,vv]+new.inf
} # end time loop
} # end sim loop
} # end vacc loop
par(mar=c(5,4,2,2))
boxplot(cum.inc.matrix, xlab = "initial vaccinated proportion", ylab = "final size of epidemic", ylim = c(0,N), xaxt="n")
axis(1, at = 1:length(vacc.prop.seq), labels = vacc.prop.seq)