###################################################################### # S3_R.txt An R function to calculate the number of missing genes # # # # in RNA-Seq libraries # # # # Reference: # # Luis Fernando Garcia-Ortega and Octavio Martinez. # # How many genes are expressed in a transcriptome? Estimation and # # results for RNA-seq Submitted to PLoS One. January, 2015. # # # # The full R package can be obtained from the following link # # http://computational.biology.langebio.cinvestav.mx/DOWNLOAD/UndetectedGenes/ # -------------------------------------------------------------------# # # # To use the function documented here set the file "S3_R.txt" in # # the current directory and in R type 'source("S3_R.txt")' # # The following paragraphs give you information about the function # # # # You are free to distribute this code (License: GPL-2) # # in such case give credit and please keep this header. # # # # Questions and comments to # # Octavio Martinez omartine@langebio.cinvestav.mx # # # # -------------------------------------------------------------------# # Description # # # # Estimates the number of undetected or missing genes that are # # likely to be expressed in an RNA-seq library but were missed in # # a particular sample (x) of the library. Approximated bias, # # standard error and confidence limits are also provided. # # Additionally, the sample size needed to estimate a given # # percentage of the undetected genes can also be obtained. # # Extra statistics can also be provided (see details) # # # # -------------------------------------------------------------------# # # # Usage # # h6(x, dist = "pois", B = 1000, alpha1 = 0.05, alpha2 = 0.05, # # psi = 0.95, extra = FALSE) # # # # -------------------------------------------------------------------# # # # Arguments # # # # x An integer vector containing the raw (NOT normalized!) counts # # for the number of tags for each one of the genes detected in # # the sample. 
# # # # dist Distribution assumed for the bootstrap procedure to obtain # # approximated standard error, bias and confidence intervals. # # Possible values are "pois" (Poisson, default) and "mult" # # (Multinomial). # # # # B Number of bootstrap replicates to obtain approximated standard # # error, bias and confidence intervals. # # # # alpha1 Probability of Type I error for approximate confidence # # intervals (at 1-alpha1 confidence level) # # # # alpha2 Probability to calculate the sample size, T, needed to # # obtain P[f0>0] = alpha2 assuming that the current vector # # represents a complete sample (no undetected genes) # # # # psi Desired proportion of the undetected genes that is to be # # detected with extra sampling, must be 0 < psi < 1. # # # # extra Option to ask for extra calculations. If FALSE a vector is # # returned, if TRUE a list with the extra components is returned. # # # # -------------------------------------------------------------------# # # # Details # # # # Counts of gene tags resulting from the sequencing and mapping of # # an RNA-seq library provide evidence of expression for a given # # number of genes. However, genes with low expression level are # # likely to be missed in any given sample, even when the sample # # size (sequence depth) could be large. This function provides an # # estimate of the number of genes that were undetected (missed) in # # the sample (h6). It also includes estimates of the bias # # (bias.h6), standard error (se.h6) and approximate confidence # # limits for the number of undetected genes, LL and UL (lower and # # upper limits respectively). 
# # # # Additionally, if extra=TRUE, a list is given with extra # # calculations, including sample attributes, sample size needed # # for a probability alpha2 of having undetected genes, extra # # sample size needed to estimate a proportion psi of the estimated # # undetected genes, values of frequencies of frequencies fr in the # # original data and the call done to the function. # # # # -------------------------------------------------------------------# # # # Value # # # # If extra=FALSE (the default), then the function returns a vector # # of statistics for h6 with components: h6 - estimate of the # # number of undetected (missing) genes; bias.h6 - bias for the # # estimate h6 (mean of the bootstrap estimates of h6 minus value # # of h6 in the original sample), se.h6 - estimated standard error # # of h6 and (LL, UL) the lower and upper (1-alpha1) confidence # # limits for h6. # # # # If extra=TRUE then the result is a list with components # # # # h6.stats Vector of basic statistics for the number of undetected # # genes (see above) # # # # sam.atr Sample attributes; a vector with components: # # g - Number of genes observed in the original sample, # # N - sample size (sum of tags in the original sample), # # G.est - Number of genes estimated to be expressed in the # # transcriptome; G.est = g + h6. # # # # T.res Vector of results for sample size needed to obtain # # P[f0>0] = alpha2 with components: T - Estimated sample size # # needed to have P[f0>0] = alpha2 ASSUMING that the sample is # # complete, rTN - Ratio T/N, alpha2 - The input value of alpha2 # # used to calculate T, error - Numerical error in the # # approximation of the sample size. # # # # m.psi Integer, m.psi - Extra sample size needed to estimate a # # proportion psi of the undetected genes. # # # # fr.sam Vector with the frequencies of frequencies f1, f2, f3, f4, # # f5, f6 observed in the original data. # # # # the.call The call for the function. 
# # # # -------------------------------------------------------------------# # # # Note Beware to the fact that the bootstrap procedure employed to # # calculate standard error and confidence interval is likely to # # underestimate the true variability of gene expression. # # # # -------------------------------------------------------------------# # Authors # # Octavio Martinez and Luis Fernando Garcia-Ortega # # -------------------------------------------------------------------# # # # Examples # # # # Runs with a dummy vector and default parameters # # temp <- rep(c(1:10), times=c(10:1)) # # (the dummy vector "temp" is assumed to be the result # # of an RNA-seq experiment) # # h6(temp) # # rm(temp) # # Runs with a distinct dummy vector, but with custom parameters # # and gives extra calculations. # # h6(rep(x=c(1:20), each=5), dist="mult", B=5000, # # alpha1=0.01, alpha2=0.01, psi=0.99, extra=TRUE) # # Running with a large vector roughly mimicking a real RNAseq # # library: # # h6(x=rnbinom(n=10000, size=1, mu=10)+1, extra=TRUE) # # -------------------------------------------------------------------# # # # Below this line is the code for the function DO NOT MODIFY! 
# -------------------------------------------------------------------#
# h6: estimate the number of undetected (missing) genes in an RNA-seq
# sample from the frequencies of frequencies f1..f6, together with
# bootstrap estimates of bias, standard error and confidence limits.
#
# Arguments
#   x      Numeric vector of raw integer tag counts, one per gene.
#          Entries <= 0 are dropped before any calculation.
#   dist   Bootstrap distribution: "pois" (Poisson) or "mult"
#          (Multinomial).
#   B      Number of bootstrap replicates.
#   alpha1 Type I error used for the normal-approximation confidence
#          limits LL and UL.
#   alpha2 Target value for 1 - prod(1 - exp(-T * p)) when solving for
#          the sample size T (only used when extra = TRUE).
#   psi    Proportion of the undetected genes to be detected with the
#          extra sample size m.psi (only used when extra = TRUE).
#   extra  If FALSE a named numeric vector is returned; if TRUE a list.
#
# Value
#   extra = FALSE: c(h6, bias.h6, se.h6, LL, UL).
#   extra = TRUE : list(h6.stats, sam.atr, T.res, m.psi, fr.sam,
#                  the.call).
#
# Errors
#   Stops when x contains NA, when any positive entry of x is not a
#   whole number, when any of f2..f6 is zero in the sample, or when
#   dist is not "pois" or "mult".
# -------------------------------------------------------------------#
h6 <- function(x, dist = "pois", B = 1000, alpha1 = 0.05, alpha2 = 0.05,
               psi = 0.95, extra = FALSE) {
  # NA would otherwise surface later as a cryptic
  # "missing value where TRUE/FALSE needed" error.
  if (any(is.na(x))) {
    stop("The argument x contains missing values (NA)!")
  }

  y <- x[x > 0]

  # Test every element: summing the signed deviations (y - round(y))
  # lets negative or mutually cancelling deviations slip through
  # (e.g. 1.6 - round(1.6) = -0.4), after which tabulate() would
  # silently truncate the values.
  if (any(y != round(y))) {
    stop("The argument x does not appears to be a vector of integers!")
  }

  # Frequencies of frequencies f1..f6 observed in the sample.
  fr.sam <- tabulate(y, nbins = 6)
  names(fr.sam) <- paste0("f", 1:6)
  if (any(fr.sam[2:6] == 0)) {
    # The estimator divides by f2..f6, so none of them may be zero.
    stop(paste("fr.sam = ", paste(fr.sam, collapse = ", "),
               "Some values of fr in sample are zero, resulting in h6=NA!"))
  }

  the.call <- match.call()

  # Point estimator computed from a vector of counts; returns NA when
  # the value is undefined (a zero among f2..f6 gives Inf or NaN),
  # which happens routinely in bootstrap replicates.
  h6_stat <- function(z) {
    fr <- tabulate(bin = z, nbins = 6)
    r1 <- 6 * (fr[1]^2) * sum(1 / fr[2:6]) / 50
    if (is.na(r1) || is.infinite(r1)) {
      return(NA_real_)
    }
    r1
  }

  est.h6 <- h6_stat(y)

  if (length(dist) != 1 || !(dist %in% c("pois", "mult"))) {
    stop("Only dist=\"pois\" (Poisson) or Multinomial (\"mult\") are implemented, sorry.")
  }

  ng <- length(y)  # number of genes observed
  N <- sum(y)      # sample size (total number of tags)

  # Parametric bootstrap of the estimator; undefined replicates are NA
  # and are dropped from the summaries below.
  boo <- if (dist == "pois") {
    replicate(n = B, h6_stat(rpois(n = ng, lambda = y)))
  } else {
    replicate(n = B, h6_stat(rmultinom(1, size = N, prob = y)))
  }
  boo <- as.vector(boo)

  bias.h6 <- mean(boo, na.rm = TRUE) - est.h6
  se.h6 <- sd(boo, na.rm = TRUE)

  # Normal-approximation confidence limits; the lower limit is
  # truncated at zero because a gene count cannot be negative.
  za <- qnorm(p = alpha1 / 2, lower.tail = FALSE)
  LL <- max(round(est.h6 - za * se.h6), 0)
  UL <- round(est.h6 + za * se.h6)

  res <- c(h6 = round(est.h6), bias.h6 = bias.h6, se.h6 = se.h6,
           LL = LL, UL = UL)

  if (extra) {
    G.est <- ng + est.h6
    sam.atr <- c(g = ng, N = N, G.est = round(G.est))

    # Squared distance between 1 - prod(1 - exp(-depth * prob)) and
    # alpha; minimised over depth to obtain the sample size T.
    to.opt <- function(depth, prob, alpha) {
      (1 - prod(1 - exp(-depth * prob)) - alpha)^2
    }
    the.min <- optimize(f = to.opt, interval = c(N / 2, 20 * N),
                        prob = y / N, alpha = alpha2, tol = 1e-06)
    T.res <- c(T = round(the.min$minimum), rTN = the.min$minimum / N,
               alpha2 = alpha2, error = the.min$objective)

    # Extra sample size needed to detect a proportion psi of the
    # undetected genes; NA when it cannot be computed or is negative.
    f1 <- sum(y == 1)
    m.psi <- NA
    if (f1 > 0) {
      m.psi <- round(N * (est.h6 / f1) * log(est.h6 / (G.est * (1 - psi))))
      if (is.na(m.psi) || m.psi < 0) {
        m.psi <- NA
      }
    }

    res <- list(h6.stats = res, sam.atr = sam.atr, T.res = T.res,
                m.psi = m.psi, fr.sam = fr.sam, the.call = the.call)
  }

  res
}