###############################################################################
# R SCRIPTS USED FOR CM3 MODELLING                                            #
###############################################################################

# Step 1: Load required libraries, set paths and define helper functions
library(dplyr)
library(qpcR)
library(stringr)

filePath <- "C:\\data_analysis\\modelling\\CM3\\"

# NOTE(review): copies qpcR's unexported `counter` function into the global
# environment; presumably a workaround so that qpcR code can find it --
# confirm it is still needed with the installed qpcR version.
counter <- qpcR:::counter

# TRUE when `x` is the value returned by a failed try() call.
isTryError <- function(x) inherits(x, "try-error")

# Column-wise maximum (ignoring NA) of a data frame, as a named vector.
colMax <- function(data) vapply(data, max, numeric(1), na.rm = TRUE)

# Read one tab-separated replicate file of the FMDV panel.
readReplicate <- function(replicate) {
  read.table(paste0(filePath, "FMDV panel_replicate", replicate, ".txt"),
             header = TRUE, sep = "\t", stringsAsFactors = FALSE)
}

# Calculate the fitting weights for every reaction in `inputData` (first
# column = cycle numbers, remaining columns = one reaction each). A weight is
# 1 / |fluorescence / column maximum|. The result has one column per
# reaction, in the same order as the reaction columns of `inputData`.
computeWeights <- function(inputData) {
  fluorescence <- inputData[, -1, drop = FALSE]
  maxima <- colMax(fluorescence)
  weights <- lapply(seq_along(fluorescence), function(k) {
    1 / abs(fluorescence[[k]] / maxima[k])
  })
  names(weights) <- paste0("X", seq_along(weights))
  as.data.frame(weights)
}

# Fit a weighted cm3 model to every reaction in `inputData` and collect the
# model parameters in a data frame with one row per reaction.
#   inputData  = data frame; column 1 holds the cycles, other columns the
#                fluorescence readings of one reaction each
#   weightData = data frame returned by computeWeights(inputData)
#   suffix     = label appended to the parameter column names
#                ("non-tailed" or "tailed")
# Reactions for which pcrfit() fails keep NA in all parameter columns, so
# the columns stay numeric (the text "NA" would turn them into character
# columns and be written to disk as quoted strings).
fitCm3Parameters <- function(inputData, weightData, suffix) {
  reactions <- colnames(inputData)[-1]
  parameterNames <- c("D0", "max", "Kd", "Fb", "RSS", "Rsq.ad")

  # Strain = text before the first underscore, with "." turned back into "/";
  # replicate = last character of the column name.
  strain <- str_extract(string = reactions, pattern = "^[^_]+(?=_)")
  strain <- str_replace_all(string = strain, pattern = "[.]",
                            replacement = "/")
  replicate <- substr(x = reactions, start = nchar(reactions),
                      stop = nchar(reactions))

  estimates <- matrix(data = NA_real_, nrow = length(reactions),
                      ncol = length(parameterNames))
  for (k in seq_along(reactions)) {
    cm3Model <- try(pcrfit(data = inputData, cyc = 1, fluo = k + 1,
                           weights = weightData[, k], model = cm3))
    if (isTryError(cm3Model)) {
      warning("cm3 fit failed for reaction ", reactions[k],
              "; parameters set to NA", call. = FALSE)
      next
    }
    # The second row of parMat is taken as the fitted parameter set.
    parameters <- data.frame(cm3Model$parMat, stringsAsFactors = FALSE)
    estimates[k, ] <- c(parameters$D0[2], parameters$max[2],
                        parameters$Kd[2], parameters$Fb[2],
                        RSS(cm3Model), Rsq.ad(cm3Model))
  }

  output <- data.frame(strain, replicate, estimates,
                       stringsAsFactors = FALSE)
  colnames(output) <- c("strain", "replicate",
                        paste0(parameterNames, "_", suffix))
  output
}

# Step 2: Read raw data and split it per assay (5UTR / 3D) and per primer
#         type (".n" = non-tailed, ".f" = tailed output columns)
inputFile1 <- readReplicate(1)
inputFile2 <- readReplicate(2)
inputFile3 <- readReplicate(3)
inputFile <- left_join(x = inputFile1, y = inputFile2, by = "Cycles")
inputFile <- left_join(x = inputFile, y = inputFile3, by = "Cycles")

inputData5UTRn <- dplyr::select(inputFile, contains("Cycles"),
                                contains("5UTR.n"))
inputData5UTRf <- dplyr::select(inputFile, contains("Cycles"),
                                contains("5UTR.f"))
inputData3Dn <- dplyr::select(inputFile, contains("Cycles"),
                              contains("3D.n"))
inputData3Df <- dplyr::select(inputFile, contains("Cycles"),
                              contains("3D.f"))

# Step 3: Fit cm3 models for all panFMDV-5UTR RT-qPCR reactions and save model
#         parameters in output dataframes
# Step 3.1: Calculate weights for each reaction
weightData5UTRn <- computeWeights(inputData5UTRn)
weightData5UTRf <- computeWeights(inputData5UTRf)

# Step 3.2: Fit model to each reaction and store parameters in dataframe
nonTailed5UTRParameters <- fitCm3Parameters(inputData5UTRn, weightData5UTRn,
                                            suffix = "non-tailed")
tailed5UTRParameters <- fitCm3Parameters(inputData5UTRf, weightData5UTRf,
                                         suffix = "tailed")
outputData5UTR <- left_join(x = nonTailed5UTRParameters,
                            y = tailed5UTRParameters,
                            by = c("strain", "replicate"))
write.table(x = outputData5UTR,
            file = paste0(filePath,
                          "FMDV-5UTR_weightAdjustedCM3ModelParameters.txt"),
            sep = "\t", row.names = FALSE)

# Step 4: Fit cm3 models for all panFMDV-3D RT-qPCR reactions and save model
#         parameters in output dataframes
# Step 4.1: Calculate weights for each reaction
weightData3Dn <- computeWeights(inputData3Dn)
weightData3Df <- computeWeights(inputData3Df)

# Step 4.2: Fit model to each reaction and store parameters in dataframe
nonTailed3DParameters <- fitCm3Parameters(inputData3Dn, weightData3Dn,
                                          suffix = "non-tailed")
tailed3DParameters <- fitCm3Parameters(inputData3Df, weightData3Df,
                                       suffix = "tailed")
outputData3D <- left_join(x = nonTailed3DParameters,
                          y = tailed3DParameters,
                          by = c("strain", "replicate"))
write.table(x = outputData3D,
            file = paste0(filePath,
                          "FMDV-3D_weightAdjustedCM3ModelParameters.txt"),
            sep = "\t", row.names = FALSE)

###############################################################################
# R FUNCTION USED TO CONSTRUCT FIGURES 1, 2 AND 6                             #
###############################################################################
plotAmplificationCurves <- function(rawData, cyclesColumn,
                                    fluorescenceColumns, numberConditions,
                                    numberDilutions, replicates, conditions,
                                    dilutions, pointSymbol = 15,
                                    lineType = "solid") {
  ## author:  FrVan
  ## version: 2.0
  ## aim:     to plot averaged amplification curves based on replicate data
  ## input:   rawData = data frame containing raw fluorescence data
  ##          cyclesColumn = column within rawData containing cycle numbers
  ##          fluorescenceColumns = columns within rawData containing
  ##                                fluorescence data
  ##          numberConditions = number of conditions
  ##          numberDilutions = number of dilutions
  ##          replicates = number of replicates
  ##          conditions = labels of conditions to be shown in legend
  ##          dilutions = labels of dilutions to be shown in legend
  ##          pointSymbol = symbol to be used to depict points (only used
  ##                        when numberConditions == 1)
  ##          lineType = type to be used for plotting lines (only used
  ##                     when numberConditions == 1)
  ## output:  the ggplot object; as a side effect the plot is written to
  ##          "myAmplificationPlot.pdf" (24 x 16 cm, 600 dpi) in the
  ##          working directory

  # Step 1: Load required packages and define helper function
  # library() stops with an error when a package is missing, whereas
  # require() would only return FALSE and fail later in a confusing place.
  library(extrafont)
  library(ggplot2)
  library(grid)
  library(qpcR)
  library(RColorBrewer)

  # Plot theme: white panel with black border, bold black axis titles and
  # legend text, legend to the right of the panel.
  theme_MolPlat <- function(base_size = 12, base_family = "sans") {
    theme_grey(base_size = base_size, base_family = base_family) %+replace%
      theme(
        axis.text = element_text(size = 10, colour = "black"),
        axis.ticks = element_line(colour = "black"),
        axis.title.x = element_text(size = base_size, hjust = 0.5,
                                    vjust = -0.25, lineheight = 1.0,
                                    face = "bold", colour = "black"),
        axis.title.y = element_text(size = base_size, angle = 90,
                                    hjust = 0.5, vjust = 1.5,
                                    lineheight = 1.0, face = "bold",
                                    colour = "black"),
        legend.direction = "vertical",
        legend.position = "right",
        legend.box = "vertical",
        legend.key = element_rect(colour = NA),
        legend.title.align = 0.5,
        legend.title = element_text(size = base_size * 0.8, face = "bold",
                                    colour = "black", hjust = 0),
        legend.text.align = 0,
        legend.text = element_text(size = base_size * 0.8, face = "bold",
                                   colour = "black", hjust = 0),
        panel.grid.major = element_line("grey95", size = 0.1),
        panel.grid.minor = element_line("grey95", size = 0.1),
        panel.background = element_rect(fill = "white"),
        panel.border = element_rect(fill = NA, colour = "black"),
        plot.margin = unit(c(2.5, 2.25, 2.5, 2.5), "lines")
      )
  }

  # Step 2a: Construct model for each individual amplification curve
  singleModels <- modlist(x = rawData, cyc = cyclesColumn,
                          fluo = fluorescenceColumns, model = l4)

  # Step 2b: Construct 'consensus' model based on the individual models
  #          from the replicates
  numberCurves <- numberConditions * numberDilutions
  replicateModels <- replist(object = singleModels,
                             group = gl(n = numberCurves, k = replicates),
                             opt = TRUE)

  # Step 3a: Prepare dataset for plotting (one row per curve and cycle)
  cycles <- length(rawData[, cyclesColumn])
  plotData <- as.data.frame(matrix(data = NA, nrow = numberCurves * cycles,
                                   ncol = 5))
  names(plotData) <- c("condition", "dilution", "cycle", "fit", "stdv")
  plotData[, 1] <- gl(n = numberConditions, k = cycles,
                      length = numberCurves * cycles, labels = conditions)
  plotData[, 2] <- gl(n = numberDilutions, k = cycles * numberConditions,
                      length = numberCurves * cycles, labels = dilutions)
  for (i in seq_len(numberCurves)) {
    replicateData <- replicateModels[[i]]$DATA
    replicateData <- na.omit(replicateData)
    fittedData <- fitted(replicateModels[[i]])
    stdvData <- tapply(replicateData[, 2], replicateData[, 1],
                       function(x) sd(x, na.rm = TRUE))
    # Rows of curve i; derived from the actual number of cycles instead of
    # assuming 50-cycle runs.
    curveRows <- seq.int(from = (i - 1) * cycles + 1, length.out = cycles)
    plotData[curveRows, 3] <- unique(replicateData[, 1])
    plotData[curveRows, 4] <- fittedData[seq_len(cycles)]
    plotData[curveRows, 5] <- stdvData
  }

  # Step 3b: Construct plot
  shapeScale <- scale_shape_manual(values = c(15, 8, 1, 5))
  linetypeScale <- scale_linetype_manual(values = c("solid", "dotted",
                                                    "dotdash", "dashed"))
  errorBars <- geom_errorbar(aes(ymin = fit - stdv, ymax = fit + stdv),
                             colour = "black", size = 0.25, width = 0.25)
  if (numberConditions == 1) {
    amplificationCurves <- ggplot(plotData, aes(x = cycle, y = fit,
                                                colour = dilution)) +
      shapeScale +
      errorBars +
      geom_line(linetype = lineType) +
      geom_point(shape = pointSymbol, size = 1.5) +
      scale_color_brewer(palette = "Set2")
  } else if (numberDilutions == 1) {
    amplificationCurves <- ggplot(plotData, aes(x = cycle, y = fit,
                                                shape = condition,
                                                colour = condition)) +
      shapeScale +
      errorBars +
      geom_line(aes(linetype = condition)) +
      geom_point(size = 1.5) +
      linetypeScale +
      scale_color_brewer(palette = "Set2")
  } else {
    amplificationCurves <- ggplot(plotData, aes(x = cycle, y = fit,
                                                shape = condition,
                                                colour = dilution)) +
      shapeScale +
      errorBars +
      geom_line(aes(linetype = condition)) +
      geom_point(size = 1.5) +
      linetypeScale +
      scale_color_brewer(palette = "Set2")
  }
  # NOTE(review): the three scale_*_discrete() calls below replace the
  # manual shape/linetype scales and the "Set2" colour scale added above,
  # so the default discrete scales are what ends up in the figure. Kept
  # as-is to leave the published figures unchanged -- confirm intent.
  amplificationCurves <- amplificationCurves +
    xlab("\ncycle") +
    ylab("fluorescence\n") +
    scale_colour_discrete(name = "primer\n pair") +
    scale_shape_discrete(name = "primer\n pair") +
    scale_linetype_discrete(name = "primer\n pair")
  amplificationCurves <- amplificationCurves + theme_MolPlat()

  # Save the plot explicitly instead of relying on last_plot().
  ggsave("myAmplificationPlot.pdf", plot = amplificationCurves, width = 24,
         height = 16, units = "cm", dpi = 600)
  amplificationCurves
}

###############################################################################
# R SCRIPTS USED TO CONSTRUCT HEAT MAPS                                       #
###############################################################################

# Step 1: Merge various data files containing raw count data and
#         produce separate data files containing raw count data from
#         non-tailed and tailed PCR reactions.
# Step 1.1: Load required packages and set file path
library(dplyr)
filePath <- "C:\\data_analysis\\HTS\\heat_map\\"

# Merge the per-virus count files of one primer set into a single table and
# write it to disk as a tab-separated file.
#   primerType = "nonTailed" or "tailed"
#   direction  = "Forward" or "Reverse"
#   chemistry  = "Normal" or "CleanAmp" (as spelled in the input file names)
#   label      = "normal" or "CleanAmp" (as spelled in the output file name)
# The first file contributes the id and thermodynamic columns plus its
# counts; every further file contributes its count column only (joined by
# id). Count columns are named <virus>_nt (non-tailed) or <virus>_t
# (tailed), where <virus> is the file name up to the first underscore.
mergeCountFiles <- function(primerType, direction, chemistry, label) {
  countSuffix <- if (primerType == "nonTailed") "_nt" else "_t"
  # Escaped and anchored so that only files ending in ".txt" are matched.
  countFiles <- dir(filePath,
                    pattern = paste0("_amplicons_", primerType, direction,
                                     "PrimerData", chemistry, "\\.txt$"))
  if (length(countFiles) == 0) {
    stop("No ", primerType, direction, "PrimerData", chemistry,
         " files found in ", filePath, call. = FALSE)
  }
  for (i in seq_along(countFiles)) {
    virusName <- sub("_.*", "", countFiles[i])
    inputFile <- read.table(paste0(filePath, countFiles[i]), header = TRUE,
                            sep = "\t", stringsAsFactors = FALSE)
    if (i == 1) {
      outputFile <- dplyr::select(inputFile, id, dG_eff, dG_std, Tm_eff,
                                  Tm_std, weight, count)
      colnames(outputFile)[6:7] <- c("bias", paste0(virusName, countSuffix))
    } else {
      inputFile <- dplyr::select(inputFile, id, count)
      outputFile <- left_join(outputFile, inputFile, by = "id")
      colnames(outputFile)[length(outputFile)] <- paste0(virusName,
                                                         countSuffix)
    }
  }
  write.table(x = outputFile,
              file = paste0(filePath, primerType, direction,
                            "PrimerCountData_", label, "_amplicons.txt"),
              sep = "\t", row.names = FALSE)
  invisible(outputFile)
}

# Step 1.2: Read non-tailed data files and merge into a single data set and
#           write merged data to disk as a tab-separated file
mergeCountFiles("nonTailed", "Forward", "Normal", "normal")
mergeCountFiles("nonTailed", "Forward", "CleanAmp", "CleanAmp")
mergeCountFiles("nonTailed", "Reverse", "Normal", "normal")
mergeCountFiles("nonTailed", "Reverse", "CleanAmp", "CleanAmp")

# Step 1.3: Read tailed data files, merge into a single data set and
#           write merged data to disk as a tab-separated file
mergeCountFiles("tailed", "Forward", "Normal", "normal")
mergeCountFiles("tailed", "Forward", "CleanAmp", "CleanAmp")
mergeCountFiles("tailed", "Reverse", "Normal", "normal")
mergeCountFiles("tailed", "Reverse", "CleanAmp", "CleanAmp")

# Step 2: Normalise raw count data according to the 'total count' method and
#         produce new data files containing normalised count data from
#         non-tailed and tailed PCR reactions.
# Step 2.1: Load required packages and set file path
library(dplyr)
filePath <- "C:\\data_analysis\\HTS\\heat_map\\"

# Read one merged raw count file, normalise its count columns (all columns
# whose name contains "FMDV") and write the result to disk as a
# tab-separated file. Every count column is divided by its own total and
# multiplied by the mean of all column totals, then rounded to whole counts.
#   primerType = "nonTailed" or "tailed"
#   direction  = "Forward" or "Reverse"
#   label      = "normal" or "CleanAmp"
normaliseCountFile <- function(primerType, direction, label) {
  inputFile <- read.table(paste0(filePath, primerType, direction,
                                 "PrimerCountData", "_", label,
                                 "_amplicons.txt"),
                          header = TRUE, sep = "\t",
                          stringsAsFactors = FALSE)
  countData <- dplyr::select(inputFile, contains("FMDV"))
  metaData <- dplyr::select(inputFile, -contains("FMDV"))
  columnTotals <- colSums(countData)
  normalisedCountData <- round(
    sweep(as.matrix(countData), 2, columnTotals, "/") * mean(columnTotals)
  )
  normalisedCountData <- bind_cols(metaData,
                                   as.data.frame(normalisedCountData))
  write.table(x = normalisedCountData,
              file = paste0(filePath, primerType, direction,
                            "PrimerNormalisedCountData_", label,
                            "_amplicons.txt"),
              sep = "\t", row.names = FALSE)
  invisible(normalisedCountData)
}

# Step 2.2: Read non-tailed data file containing raw count data, normalise
#           data set and write normalised data to disk as a tab-separated file
normaliseCountFile("nonTailed", "Forward", "normal")
normaliseCountFile("nonTailed", "Forward", "CleanAmp")
normaliseCountFile("nonTailed", "Reverse", "normal")
normaliseCountFile("nonTailed", "Reverse", "CleanAmp")

# Step 2.3: Read tailed data file containing raw count data, normalise
#           data set and write normalised data to disk as a tab-separated file
normaliseCountFile("tailed", "Forward", "normal")
normaliseCountFile("tailed", "Forward", "CleanAmp")
normaliseCountFile("tailed", "Reverse", "normal")
normaliseCountFile("tailed", "Reverse", "CleanAmp")

# Step 3: Make heatmaps to visualise the primer utilisation patterns across
#         the various FMDV strains.
# Step 3.1: Load required packages, define helper functions and set file path
library(dplyr)
library(extrafont)
library(lattice)
library(stringr)

# Overall unalikeability coefficient of a count table: with p_r the share of
# all counts that falls in row r, returns sum_r p_r * (1 - p_r) as a single
# unnamed number.
#   countData = numeric data frame or matrix (rows = primer variants)
#   ...       = ignored (kept for backward compatibility)
unalikeability1 <- function(countData, ...) {
  rowShares <- rowSums(x = countData) / sum(countData)
  unname(sum(rowShares * (1 - rowShares)))
}

# Per-column unalikeability coefficients of a count table: for every column
# c, with p_rc the share of that column's counts found in row r, returns
# sum_r p_rc * (1 - p_rc). The result is a one-row data frame with the same
# column names as `countData`.
#   countData = numeric data frame or matrix
#   ...       = ignored (kept for backward compatibility)
unalikeability2 <- function(countData, ...) {
  countMatrix <- as.matrix(countData)
  columnShares <- sweep(countMatrix, 2, colSums(x = countMatrix), "/")
  unalikeabilityCoefficient <- as.data.frame(
    matrix(data = colSums(columnShares * (1 - columnShares)), nrow = 1)
  )
  colnames(unalikeabilityCoefficient) <- colnames(countData)
  unalikeabilityCoefficient
}

# Weight contributed by the nucleotide at `position` of each id: the
# favoured base counts `favouredWeight`, the alternative base counts 1.00
# and any other character counts 0.
positionWeight <- function(ids, position, favoured, favouredWeight,
                           alternative) {
  base <- str_sub(ids, start = position, end = position)
  (str_count(base, pattern = favoured) * favouredWeight) +
    (str_count(base, pattern = alternative) * 1.00)
}

# Location of the Ghostscript executable (R_GSCMD environment variable).
Sys.setenv(R_GSCMD = "C:/Program Files/gs/gs9.10/bin/gswin64c.exe")
filePath <- "C:\\data_analysis\\HTS\\heat_map\\"

# Step 3.2: Read normal dNTP data files containing normalised count data
#           (wide-format), calculate unalikeability coefficient and create
#           heatmap
# Step 3.2.1: Non-tailed forward primer data file
unalikeabilityData1 <- as.data.frame(matrix(data = NA, nrow = 1, ncol = 2))
names(unalikeabilityData1) <- c("non-tailed", "tailed")
inputFile <- read.table(paste0(filePath, "nonTailedForwardPrimer",
                               "NormalisedCountData_normal_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
isolates <- read.table(paste0(filePath, "FMDV-5UTR_isolates.txt"),
                       header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id

# Replace the lab names in the column headers by the standard isolate names.
isolateNames <- as.character(isolates$standardName)
names(isolateNames) <- paste0(isolates$labName, "_nt")
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
# drop = FALSE keeps a data frame even when only one isolate is present.
countData <- countData[, order(colnames(countData), decreasing = TRUE),
                       drop = FALSE]

# Order of the primer variants by dG_eff (ascending), ties by Tm_eff
# (descending); the variant name is the part of the id after the last "_".
ompData <- read.table(paste0(filePath,
                             "nonTailedForwardPrimerPreparedOMPData",
                             "_1aMTarget.txt"),
                      header = TRUE, sep = "\t", stringsAsFactors = FALSE)
ompData <- arrange(ompData, dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

# Bias score of every primer variant: product of the position weights of
# positions 5 to 1 of the variant name. The table is re-sorted after every
# position; because the sorts are stable, these intermediate sorts decide
# how variants with equal final scores are ordered, so their sequence must
# not be changed.
biasOrder <- data.frame(id = affinityOrder, stringsAsFactors = FALSE)
biasOrder$bias <- positionWeight(biasOrder$id, 5, "G", 1.30, "A")
biasOrder <- biasOrder[order(biasOrder$bias), ]
biasOrder$bias <- biasOrder$bias *
  positionWeight(biasOrder$id, 4, "T", 1.25, "C")
biasOrder <- biasOrder[order(biasOrder$bias), ]
biasOrder$bias <- biasOrder$bias *
  positionWeight(biasOrder$id, 3, "G", 1.30, "A")
biasOrder <- biasOrder[order(biasOrder$bias), ]
biasOrder$bias <- biasOrder$bias *
  positionWeight(biasOrder$id, 2, "T", 1.25, "C")
biasOrder <- biasOrder[order(biasOrder$bias), ]
biasOrder$bias <- biasOrder$bias *
  positionWeight(biasOrder$id, 1, "T", 1.25, "C")
biasOrder <- arrange(biasOrder, bias)
# Step 3.2.1 (continued): Unalikeability coefficients and heat maps for the
#                         non-tailed forward primer count data prepared above
unalikeabilityData1[, 1] <- unalikeability1(countData = countData)
unalikeabilityData2 <- unalikeability2(countData = countData)
unalikeabilityFile <- paste0(filePath, "nonTailedForwardPrimer",
                             "Unalikeability_normal_amplicons.txt")
write.table(x = unalikeabilityData2, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)

# Rows follow the primer variants ordered by binding affinity
affinityData <- countData[match(affinityOrder, rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "nonTailedForwardPrimerHeatMap_normal_affinityOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(affinityData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Rows follow the primer variants in reversed synthesis-bias order
biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "nonTailedForwardPrimerHeatMap_normal_biasOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(biasData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Step 3.2.2: Data from reactions containing tailed forward primers
inputFile <- read.table(paste0(filePath, "tailedForwardPrimer",
                               "NormalisedCountData_normal_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id
# Named vector: names are the lab names + "_t", values the standard names
isolateNames <- setNames(as.character(isolates$standardName),
                         paste0(isolates$labName, "_t"))
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
countData <- countData[, order(colnames(countData), decreasing = TRUE)]
ompData <- arrange(
  read.table(paste0(filePath, "tailedForwardPrimerPreparedOMPData",
                    "_1aMTarget.txt"),
             header = TRUE, sep = "\t", stringsAsFactors = FALSE),
  dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

# Second column completes the non-tailed/tailed table, which is then saved
unalikeabilityData1[, 2] <- unalikeability1(countData = countData)
unalikeabilityFile <- paste0(filePath, "forwardPrimer",
                             "Unalikeability_normal_amplicons.txt")
write.table(x = unalikeabilityData1, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)
unalikeabilityData2 <- unalikeability2(countData = countData)
unalikeabilityFile <- paste0(filePath, "tailedForwardPrimer",
                             "Unalikeability_normal_amplicons.txt")
write.table(x = unalikeabilityData2, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)

affinityData <- countData[match(affinityOrder, rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "tailedForwardPrimerHeatMap_normal_affinityOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(affinityData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# biasOrder is the one computed for the non-tailed forward primer data
biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "tailedForwardPrimerHeatMap_normal_biasOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(biasData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Step 3.2.3: Non-tailed reverse primer data file
unalikeabilityData1 <- data.frame("non-tailed" = NA, "tailed" = NA,
                                  check.names = FALSE)
inputFile <- read.table(paste0(filePath, "nonTailedReversePrimer",
                               "NormalisedCountData_normal_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
isolates <- read.table(paste0(filePath, "FMDV-5UTR_isolates.txt"),
                       header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id
isolateNames <- setNames(as.character(isolates$standardName),
                         paste0(isolates$labName, "_nt"))
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
countData <- countData[, order(colnames(countData), decreasing = TRUE)]
ompData <- arrange(
  read.table(paste0(filePath, "nonTailedReversePrimerPreparedOMPData",
                    "_1aMTarget.txt"),
             header = TRUE, sep = "\t", stringsAsFactors = FALSE),
  dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

# Bias score per primer variant: product over positions 5 to 1 of a
# per-position factor (weight for the first listed base, 1.00 for the second).
# The table is re-sorted after every position except the last, because those
# stable sorts decide the final order of variants with equal scores.
biasOrder <- data.frame(id = affinityOrder, bias = 1,
                        stringsAsFactors = FALSE)
biasSteps <- data.frame(position = c(5, 4, 3, 2, 1),
                        weighted = c("G", "T", "G", "T", "T"),
                        unweighted = c("A", "C", "A", "C", "C"),
                        weight = c(1.30, 1.25, 1.30, 1.25, 1.25),
                        stringsAsFactors = FALSE)
for (step in seq_len(nrow(biasSteps))) {
  nucleotide <- str_sub(biasOrder$id, start = biasSteps$position[step],
                        end = biasSteps$position[step])
  stepFactor <-
    (str_count(nucleotide, pattern = biasSteps$weighted[step]) *
       biasSteps$weight[step]) +
    (str_count(nucleotide, pattern = biasSteps$unweighted[step]) * 1.00)
  biasOrder$bias <- biasOrder$bias * stepFactor
  if (step < nrow(biasSteps)) {
    biasOrder <- biasOrder[order(biasOrder$bias), ]
  }
}
biasOrder <- arrange(biasOrder, bias)

unalikeabilityData1[, 1] <- unalikeability1(countData = countData)
unalikeabilityData2 <- unalikeability2(countData = countData)
unalikeabilityFile <- paste0(
  filePath, "nonTailedReversePrimerUnalikeability_normal_amplicons.txt")
write.table(x = unalikeabilityData2, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)

affinityData <- countData[match(affinityOrder, rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "nonTailedReversePrimerHeatMap_normal_affinityOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(affinityData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "nonTailedReversePrimerHeatMap_normal_biasOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(biasData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Step 3.2.4: Data from reactions containing tailed reverse primers
inputFile <- read.table(paste0(filePath, "tailedReversePrimer",
                               "NormalisedCountData_normal_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id
isolateNames <- as.character(isolates$standardName)
# Step 3.2.4 (continued): tailed reverse primer data, normal dNTP amplicons
names(isolateNames) <- paste0(isolates$labName, "_t")
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
countData <- countData[, order(colnames(countData), decreasing = TRUE)]
ompData <- arrange(
  read.table(paste0(filePath, "tailedReversePrimerPreparedOMPData",
                    "_1aMTarget.txt"),
             header = TRUE, sep = "\t", stringsAsFactors = FALSE),
  dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

# Second column completes the non-tailed/tailed table, which is then saved
unalikeabilityData1[, 2] <- unalikeability1(countData = countData)
unalikeabilityFile <- paste0(filePath, "reversePrimer",
                             "Unalikeability_normal_amplicons.txt")
write.table(x = unalikeabilityData1, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)
unalikeabilityData2 <- unalikeability2(countData = countData)
unalikeabilityFile <- paste0(
  filePath, "tailedReversePrimerUnalikeability_normal_amplicons.txt")
write.table(x = unalikeabilityData2, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)

affinityData <- countData[match(affinityOrder, rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "tailedReversePrimerHeatMap_normal_affinityOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(affinityData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# biasOrder is the one computed for the non-tailed reverse primer data
biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "tailedReversePrimerHeatMap_normal_biasOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(biasData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Step 3.3: Read CleanAmp dNTP data files containing normalised count data
#           (wide-format), calculate unalikeability coefficient and create
#           heatmap
# Step 3.3.1: Non-tailed forward primer data file
unalikeabilityData1 <- data.frame("non-tailed" = NA, "tailed" = NA,
                                  check.names = FALSE)
inputFile <- read.table(paste0(filePath, "nonTailedForwardPrimer",
                               "NormalisedCountData_CleanAmp_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
isolates <- read.table(paste0(filePath, "FMDV-5UTR_isolates.txt"),
                       header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id
# Named vector: names are the lab names + "_nt", values the standard names
isolateNames <- setNames(as.character(isolates$standardName),
                         paste0(isolates$labName, "_nt"))
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
countData <- countData[, order(colnames(countData), decreasing = TRUE)]
ompData <- arrange(
  read.table(paste0(filePath, "nonTailedForwardPrimerPreparedOMPData",
                    "_1aMTarget.txt"),
             header = TRUE, sep = "\t", stringsAsFactors = FALSE),
  dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

# Bias score per primer variant: product over positions 5 to 1 of a
# per-position factor (weight for the first listed base, 1.00 for the second).
# The table is re-sorted after every position except the last, because those
# stable sorts decide the final order of variants with equal scores.
biasOrder <- data.frame(id = affinityOrder, bias = 1,
                        stringsAsFactors = FALSE)
biasSteps <- data.frame(position = c(5, 4, 3, 2, 1),
                        weighted = c("G", "T", "G", "T", "T"),
                        unweighted = c("A", "C", "A", "C", "C"),
                        weight = c(1.30, 1.25, 1.30, 1.25, 1.25),
                        stringsAsFactors = FALSE)
for (step in seq_len(nrow(biasSteps))) {
  nucleotide <- str_sub(biasOrder$id, start = biasSteps$position[step],
                        end = biasSteps$position[step])
  stepFactor <-
    (str_count(nucleotide, pattern = biasSteps$weighted[step]) *
       biasSteps$weight[step]) +
    (str_count(nucleotide, pattern = biasSteps$unweighted[step]) * 1.00)
  biasOrder$bias <- biasOrder$bias * stepFactor
  if (step < nrow(biasSteps)) {
    biasOrder <- biasOrder[order(biasOrder$bias), ]
  }
}
biasOrder <- arrange(biasOrder, bias)

unalikeabilityData1[, 1] <- unalikeability1(countData = countData)
unalikeabilityData2 <- unalikeability2(countData = countData)
unalikeabilityFile <- paste0(
  filePath, "nonTailedForwardPrimerUnalikeability_CleanAmp_amplicons.txt")
write.table(x = unalikeabilityData2, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)

affinityData <- countData[match(affinityOrder, rownames(countData)), ]
heatMapFile <- paste0(
  filePath, "nonTailedForwardPrimerHeatMap_CleanAmp_affinityOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(affinityData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
heatMapFile <- paste0(filePath,
                      "nonTailedForwardPrimerHeatMap_CleanAmp_biasOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(biasData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Step 3.3.2: Data from reactions containing tailed forward primers
inputFile <- read.table(paste0(filePath, "tailedForwardPrimer",
                               "NormalisedCountData_CleanAmp_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id
isolateNames <- setNames(as.character(isolates$standardName),
                         paste0(isolates$labName, "_t"))
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
countData <- countData[, order(colnames(countData), decreasing = TRUE)]
ompData <- arrange(
  read.table(paste0(filePath, "tailedForwardPrimerPreparedOMPData",
                    "_1aMTarget.txt"),
             header = TRUE, sep = "\t", stringsAsFactors = FALSE),
  dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

unalikeabilityData1[, 2] <- unalikeability1(countData = countData)
unalikeabilityFile <- paste0(filePath, "forwardPrimer",
                             "Unalikeability_CleanAmp_amplicons.txt")
write.table(x = unalikeabilityData1, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)
unalikeabilityData2 <- unalikeability2(countData = countData)
unalikeabilityFile <- paste0(
  filePath, "tailedForwardPrimerUnalikeability_CleanAmp_amplicons.txt")
write.table(x = unalikeabilityData2, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)

affinityData <- countData[match(affinityOrder, rownames(countData)), ]
heatMapFile <- paste0(
  filePath, "tailedForwardPrimerHeatMap_CleanAmp_affinityOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(affinityData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# biasOrder is the one computed for the non-tailed forward primer data
biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
# Step 3.3.2 (continued): bias-ordered heat map for the tailed forward primer
#                         CleanAmp data (biasData was prepared just above)
heatMapFile <- paste0(filePath,
                      "tailedForwardPrimerHeatMap_CleanAmp_biasOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(biasData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Step 3.3.3: Non-tailed reverse primer data file
unalikeabilityData1 <- data.frame("non-tailed" = NA, "tailed" = NA,
                                  check.names = FALSE)
inputFile <- read.table(paste0(filePath, "nonTailedReversePrimer",
                               "NormalisedCountData_CleanAmp_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
isolates <- read.table(paste0(filePath, "FMDV-5UTR_isolates.txt"),
                       header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id
# Named vector: names are the lab names + "_nt", values the standard names
isolateNames <- setNames(as.character(isolates$standardName),
                         paste0(isolates$labName, "_nt"))
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
countData <- countData[, order(colnames(countData), decreasing = TRUE)]
ompData <- arrange(
  read.table(paste0(filePath, "nonTailedReversePrimerPreparedOMPData",
                    "_1aMTarget.txt"),
             header = TRUE, sep = "\t", stringsAsFactors = FALSE),
  dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

# Bias score per primer variant: product over positions 5 to 1 of a
# per-position factor (weight for the first listed base, 1.00 for the second).
# The table is re-sorted after every position except the last, because those
# stable sorts decide the final order of variants with equal scores.
biasOrder <- data.frame(id = affinityOrder, bias = 1,
                        stringsAsFactors = FALSE)
biasSteps <- data.frame(position = c(5, 4, 3, 2, 1),
                        weighted = c("G", "T", "G", "T", "T"),
                        unweighted = c("A", "C", "A", "C", "C"),
                        weight = c(1.30, 1.25, 1.30, 1.25, 1.25),
                        stringsAsFactors = FALSE)
for (step in seq_len(nrow(biasSteps))) {
  nucleotide <- str_sub(biasOrder$id, start = biasSteps$position[step],
                        end = biasSteps$position[step])
  stepFactor <-
    (str_count(nucleotide, pattern = biasSteps$weighted[step]) *
       biasSteps$weight[step]) +
    (str_count(nucleotide, pattern = biasSteps$unweighted[step]) * 1.00)
  biasOrder$bias <- biasOrder$bias * stepFactor
  if (step < nrow(biasSteps)) {
    biasOrder <- biasOrder[order(biasOrder$bias), ]
  }
}
biasOrder <- arrange(biasOrder, bias)

unalikeabilityData1[, 1] <- unalikeability1(countData = countData)
unalikeabilityData2 <- unalikeability2(countData = countData)
unalikeabilityFile <- paste0(
  filePath, "nonTailedReversePrimerUnalikeability_CleanAmp_amplicons.txt")
write.table(x = unalikeabilityData2, file = unalikeabilityFile, sep = "\t",
            row.names = FALSE, col.names = TRUE)

affinityData <- countData[match(affinityOrder, rownames(countData)), ]
heatMapFile <- paste0(
  filePath, "nonTailedReversePrimerHeatMap_CleanAmp_affinityOrdered.pdf")
pdf(heatMapFile, family = "Times New Roman", pointsize = 10, width = 8,
    height = 4)
print(levelplot(as.matrix(affinityData),
                scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(heatMapFile)

# Open the device for the bias-ordered heat map; the plot itself is printed
# by the statements that follow
biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
pdf(paste0(filePath,
           "nonTailedReversePrimerHeatMap_CleanAmp_biasOrdered.pdf"),
    family = "Times New Roman", pointsize = 10, width = 8, height = 4)
# Step 3.3.3 (continued): draw the bias-ordered heat map on the PDF device
#                         opened just before, then close it and embed fonts
print(levelplot(as.matrix(biasData), scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("non-tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(paste0(filePath,
                   "nonTailedReversePrimerHeatMap_CleanAmp_biasOrdered.pdf"))

# Step 3.3.4: Data from reactions containing tailed reverse primers
inputFile <- read.table(paste0(filePath, "tailedReversePrimer",
                               "NormalisedCountData_CleanAmp_amplicons.txt"),
                        header = TRUE, sep = "\t", stringsAsFactors = FALSE)
countData <- dplyr::select(inputFile, contains("FMDV"))
rownames(countData) <- inputFile$id
isolateNames <- as.character(isolates$standardName)
names(isolateNames) <- paste0(isolates$labName, "_t")
colnames(countData) <- str_replace_all(string = colnames(countData),
                                       pattern = isolateNames)
countData <- countData[, order(colnames(countData), decreasing = TRUE)]
ompData <- read.table(paste0(filePath, "tailedReversePrimerPreparedOMPData",
                             "_1aMTarget.txt"),
                      header = TRUE, sep = "\t", stringsAsFactors = FALSE)
ompData <- arrange(ompData, dG_eff, desc(Tm_eff))
affinityOrder <- gsub(pattern = ".*_", replacement = "", x = ompData$id)

# Second column completes the non-tailed/tailed table, which is then saved
unalikeabilityData1[, 2] <- unalikeability1(countData = countData)
write.table(x = unalikeabilityData1,
            file = paste0(filePath, "reversePrimer",
                          "Unalikeability_CleanAmp_amplicons.txt"),
            sep = "\t", row.names = FALSE, col.names = TRUE)
unalikeabilityData2 <- unalikeability2(countData = countData)
write.table(x = unalikeabilityData2,
            file = paste0(
              filePath,
              "tailedReversePrimerUnalikeability_CleanAmp_amplicons.txt"),
            sep = "\t", row.names = FALSE, col.names = TRUE)

affinityData <- countData[match(affinityOrder, rownames(countData)), ]
pdf(paste0(filePath,
           "tailedReversePrimerHeatMap_CleanAmp_affinityOrdered.pdf"),
    family = "Times New Roman", pointsize = 10, width = 8, height = 4)
print(levelplot(as.matrix(affinityData), scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to binding affinity)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(paste0(filePath,
                   "tailedReversePrimerHeatMap_CleanAmp_affinityOrdered.pdf"))

# biasOrder is the one computed for the non-tailed reverse primer data
biasData <- countData[match(rev(biasOrder$id), rownames(countData)), ]
pdf(paste0(filePath, "tailedReversePrimerHeatMap_CleanAmp_biasOrdered.pdf"),
    family = "Times New Roman", pointsize = 10, width = 8, height = 4)
print(levelplot(as.matrix(biasData), scales = list(x = list(rot = 90)),
                col.regions = heat.colors,
                xlab = paste0("tailed primer variant\n",
                              "(ordered according to synthesis bias)"),
                ylab = "FMDV isolate"))
dev.off()
embed_fonts(paste0(filePath,
                   "tailedReversePrimerHeatMap_CleanAmp_biasOrdered.pdf"))

###############################################################################
#                   R SCRIPTS USED TO ANALYSE ARTEFACTS                       #
###############################################################################

# Step 1: Load required packages, define helper function and set variables
library(Biostrings)
library(Rsamtools)
library(stringr)

# Score (Wilson) confidence interval for a single binomial proportion.
#
# Arguments:
#   n   number of observations (scalar)
#   p   observed proportion (scalar)
#   z   normal quantile of the confidence level (1.96 by default)
#   cc  TRUE applies the continuity correction, FALSE uses the plain formula
#
# Returns a list with elements lb (lower bound) and ub (upper bound), both on
# the proportion scale. With cc = TRUE the upper bound is set to 1 when
# p == 1 and the lower bound to 0 when p == 0.
#
# When the interval is undefined (n is missing or not positive, or p is
# missing, e.g. the 0 / 0 proportion of a library without any artefacts) both
# bounds are returned as NA instead of stopping the script in the p == 1 test.
scoreInterval <- function(n, p, z = 1.96, cc = TRUE) {
  if (is.na(n) || is.na(p) || n <= 0) {
    return(list(lb = NA_real_, ub = NA_real_))
  }
  out <- list()
  q <- 1 - p
  zsq <- z^2
  denom <- (2 * (n + zsq))
  if (cc) {
    numl <- (2 * n * p) + zsq - 1 -
      (z * sqrt(zsq - 2 - (1 / n) + 4 * p * ((n * q) + 1)))
    numu <- (2 * n * p) + zsq + 1 +
      (z * sqrt(zsq + 2 - (1 / n) + 4 * p * ((n * q) - 1)))
    out$lb <- numl / denom
    out$ub <- numu / denom
    if (p == 1) out$ub <- 1
    if (p == 0) out$lb <- 0
  } else {
    out$lb <- ((2 * n * p) + zsq - (z * sqrt(zsq + (4 * n * p * q)))) / denom
    out$ub <- ((2 * n * p) + zsq + (z * sqrt(zsq + (4 * n * p * q)))) / denom
  }
  out
}

inputPath <- "C:\\data_analysis\\HTS\\artefacts\\"
outputPath <- "C:\\data_analysis\\HTS\\artefacts\\summary\\"

# One small-artefact file per library and condition (C1 to C4), listed
# library by library
artefactLibraries <- c("FMDV-3", "FMDV-5", "FMDV-25", "FMDV-26", "FMDV-31",
                       "FMDV-40", "FMDV-51", "FMDV-Botswana", "FMDV-BUL32",
                       "FMDV-IND258", "FMDV-O1Brugge", "FMDV-Openghu")
artefactConditions <- c("C1", "C2", "C3", "C4")
artefactsName <- paste0(
  rep(artefactLibraries, each = length(artefactConditions)), "_",
  rep(artefactConditions, times = length(artefactLibraries)),
  "_usearchSmallArtefacts")

forwardReversePrimerName <- "FMDV-5UTR_forwardReversePrimerEnds"
forwardPrimerName <- "FMDV-5UTR_forwardPrimerEnds"
reversePrimerName <- "FMDV-5UTR_reversePrimerEnds"
noPrimerAmpliconQueries <- "FMDV-5UTR_noPrimerAmplicons"
nonTailedAmpliconQueries <- "FMDV-5UTR_nonTailedAmplicons"
tailedAmpliconQueries <- "FMDV-5UTR_tailedAmplicons"

# One row per artefact file; filled in by the classification loop
outputData <- data.frame(matrix(data = NA, nrow = length(artefactsName),
                                ncol = 6))
colnames(outputData) <- c("library", "forward/forward", "reverse/reverse",
                          "forward/reverse", "others", "total")

# Runs "usearch -search_pcr" for one primer set against the artefact FASTA
# file and returns the positions (within fastaScan) of the sequences listed in
# the first column of the hit table. Returns NULL when there are no hits.
findPrimerArtefactIds <- function(artefacts, primers, hits, fastaScan) {
  # A hit table left behind by an earlier run must not be read as the result
  # of this search
  if (file.exists(hits)) {
    file.remove(hits)
  }
  if (!(file.info(artefacts)$size == 0)) {
    usearchCommand <- paste0("usearch -search_pcr \"", artefacts, "\" -db \"",
                             primers, "\" -strand both -maxdiffs 0 ",
                             "-minamp 10 -maxamp 250 -pcrout \"", hits, "\"")
    system(usearchCommand)
  }
  if (!file.exists(hits) || file.info(hits)$size == 0) {
    return(NULL)
  }
  primerHits <- read.table(hits, header = FALSE, sep = "\t",
                           stringsAsFactors = FALSE)
  which(seqnames(fastaScan) %in% unique(primerHits[, 1]))
}

# Extracts the sequences at positions ids from the indexed FASTA file.
# An empty selection yields an empty set without querying the file.
extractArtefacts <- function(ids, fastaReference, fastaScan) {
  if (length(ids) == 0) {
    return(DNAStringSet())
  }
  getSeq(fastaReference, fastaScan[ids])
}

# Writes the sequences at positions ids to filepath and returns their number.
# ids = NULL (no primer hits) writes nothing and returns 0.
writeArtefactSubset <- function(ids, fastaReference, fastaScan, filepath) {
  if (is.null(ids)) {
    return(0)
  }
  sequences <- extractArtefacts(ids, fastaReference, fastaScan)
  writeXStringSet(x = sequences, filepath = filepath, append = FALSE,
                  compress = FALSE, format = "fasta")
  length(sequences)
}

for (i in seq_along(artefactsName)) {
  # Library label: the part of the file name up to the second underscore
  libraryName <- str_extract(string = artefactsName[i],
                             pattern = "([^_]*_[^_]*)")

  # Step 2: Read .fasta files containing small artefact sequences
  outputData$library[i] <- libraryName
  rawSequences <- readDNAStringSet(filepath = paste0(inputPath,
                                                     artefactsName[i],
                                                     ".fasta"),
                                   format = "fasta", use.names = TRUE)

  # Step 3: Remove remaining partial amplicons from raw sequences
  subjects <- paste0(inputPath, artefactsName[i], ".fasta")
  queries <- paste0(inputPath, noPrimerAmpliconQueries, ".fasta")
  artefacts <- paste0(outputPath, libraryName, "_artefacts.fasta")
  usearchCommand <- paste0("usearch -usearch_global \"", subjects,
                           "\" -db \"", queries,
                           "\" -strand both -id 0.90 -target_cov 0.90 ",
                           "-notmatched \"", artefacts, "\"")
  system(usearchCommand)
  artefactSequences <- readDNAStringSet(filepath = artefacts,
                                        format = "fasta", use.names = TRUE)

  # The artefact file has just been rewritten, so its index and the objects
  # derived from it are rebuilt for every library. Reusing an existing .fai
  # file would leave fastaReference/fastaScan pointing at the previous library
  # (or undefined) when the script is run again.
  indexFa(artefacts)
  fastaReference <- FaFile(artefacts)
  fastaScan <- scanFaIndex(artefacts)

  # Step 4: Select artefacts with forward primers on both ends based on last
  #         12 nucleotides of the forward primer
  primers <- paste0(inputPath, forwardPrimerName, ".fasta")
  hits <- paste0(outputPath, libraryName, "_forwardPrimerArtefacts.txt")
  forwardArtefacts <- paste0(outputPath, libraryName,
                             "_forwardPrimerArtefacts.fasta")
  forwardIds <- findPrimerArtefactIds(artefacts, primers, hits, fastaScan)
  outputData$`forward/forward`[i] <- writeArtefactSubset(
    forwardIds, fastaReference, fastaScan, forwardArtefacts)

  # Step 5: Select artefacts with reverse primers on both ends based on last
  #         12 nucleotides of the reverse primer
  primers <- paste0(inputPath, reversePrimerName, ".fasta")
  hits <- paste0(outputPath, libraryName, "_reversePrimerArtefacts.txt")
  reverseArtefacts <- paste0(outputPath, libraryName,
                             "_reversePrimerArtefacts.fasta")
  reverseIds <- findPrimerArtefactIds(artefacts, primers, hits, fastaScan)
  outputData$`reverse/reverse`[i] <- writeArtefactSubset(
    reverseIds, fastaReference, fastaScan, reverseArtefacts)

  # Step 6: Select artefacts with forward/reverse primers on ends based on
  #         last 12 nucleotides of the forward and reverse primers
  primers <- paste0(inputPath, forwardReversePrimerName, ".fasta")
  hits <- paste0(outputPath, libraryName,
                 "_forwardReversePrimerArtefacts.txt")
  forwardReverseArtefacts <- paste0(outputPath, libraryName,
                                    "_forwardReversePrimerArtefacts.fasta")
  forwardReverseIds <- findPrimerArtefactIds(artefacts, primers, hits,
                                             fastaScan)
  if (!is.null(forwardReverseIds)) {
    # Drop sequences already counted as forward/forward or reverse/reverse.
    # setdiff() keeps the sequence positions themselves; which(!(x %in% y))
    # would return positions within x and so select the wrong sequences.
    forwardReverseIds <- setdiff(forwardReverseIds,
                                 c(forwardIds, reverseIds))
  }
  outputData$`forward/reverse`[i] <- writeArtefactSubset(
    forwardReverseIds, fastaReference, fastaScan, forwardReverseArtefacts)

  # Step 7: Remove contaminant sequences from remaining artefact sequences
  remainingArtefacts <- paste0(outputPath, "remainingArtefacts.fasta")
  remainingIds <- setdiff(seq_len(length(fastaScan)),
                          c(forwardIds, reverseIds, forwardReverseIds))
  remainingArtefactSequences <- extractArtefacts(remainingIds, fastaReference,
                                                 fastaScan)
  writeXStringSet(x = remainingArtefactSequences,
                  filepath = remainingArtefacts, append = FALSE,
                  compress = FALSE, format = "fasta")
  # C1 and C3 are searched against the non-tailed amplicons, every other
  # condition against the tailed amplicons
  condition <- gsub(".*_(.*)_.*", "\\1", artefactsName[i])
  if (condition %in% c("C1", "C3")) {
    queries <- paste0(inputPath, nonTailedAmpliconQueries, ".fasta")
  } else {
    queries <- paste0(inputPath, tailedAmpliconQueries, ".fasta")
  }
  contaminants <- paste0(outputPath, libraryName, "_contaminants.fasta")
  otherArtefacts <- paste0(outputPath, libraryName, "_otherArtefacts.fasta")
  # Sequences matching the amplicon queries are kept as "other" artefacts,
  # those without a match are written to the contaminants file
  usearchCommand <- paste0("usearch -usearch_global \"", remainingArtefacts,
                           "\" -db \"", queries,
                           "\" -strand both -id 0.50 -target_cov 0.15 ",
                           "-notmatched \"", contaminants,
                           "\" -matched \"", otherArtefacts, "\"")
  system(usearchCommand)
  if (file.exists(otherArtefacts) &&
      !(file.info(otherArtefacts)$size == 0)) {
    otherArtefactSequences <- readDNAStringSet(filepath = otherArtefacts,
                                               format = "fasta",
                                               use.names = TRUE)
    outputData$others[i] <- length(otherArtefactSequences)
  } else {
    outputData$others[i] <- 0
  }
  outputData$total[i] <- sum(outputData[i, 2:5])

  # Step 8: Remove temporary fasta files
  file.remove(remainingArtefacts)
}

# Step 9: Prepare summary tables
write.table(x = outputData,
            file = paste0(outputPath, "artefactsAnalysis.txt"),
            sep = "\t", row.names = FALSE)
normalData <- data.frame(matrix(data = NA,
                                nrow = (length(artefactsName) / 2),
                                ncol = 7))
colnames(normalData) <- c("library", "tailing", "forward/forward",
                          "reverse/reverse", "forward/reverse", "others",
                          "total")
cleanAmpData <- data.frame(matrix(data = NA,
                                  nrow = (length(artefactsName) / 2),
                                  ncol = 7))
colnames(cleanAmpData) <- c("library", "tailing", "forward/forward",
"reverse/reverse", "forward/reverse", "others", "total") normalCounter <- 0 cleanAmpCounter <- 0 for (i in 1:nrow(outputData)) { library <- gsub("(.*)_.*", "\\1", outputData$library[i]) condition <- gsub(".*_(.*)", "\\1", outputData$library[i]) if (condition == "C1") { normalCounter <- normalCounter + 1 normalData$library[normalCounter] <- library normalData$tailing[normalCounter] <- "non-tailed" total <- outputData$total[i] proportion <- outputData$'forward/forward'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'forward/forward'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'reverse/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'reverse/reverse'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'forward/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'forward/reverse'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'others'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'others'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") normalData$total[normalCounter] <- outputData$total[i] } else if (condition == "C2") { normalCounter <- normalCounter + 1 normalData$library[normalCounter] <- library 
normalData$tailing[normalCounter] <- "tailed" total <- outputData$total[i] proportion <- outputData$'forward/forward'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'forward/forward'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'reverse/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'reverse/reverse'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'forward/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'forward/reverse'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'others'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) normalData$'others'[normalCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") normalData$total[normalCounter] <- outputData$total[i] } else if (condition == "C3") { cleanAmpCounter <- cleanAmpCounter + 1 cleanAmpData$library[cleanAmpCounter] <- library cleanAmpData$tailing[cleanAmpCounter] <- "non-tailed" total <- outputData$total[i] proportion <- outputData$'forward/forward'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'forward/forward'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * 
as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'reverse/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'reverse/reverse'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'forward/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'forward/reverse'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'others'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'others'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") cleanAmpData$total[cleanAmpCounter] <- outputData$total[i] } else if (condition == "C4") { cleanAmpCounter <- cleanAmpCounter + 1 cleanAmpData$library[cleanAmpCounter] <- library cleanAmpData$tailing[cleanAmpCounter] <- "tailed" total <- outputData$total[i] proportion <- outputData$'forward/forward'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'forward/forward'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'reverse/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'reverse/reverse'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), 
digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'forward/reverse'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'forward/reverse'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") proportion <- outputData$'others'[i] / total interval <- scoreInterval(n = total, p = proportion, z = 1.96, cc = TRUE) cleanAmpData$'others'[cleanAmpCounter] <- paste0( round(x = (100 * proportion), digits = 2), " (", round(x = (100 * as.numeric(interval[1])), digits = 2), "-", round(x = (100 * as.numeric(interval[2])), digits = 2), ")") cleanAmpData$total[cleanAmpCounter] <- outputData$total[i] } } write.table(x = normalData, file = paste0(outputPath, "normaldNTPArtefactsAnalysis.txt"), sep = "\t", row.names = FALSE) write.table(x = cleanAmpData, file = paste0(outputPath, "cleanAmpdNTPArtefactsAnalysis.txt"), sep = "\t", row.names = FALSE)