# CNV based analysis ----
#
# Counts how many genes from the StMA gene list fall inside the
# disease-associated CNV list, compares those counts against control counts
# with Fisher's exact test, and then repeats the comparison for random gene
# lists of the same size drawn from the assayed/expressed gene population.
#
# Input columns used by this script:
#   DiseaseCNVList.csv / dbVAR.csv : GeneSymbol, Merged.CNV.ID, CNV.Size
#   NSCGenelist.csv                : StMA.All, AllAssayedExpressedGenes

# Read in files ----
# CNV list file containing CNV locations and gene names
CNVlist <- read.csv("DiseaseCNVList.csv", header = TRUE,
                    stringsAsFactors = FALSE)
# NSC gene list (with protocadherin cluster removed)
nsc <- read.csv("NSCGenelist.csv", header = TRUE, stringsAsFactors = FALSE)
# Control CNV list
control <- read.csv("dbVAR.csv", header = TRUE, stringsAsFactors = FALSE)

# Control data (for Fisher's exact test) - place appropriate values in here
ControlgenesDisease <- 9      # for gene number comparisons
ControlgenesNoDisease <- 2772 # for gene number comparisons
ControlCNVDisease <- 9        # CNV numbers
ControlCNVNoDisease <- 431    # CNV numbers

# Observed data ----
# Set up the lookup gene list and remove blank entries
genelist <- nsc$StMA.All
genelist <- subset(genelist, genelist != "")

# Measure overlap of the StMA gene list with disease-associated gene symbols
x <- CNVlist$GeneSymbol %in% genelist

# Since multiple hits for one gene symbol may be present (e.g. isoforms),
# subset the data and process the subset.
MAhits <- CNVlist[x, ]
# One row per matched gene symbol
MAgenehits <- MAhits[!duplicated(MAhits$GeneSymbol), ]
# Same matches, but one row per CNV instead of per gene
MAcnvhits <- MAhits[!duplicated(MAhits$Merged.CNV.ID), ]
# All mapped genes in CNVs (one row per gene symbol)
totalmappedgenes <- CNVlist[!duplicated(CNVlist$GeneSymbol), ]
# All CNVs that contain genes (one row per CNV)
totalcnvsmapped <- CNVlist[!duplicated(CNVlist$Merged.CNV.ID), ]

StMAgenes <- nrow(MAgenehits)                # MA gene hits
TotalGenes <- nrow(totalmappedgenes)         # total number of genes
genedensity <- StMAgenes / TotalGenes        # MA genes vs total genes
MACNV <- nrow(MAcnvhits)                     # number of CNVs with MA genes
MACNVsize <- sum(MAcnvhits$CNV.Size)         # total MA CNV size
MACNVdensity <- MACNVsize / MACNV            # mean size of CNVs with MA genes
TotalCNV <- nrow(totalcnvsmapped)            # total number of CNVs
TotalCNVsize <- sum(totalcnvsmapped$CNV.Size) # total CNV length
TotalCNVdensity <- TotalCNVsize / TotalCNV   # CNV mean size
# Total genes per megabase of CNV
GeneCNVDensity <- (TotalGenes / TotalCNVsize) * 1000000
# StMA genes per megabase of CNV
MAGeneCNVDensity <- (StMAgenes / TotalCNVsize) * 1000000

# Fisher's exact test on each (2 x 2 tables are filled column by column:
# column 1 = disease hit / no hit, column 2 = control hit / no hit)
ft <- matrix(
  c(StMAgenes, TotalGenes - StMAgenes,
    ControlgenesDisease, ControlgenesNoDisease),
  nrow = 2
)
fishergenes <- fisher.test(ft)
# Carry out ChiSq test to console; use if no approximation warnings are given
chisq.test(ft)

ft <- matrix(
  c(MACNV, TotalCNV - MACNV, ControlCNVDisease, ControlCNVNoDisease),
  nrow = 2
)
fishercnv <- fisher.test(ft)
# Carry out ChiSq test to console; use if no approximation warnings are given
chisq.test(ft)

# Output into a data frame which could be incremented if required
d <- data.frame(TotalGenes, StMAgenes, genedensity, fishergenes$p.value,
                TotalCNVsize, TotalCNVdensity, TotalCNV, MACNV,
                fishercnv$p.value, MACNVdensity, GeneCNVDensity,
                MAGeneCNVDensity)
d # Display the results

#########################
### permutation code: ###
#########################

# List of genes to sample from; as a backup, drop any blank entries
GenePopulation <- nsc$AllAssayedExpressedGenes
GenePopulation <- subset(GenePopulation, GenePopulation != "")

# sample() draws without replacement, so the population must be large enough
stopifnot(length(genelist) <= length(GenePopulation))

n_permutations <- 10000

# Storage for the per-iteration statistics of the test (disease) and control
# datasets. Preallocated as numeric matrices and converted to data frames
# after the loop; growing a data frame one row at a time is very slow.
stat_names <- c("TotalGenes", "StMAgenes", "genedensity", "fishergenes",
                "TotalCNVsize", "TotalCNVdensity", "TotalCNV", "MACNV",
                "fishercnv", "MACNVdensity", "GeneCNVDensity",
                "MAGeneCNVDensity")
permtest_mat <- matrix(NA_real_, nrow = n_permutations,
                       ncol = length(stat_names),
                       dimnames = list(NULL, stat_names))
permcontrol_mat <- permtest_mat

# Quantities that do not depend on the sampled gene list are computed once.
# Disease CNV dataset:
xtotalmappedgenes <- CNVlist[!duplicated(CNVlist$GeneSymbol), ]
xtotalcnvsmapped <- CNVlist[!duplicated(CNVlist$Merged.CNV.ID), ]
xTotalGenes <- nrow(xtotalmappedgenes)           # total number of genes
xTotalCNV <- nrow(xtotalcnvsmapped)              # total number of CNVs
xTotalCNVsize <- sum(xtotalcnvsmapped$CNV.Size)  # total CNV length
xTotalCNVdensity <- xTotalCNVsize / xTotalCNV    # CNV mean size
xGeneCNVDensity <- (xTotalGenes / xTotalCNVsize) * 1000000
# Control CNV dataset:
ytotalmappedgenes <- control[!duplicated(control$GeneSymbol), ]
ytotalcnvsmapped <- control[!duplicated(control$Merged.CNV.ID), ]
yTotalGenes <- nrow(ytotalmappedgenes)           # total number of genes
yTotalCNV <- nrow(ytotalcnvsmapped)              # total number of CNVs
yTotalCNVsize <- sum(ytotalcnvsmapped$CNV.Size)  # total CNV length
yTotalCNVdensity <- yTotalCNVsize / yTotalCNV    # CNV mean size
yGeneCNVDensity <- (yTotalGenes / yTotalCNVsize) * 1000000

set.seed(1)
permresults <- numeric() # not used below; kept in case later code expects it

for (n in seq_len(n_permutations)) {
  # Permutation: random sample of assayed genes, same size as the StMA list
  Permgenelist <- sample(GenePopulation, length(genelist))
  x <- CNVlist$GeneSymbol %in% Permgenelist # rows mapping to disease CNVs
  y <- control$GeneSymbol %in% Permgenelist # rows mapping to control CNVs

  # Since multiple hits for one gene symbol may be present (e.g. isoforms),
  # subset the data and process the subset.
  xMAhits <- CNVlist[x, ]
  xMAgenehits <- xMAhits[!duplicated(xMAhits$GeneSymbol), ]   # one per gene
  xMAcnvhits <- xMAhits[!duplicated(xMAhits$Merged.CNV.ID), ] # one per CNV

  xStMAgenes <- nrow(xMAgenehits)            # MA gene hits
  xgenedensity <- xStMAgenes / xTotalGenes   # MA genes vs total genes
  xMACNV <- nrow(xMAcnvhits)                 # number of CNVs with MA genes
  xMACNVsize <- sum(xMAcnvhits$CNV.Size)     # total MA CNV size
  xMACNVdensity <- xMACNVsize / xMACNV       # mean size of CNVs with MA genes
  xMAGeneCNVDensity <- (xStMAgenes / xTotalCNVsize) * 1000000

  # Now do the same processing for the control CNV dataset
  yMAhits <- control[y, ]
  yMAgenehits <- yMAhits[!duplicated(yMAhits$GeneSymbol), ]   # one per gene
  yMAcnvhits <- yMAhits[!duplicated(yMAhits$Merged.CNV.ID), ] # one per CNV

  yStMAgenes <- nrow(yMAgenehits)            # MA gene hits
  ygenedensity <- yStMAgenes / yTotalGenes   # MA genes vs total genes
  yMACNV <- nrow(yMAcnvhits)                 # number of CNVs with MA genes
  yMACNVsize <- sum(yMAcnvhits$CNV.Size)     # total MA CNV size
  yMACNVdensity <- yMACNVsize / yMACNV       # mean size of CNVs with MA genes
  yMAGeneCNVDensity <- (yStMAgenes / yTotalCNVsize) * 1000000

  # Fisher's exact test on each
  ft <- matrix(
    c(xStMAgenes, xTotalGenes - xStMAgenes,
      yStMAgenes, yTotalGenes - yStMAgenes),
    nrow = 2
  )
  Permfishergenes <- fisher.test(ft)

  # The control "no hit" cell must subtract the control hit count (yMACNV).
  # Subtracting xMACNV here mixed the two datasets and could even produce a
  # negative cell, which fisher.test() rejects.
  ft <- matrix(
    c(xMACNV, xTotalCNV - xMACNV, yMACNV, yTotalCNV - yMACNV),
    nrow = 2
  )
  Permfishercnv <- fisher.test(ft)

  # Store the results from this iteration. Both tables carry the same two
  # p-values, since each test compares the disease and control datasets.
  permtest_mat[n, ] <- c(
    xTotalGenes, xStMAgenes, xgenedensity, Permfishergenes$p.value,
    xTotalCNVsize, xTotalCNVdensity, xTotalCNV, xMACNV,
    Permfishercnv$p.value, xMACNVdensity, xGeneCNVDensity, xMAGeneCNVDensity
  )
  permcontrol_mat[n, ] <- c(
    yTotalGenes, yStMAgenes, ygenedensity, Permfishergenes$p.value,
    yTotalCNVsize, yTotalCNVdensity, yTotalCNV, yMACNV,
    Permfishercnv$p.value, yMACNVdensity, yGeneCNVDensity, yMAGeneCNVDensity
  )
}

permtest <- as.data.frame(permtest_mat)
permcontrol <- as.data.frame(permcontrol_mat)

# Summary statistics of the permutation test - median values can be used for
# ChiSq or Fisher's exact test
summary(permtest)
summary(permcontrol)
# Check p-values after multiple testing correction
summary(p.adjust(p = permtest$fishergenes, method = "bonferroni",
                 n = n_permutations))