#CNV based analysis
# Read in the three input tables (paths are relative to the working directory).
# Strings are kept as character vectors so gene symbols can be matched with
# %in% further down.

# Disease CNV list: CNV locations and gene names. The script reads the
# GeneSymbol, Merged.CNV.ID and CNV.Size columns from it.
CNVlist <- read.csv("DiseaseCNVList.csv", header = TRUE, stringsAsFactors = FALSE)

# NSC gene lists (with the protocadherin cluster removed). The script reads
# the StMA.All and AllAssayedExpressedGenes columns from it.
nsc <- read.csv("NSCGenelist.csv", header = TRUE, stringsAsFactors = FALSE)

# Control CNV list; read with the same three columns as the disease list.
control <- read.csv("dbVAR.csv", header = TRUE, stringsAsFactors = FALSE)


# Control-side counts for the observed Fisher's exact tests.
# Edit these four values to match the control dataset in use.

# Gene-level comparison
ControlgenesDisease <- 9        # control column, first cell of the gene table
ControlgenesNoDisease <- 2772   # control column, second cell of the gene table

# CNV-level comparison
ControlCNVDisease <- 9          # control column, first cell of the CNV table
ControlCNVNoDisease <- 431      # control column, second cell of the CNV table


# Build the StMA lookup list from the NSC table, discarding blank (and NA)
# cells left by shorter columns in the CSV.
genelist <- nsc$StMA.All
genelist <- genelist[!is.na(genelist) & genelist != ""]

# Logical mask over the disease CNV rows: TRUE where the row's gene symbol is
# one of the StMA genes.
x <- is.element(CNVlist$GeneSymbol, genelist)



# Observed overlap between the StMA gene list and the disease CNV list.
# Multiple rows may be present for one gene symbol (e.g. isoforms), so the
# matching rows are subset first and then de-duplicated two ways: once per
# gene symbol and once per merged CNV.
MAhits <- CNVlist[x, ]                                            # rows matching an StMA gene
MAgenehits <- MAhits[!duplicated(MAhits$GeneSymbol), ]            # one row per StMA gene
MAcnvhits <- MAhits[!duplicated(MAhits$Merged.CNV.ID), ]          # one row per CNV holding an StMA gene
totalmappedgenes <- CNVlist[!duplicated(CNVlist$GeneSymbol), ]    # one row per mapped gene
totalcnvsmapped <- CNVlist[!duplicated(CNVlist$Merged.CNV.ID), ]  # one row per CNV

# Gene-level counts
StMAgenes <- nrow(MAgenehits)              # StMA genes found in disease CNVs
TotalGenes <- nrow(totalmappedgenes)       # all genes found in disease CNVs
genedensity <- StMAgenes / TotalGenes      # fraction of mapped genes that are StMA genes

# CNVs containing at least one StMA gene
MACNV <- nrow(MAcnvhits)                   # number of such CNVs
MACNVsize <- sum(MAcnvhits$CNV.Size)       # their summed size
MACNVdensity <- MACNVsize / MACNV          # their mean size

# All CNVs in the disease list
TotalCNV <- nrow(totalcnvsmapped)              # number of CNVs
TotalCNVsize <- sum(totalcnvsmapped$CNV.Size)  # summed CNV size
TotalCNVdensity <- TotalCNVsize / TotalCNV     # mean CNV size

GeneCNVDensity <- (TotalGenes / TotalCNVsize) * 1000000    # genes per megabase of CNV
MAGeneCNVDensity <- (StMAgenes / TotalCNVsize) * 1000000   # StMA genes per megabase of CNV

# Fisher's exact test on each 2x2 table. Column 1 is the disease list
# (StMA hits, remainder); column 2 holds the control counts defined above.
# The chi-squared results are printed explicitly so they are shown however the
# script is run; use them only if no approximation warning is given.
ft <- matrix(
  c(StMAgenes, TotalGenes - StMAgenes, ControlgenesDisease, ControlgenesNoDisease),
  nrow = 2
)
fishergenes <- fisher.test(ft)
print(chisq.test(ft))

ft <- matrix(
  c(MACNV, TotalCNV - MACNV, ControlCNVDisease, ControlCNVNoDisease),
  nrow = 2
)
fishercnv <- fisher.test(ft)
print(chisq.test(ft))

# Collect the observed results into a one-row data frame.
d <- data.frame(TotalGenes, StMAgenes, genedensity, fishergenes$p.value, TotalCNVsize, TotalCNVdensity, TotalCNV, MACNV, fishercnv$p.value, MACNVdensity, GeneCNVDensity, MAGeneCNVDensity)

print(d)   # display the results




#########################
### permutation code: ###
#########################
# Permutation test: draw random gene lists of the same length as the StMA list
# from the assayed/expressed gene population, and recompute the overlap
# statistics against both the disease and the control CNV lists.

GenePopulation <- nsc$AllAssayedExpressedGenes                   # genes to sample from
GenePopulation <- subset(GenePopulation, GenePopulation != "")   # as a backup, drop blank entries

n_perm <- 10000   # number of random gene lists to draw

# sample() draws without replacement, so the population must be large enough.
stopifnot(length(genelist) <= length(GenePopulation))

# Column layout shared by both result tables.
stat_names <- c(
  "TotalGenes", "StMAgenes", "genedensity", "fishergenes",
  "TotalCNVsize", "TotalCNVdensity", "TotalCNV", "MACNV",
  "fishercnv", "MACNVdensity", "GeneCNVDensity", "MAGeneCNVDensity"
)

# Totals for one CNV table; these do not depend on the sampled gene list.
cnv_totals <- function(cnv) {
  unique_cnvs <- cnv[!duplicated(cnv$Merged.CNV.ID), ]
  list(
    genes = sum(!duplicated(cnv$GeneSymbol)),   # distinct mapped genes
    cnvs = nrow(unique_cnvs),                   # distinct CNVs
    cnv_size = sum(unique_cnvs$CNV.Size)        # summed size of distinct CNVs
  )
}

# Hit counts for one CNV table against one sampled gene list. Multiple rows
# may be present for one gene symbol (e.g. isoforms), so hits are counted per
# distinct gene symbol and per distinct CNV.
cnv_hit_counts <- function(cnv, genes) {
  hits <- cnv[cnv$GeneSymbol %in% genes, ]
  cnv_hits <- hits[!duplicated(hits$Merged.CNV.ID), ]
  list(
    genes = sum(!duplicated(hits$GeneSymbol)),  # distinct sampled genes found
    cnvs = nrow(cnv_hits),                      # distinct CNVs containing a hit
    cnv_size = sum(cnv_hits$CNV.Size)           # summed size of those CNVs
  )
}

# One result row, in the order given by stat_names.
perm_row <- function(totals, hits, p_genes, p_cnv) {
  c(
    totals$genes,
    hits$genes,
    hits$genes / totals$genes,
    p_genes,
    totals$cnv_size,
    totals$cnv_size / totals$cnvs,
    totals$cnvs,
    hits$cnvs,
    p_cnv,
    hits$cnv_size / hits$cnvs,
    (totals$genes / totals$cnv_size) * 1000000,
    (hits$genes / totals$cnv_size) * 1000000
  )
}

disease_totals <- cnv_totals(CNVlist)
control_totals <- cnv_totals(control)

# Preallocated storage: one row per permutation for the disease (test) and
# control datasets.
permtest_mat <- matrix(
  NA_real_,
  nrow = n_perm, ncol = length(stat_names),
  dimnames = list(NULL, stat_names)
)
permcontrol_mat <- permtest_mat

set.seed(1)

for (i in seq_len(n_perm)) {
  Permgenelist <- sample(GenePopulation, length(genelist))   # random sample of assayed genes

  disease_hits <- cnv_hit_counts(CNVlist, Permgenelist)   # hits in the disease CNV list
  control_hits <- cnv_hit_counts(control, Permgenelist)   # hits in the control CNV list

  # Fisher's exact test, disease (column 1) versus control (column 2):
  # rows are (hits, non-hits).
  gene_table <- matrix(
    c(
      disease_hits$genes, disease_totals$genes - disease_hits$genes,
      control_hits$genes, control_totals$genes - control_hits$genes
    ),
    nrow = 2
  )
  p_genes <- fisher.test(gene_table)$p.value

  # The control non-hit cell is the control total minus the *control* hits.
  cnv_table <- matrix(
    c(
      disease_hits$cnvs, disease_totals$cnvs - disease_hits$cnvs,
      control_hits$cnvs, control_totals$cnvs - control_hits$cnvs
    ),
    nrow = 2
  )
  p_cnv <- fisher.test(cnv_table)$p.value

  # Both tables record the same pair of p-values, since each test compares
  # the disease list with the control list.
  permtest_mat[i, ] <- perm_row(disease_totals, disease_hits, p_genes, p_cnv)
  permcontrol_mat[i, ] <- perm_row(control_totals, control_hits, p_genes, p_cnv)
}

permtest <- as.data.frame(permtest_mat)
permcontrol <- as.data.frame(permcontrol_mat)

# Summary statistics of the permutation test - median values can be used for
# the ChiSq or Fisher's exact test.
print(summary(permtest))
print(summary(permcontrol))

# Check the gene-level p-values after multiple testing correction.
print(summary(p.adjust(p = permtest$fishergenes, method = "bonferroni", n = n_perm)))