# Script to analyse mouse and human data (RNA-seq and microarray).
# For Cytoscape: Cytoscape has to be installed and open, with the CytoscapeRPC
# plugin installed and activated.
# For Circos: only text files are created; Circos has to be run separately.

# Used libraries
library("corpcor")
library("plyr")
library("reshape")
library("gplots")
library("lattice")
library("latticeExtra")
library("preprocessCore")
library("xtable")
library("RCytoscape")

cy <- CytoscapeConnection()
pluginVersion(cy)

# Colors used for graphs.
# 1: background color for all graphs, 2: foreground color for all graphs
# (E6E6E6), 3: fill for histograms, 4: red, for boxplots, 5: blue, for
# boxplots, 6: green, for boxplots, 7: light gray
my.col <- colorRampPalette(
  c("#FFFFFF", "black", "blue", "#FA8072", "#00A2FF", "#00CC00", "#E0E0E0")
)(7)

############
#Parameters#
############
# This section has to be changed according to the analysis to run.

### Set folders
setwd("~/Folder with data for analysis")

### Choose organism and data set for analysis
organism <- "Mus" # "Hum" or "Mus"
# Brawand, ENCODE, Bgee for Mouse; Brawand, Fagerberg, Bgee for Human
expDataSource <- "ENCODE"
add <- ""

# Correlation methods used for Cytoscape and Circos
corMethod <- "spearman" # "pearson" or "spearman"
partial <- TRUE # TRUE or FALSE

# Scaling factor applied to expression values before log2.
if (expDataSource %in% c("ENCODE", "Brawand", "Fagerberg", "Yu")) {
  expNorm <- 1000000 # To have all values bigger than 1
} else if (expDataSource == "Bgee") {
  expNorm <- 1
} else {
  # Previously expNorm was silently left undefined and the script failed much
  # later with "object 'expNorm' not found".
  stop("Unknown expDataSource: ", expDataSource, call. = FALSE)
}

if (organism == "Mus") {
  folder <- "~/Folder with data for analysis"
  folderAnalysis <- "~/Folder for analysis"
  organismName <- "Mouse"
  ortOrganism <- "Rat"
  dataSource <- "EnsV69"
} else if (organism == "Hum") {
  folder <- "~/Folder with data for analysis"
  folderAnalysis <- "~/Folder for analysis"
  organismName <- "Human"
  ortOrganism <- "Chp"
  dataSource <- "EnsV69"
} else {
  stop("Unknown organism: ", organism, " (expected \"Mus\" or \"Hum\")",
       call. = FALSE)
}

# Tissue names for the different organisms and data sets.
# `names` are the short tissue identifiers, `print` the labels used in plots.
# The Bgee tissue set is the same for mouse and human.
tissueSets <- list(
  Mus.ENCODE = list(
    names = c("cerebellum", "cortex", "heart", "kidney", "liver", "lung",
              "placenta", "smintestine", "spleen", "testis", "thymus",
              "adrenal", "bladder", "colon", "duodenum", "flobe", "gfat",
              "lgintestine", "mamgland", "ovary", "sfat", "stomach"),
    print = c("Cerebellum", "Cortex", "Heart", "Kidney", "Liver", "Lung",
              "Placenta", "Small Intestine", "Spleen", "Testis", "Thymus",
              "Adrenal", "Bladder", "Colon", "Duodenum", "Frontal Lobe",
              "Genital Fat Pad", "Large Intestine", "Mammary Gland", "Ovary",
              "Subcutaneous Fat Pad", "Stomach")
  ),
  Mus.Brawand = list(
    names = c("brain", "cerebellum", "heart", "kidney", "liver", "testis"),
    print = c("Brain", "Cerebellum", "Heart", "Kidney", "Liver", "Testis")
  ),
  Hum.Fagerberg = list(
    names = c("colon", "kidney", "liver", "pancreas", "lung", "prostate",
              "brain", "stomach", "spleen", "lymphnode", "appendix", "smint",
              "adrenal", "duodenum", "fat", "endometrium", "placenta",
              "testis", "gbladder", "ubladder", "thyroid", "esophagus",
              "heart", "skin", "ovary", "bonem", "sgland"),
    print = c("Colon", "Kidney", "Liver", "Pancreas", "Lung", "Prostate",
              "Brain", "Stomach", "Spleen", "Lymph Node", "Appendix",
              "Small Intestine", "Adrenal", "Duodenum", "Fat", "Endometrium",
              "Placenta", "Testis", "Gallbladder", "Urinal Bladder",
              "Thyroid", "Esophagus", "Heart", "Skin", "Ovary", "Bone Marrow",
              "Salivary Gland")
  ),
  Hum.Brawand = list(
    names = c("fcortex", "pcortex", "tlobe", "cerebellum", "heart", "kidney",
              "liver", "testis"),
    print = c("Frontal Cortex", "Prefrontal Cortex", "Temporal Lobe",
              "Cerebellum", "Heart", "Kidney", "Liver", "Testis")
  ),
  Bgee = list(
    names = c("liver", "kidney", "testis", "blood", "lung", "colon",
              "hcampus", "cortex", "placenta", "spleen", "ovary", "muscle",
              "salivary", "marrow", "skin", "spinal", "thymus", "adrenal",
              "hypothalamus", "pituitary", "duodenum", "cerebellum"),
    print = c("Liver", "Kidney", "Testis", "Blood", "Lung", "Colon",
              "Hippocampus", "Cortex", "Placenta", "Spleen", "Ovary",
              "Muscle", "Salivary Gland", "Bone Marrow", "Skin",
              "Spinal Cord", "Thymus", "Adrenal", "Hypothalamus",
              "Pituitary Gland", "Duodenum", "Cerebellum")
  )
)

tissueKey <- if (expDataSource == "Bgee") {
  "Bgee"
} else {
  paste(organism, expDataSource, sep = ".")
}
tissueSet <- tissueSets[[tissueKey]]
if (is.null(tissueSet)) {
  stop("No tissue definition for organism \"", organism,
       "\" with expDataSource \"", expDataSource, "\"", call. = FALSE)
}
tissuesNames <- tissueSet$names
tissuesPrintNames <- tissueSet$print
# Column names of the per-tissue averaged expression values.
tissuesRPKMNames <- paste0("Averaged.RPKM.", tissuesNames)

# Number of tissues
nTissues <- length(tissuesNames)
yLim <- 1000 # for Tau comparison
# Correction term for Circos picture representation
correctionTerm <- 8 * nTissues

############
#Input data#
############
## Filters: in Genes "protein_coding"
## Save: in CSV and Unique results only

# Gene information
# Ensembl Gene ID, Ensembl Transcript ID, Associated Gene Name, % GC content
# (the closing parenthesis of this call was missing)
orgGenes <- read.delim("~/Gene strucuture data")

# Ortholog information
# Ensembl Gene ID, Ensembl Transcript ID, Ortholog Ensembl Gene ID, dN, dS,
# Homology Type
# NOTE(review): the original call was cut off after the file name (unbalanced
# parenthesis). `sep`/`header` are assumed to match the other tab-separated
# inputs; a header is required because `Homology.Type` is accessed by name
# below. Confirm against the real input file.
orgOrthologs <- read.table("~/Ortholog information", sep = "\t", header = TRUE)
dataOrthologs <- orgOrthologs

# Paralog information
# Ensembl Gene ID, Ensembl Transcript ID, Organism Paralog Ensembl Gene ID,
# Homology Type
orgParalogs <- read.table("~/Paralog information", sep = "\t", header = TRUE)

# Developmental information (developmental data from Bgee)
orgDevelopment <- read.table("~/Developmental information", sep = "\t",
                             header = TRUE)

# PPI information
# The number of direct neighbors of genes in protein-protein network
orgConnectivity <- read.table("~/PPI information", sep = "\t", header = TRUE)

# Protein gene connection
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Protein ID
orgProtein <- read.table("~/Protein information", sep = "\t", header = TRUE)

# Phyletic ages of genes
orgPhyleticage <- read.table("~/Phyletic age information", sep = "\t",
                             header = TRUE)

# Essentiality of genes
# NOTE(review): the opening brace of this `if` was never closed in the
# original; it is assumed to guard only the essentiality table, because every
# later input is used unconditionally.
if (organism %in% c("Mus", "Hum")) {
  orgEssentiality <- read.table("~/Essentiality information", sep = "\t",
                                header = TRUE)
}

# GO annotation of genes
orgGO <- read.table("~/GO annotation information", sep = "\t", header = TRUE)

# Omega from Selectome
orgSelectome <- read.table("~/Evolutionary rate information", sep = "\t",
                           header = TRUE)

# Ensembl Gene ID, Ensembl Transcript ID, CDS Length, Exon Rank in Transcript,
# Exon Chr Start (bp), Exon Chr End (bp)
orgStructure <- read.table("~/Gene structure information", sep = ",",
                           header = TRUE)

## Tissue expression
# One entry per supported organism/data-set combination, so each data set can
# point to its own file.
expressionFiles <- c(
  Mus.ENCODE = "~/Expression information",
  Mus.Brawand = "~/Expression information",
  Mus.Bgee = "~/Expression information",
  Hum.Fagerberg = "~/Expression information",
  Hum.Brawand = "~/Expression information",
  Hum.Bgee = "~/Expression information"
)
orgExpression <- read.table(
  expressionFiles[[paste(organism, expDataSource, sep = ".")]],
  sep = "\t", header = TRUE
)
if (organism == "Hum" && expDataSource == "Fagerberg") {
  # Keep only the part of each column name before the first underscore.
  colnames(orgExpression) <- sub("_.*$", "", colnames(orgExpression))
}

#######################################
#Merge all the loaded data to one file#
#######################################
cat("\n Analysis is done for ", nTissues, " tissues.", sep = "")

cat("\n Overall ", nrow(orgOrthologs), " transcripts in orthologs data.",
    " Summary:", sep = "")
summary(orgOrthologs)

# Take only one-to-one orthologs
orgOrthologs <- orgOrthologs[grepl("one2one", orgOrthologs$Homology.Type), ]
cat("\n Overall ", nrow(orgOrthologs),
    " transcripts with one to one orthologs.", " Summary:", sep = "")
summary(orgOrthologs)

cat("\n Overall ", nrow(orgSelectome),
    " transcripts with one to one orthologs.", " Summary:", sep = "")
summary(orgSelectome)

cat("\n Overall ", nrow(orgParalogs), " transcripts in paralogs data.",
    " Summary:", sep = "")
summary(orgParalogs)

# Take only within-species paralogs and count them per gene
orgParalogs <- orgParalogs[
  grepl("within_species", orgParalogs$Homology.Type),
]
sumParalogs <- count(orgParalogs, "Ensembl.Gene.ID")
names(sumParalogs) <- c("Ensembl.Gene.ID", "Paralogs.Number")
orgParalogs <- orgParalogs[, c("Ensembl.Gene.ID", "Ensembl.Transcript.ID")]
orgParalogs <- merge(orgParalogs, sumParalogs, by = "Ensembl.Gene.ID",
                     all.x = TRUE, sort = FALSE)
# One row per gene: gene id and its number of within-species paralog records
orgParalogs <- unique(orgParalogs[, c("Ensembl.Gene.ID", "Paralogs.Number")])
cat("\n Overall ", nrow(orgParalogs),
    " transcripts with within species paralogs.", " Summary:", sep = "")
summary(orgParalogs)

summary(orgDevelopment)
summary(orgConnectivity)
# ---------------------------------------------------------------------------
# Helpers used further down in this section
# ---------------------------------------------------------------------------

# Add one "<prefix><tissue>" column per entry of `patterns`, holding the
# row-wise mean (NAs ignored) of all columns of `expression` whose name matches
# the regular expression for that tissue.
#   expression: data frame with one column per sample/replicate
#   patterns:   named character vector, name = tissue id, value = regex
# Returns `expression` with the averaged columns appended.
average_replicates <- function(expression, patterns,
                               prefix = "Averaged.RPKM.") {
  # Match against the sample columns only, not against the averaged columns
  # appended inside the loop.
  samples <- expression
  for (tissue in names(patterns)) {
    isReplicate <- grepl(patterns[[tissue]], colnames(samples))
    if (!any(isReplicate)) {
      stop("No expression column matches pattern \"", patterns[[tissue]],
           "\" (tissue \"", tissue, "\")", call. = FALSE)
    }
    # drop = FALSE keeps a data frame even for a single replicate, so
    # rowMeans() works for tissues with only one sample column.
    expression[[paste0(prefix, tissue)]] <-
      rowMeans(samples[, isReplicate, drop = FALSE], na.rm = TRUE)
  }
  expression
}

# Draw one density curve per column of `values` on a new device and copy the
# figure to `file`. Reads `my.col`, `tissuesNames` and `tissuesPrintNames` from
# the calling script. The device is left open, as before.
plot_tissue_densities <- function(values, xlab, file) {
  dev.new(height = 9, width = 12)
  par(cex.main = 0.95, bg = my.col[1], fg = my.col[2], col.axis = my.col[2],
      col.lab = my.col[2], col.main = my.col[2])
  palette(rev(rich.colors(length(tissuesNames) + 2)))
  plot(density(values[, 1], n = 1000),
       main = "Expression values among different tissues",
       xlab = xlab, col = 1, lwd = 3)
  # seq_len(...)[-1] is empty for a single tissue (2:1 would count down)
  for (i in seq_len(ncol(values))[-1]) {
    lines(density(values[, i], n = 1000), col = i, lwd = 3)
  }
  legend("topright", tissuesPrintNames, col = seq_len(ncol(values)),
         lty = "solid", lwd = 3)
  dev.copy2pdf(device = quartz, file = file, onefile = TRUE)
  # dev.off()
}

# ---------------------------------------------------------------------------
# Clean up the individual annotation tables
# ---------------------------------------------------------------------------

orgConnectivity <- orgConnectivity[, c("locus", "protein", "connectivity")]
colnames(orgConnectivity) <- c("Ensembl.Gene.ID", "Ensembl.Protein.ID",
                               "Connectivity")
orgConnectivity <- merge(orgConnectivity, orgProtein,
                         by = c("Ensembl.Gene.ID", "Ensembl.Protein.ID"),
                         all.x = TRUE, sort = FALSE)
# Rows with any missing value after the join are dropped
orgConnectivity <- na.omit(orgConnectivity)
orgConnectivity <- orgConnectivity[, c("Ensembl.Gene.ID",
                                       "Ensembl.Transcript.ID",
                                       "Connectivity")]
summary(orgConnectivity)

summary(orgPhyleticage)
orgPhyleticage <- orgPhyleticage[, c("locus", "phyleticage")]
colnames(orgPhyleticage) <- c("Ensembl.Gene.ID", "Phyletic.Age")
summary(orgPhyleticage)

summary(orgEssentiality)
# The id/essentiality columns are named differently per organism
essentialityColumns <- switch(
  organism,
  Mus = c("ens_mouse_id", "mouse_ontology_essentiality"),
  Hum = c("ens_human_id", "human_omim_desc_essentiality")
)
if (!is.null(essentialityColumns)) {
  orgEssentiality <- orgEssentiality[, essentialityColumns]
  colnames(orgEssentiality) <- c("Ensembl.Gene.ID", "Essentiality")
  # Recode "yes" / anything else as 1 / 0
  orgEssentiality$Essentiality <-
    ifelse(orgEssentiality$Essentiality == "yes", 1, 0)
}
summary(orgEssentiality)

summary(orgGO)
orgGO <- orgGO[, c("locus", "GOID", "GOTerm")]
colnames(orgGO) <- c("Ensembl.Gene.ID", "GO.ID", "GO.Term")
summary(orgGO)

cat("\n Overall ", nrow(orgGenes), " transcripts", " in ",
    length(unique(orgGenes$Ensembl.Gene.ID)), " genes for ", organismName,
    " (", dataSource, ").", " Summary:", sep = "")
summary(orgGenes)

# ---------------------------------------------------------------------------
# Collect all tables in one (left joins onto the gene/transcript list)
# ---------------------------------------------------------------------------
geneTranscriptKey <- c("Ensembl.Gene.ID", "Ensembl.Transcript.ID")

total <- merge(orgGenes, orgOrthologs, by = geneTranscriptKey,
               all.x = TRUE, sort = FALSE)
total <- merge(total, orgParalogs, by = "Ensembl.Gene.ID",
               all.x = TRUE, sort = FALSE)
# Genes without paralog records get a count of 0
total$Paralogs.Number[is.na(total$Paralogs.Number)] <- 0
total <- merge(total, orgConnectivity, by = geneTranscriptKey,
               all.x = TRUE, sort = FALSE)
total <- merge(total, orgPhyleticage, by = "Ensembl.Gene.ID",
               all.x = TRUE, sort = FALSE)
total <- merge(total, orgEssentiality, by = "Ensembl.Gene.ID",
               all.x = TRUE, sort = FALSE)
total <- merge(total, orgDevelopment, by = "Ensembl.Gene.ID",
               all.x = TRUE, sort = FALSE)
total <- merge(total, orgSelectome, by = geneTranscriptKey,
               all.x = TRUE, sort = FALSE)

# Add dN and dS to the table with gene names, and calculate Omega
cat("\n Omega is calculated as dN/dS if dS is > 0, otherweis Omega = 0",
    sep = "")
total$Omega <- ifelse(total$dS > 0, total$dN / total$dS, total$dS)

# ---------------------------------------------------------------------------
# Exon / intron structure per transcript
# ---------------------------------------------------------------------------
cat("\n Overall ", nrow(orgStructure), " exons for ", organismName, " (",
    dataSource, ").", " Summary:", sep = "")
summary(orgStructure)

structure <- orgStructure
# Length of each exon
structure$Exon.Length <- ifelse(
  structure$Exon.Chr.End..bp. > 0,
  structure$Exon.Chr.End..bp. - structure$Exon.Chr.Start..bp. + 1,
  structure$Exon.Chr.End..bp.
)

## Calculations for introns
# Total length of all exons in the transcript
exonLength <- setNames(
  aggregate(Exon.Length ~ Ensembl.Transcript.ID, FUN = sum, data = structure),
  c("Ensembl.Transcript.ID", "Exon.Total.Length")
)
# Number of exons in the transcript
exonNumber <- setNames(
  aggregate(Exon.Rank.in.Transcript ~ Ensembl.Transcript.ID, FUN = max,
            data = structure),
  c("Ensembl.Transcript.ID", "Exon.Number")
)
# Start of the first exon in the transcript
transcriptStart <- setNames(
  aggregate(Exon.Chr.Start..bp. ~ Ensembl.Transcript.ID, FUN = min,
            data = structure),
  c("Ensembl.Transcript.ID", "Transcript.Start")
)
# End of the last exon in the transcript
transcriptEnd <- setNames(
  aggregate(Exon.Chr.End..bp. ~ Ensembl.Transcript.ID, FUN = max,
            data = structure),
  c("Ensembl.Transcript.ID", "Transcript.End")
)
# Longest CDS for each gene
maxCDS <- setNames(
  aggregate(CDS.Length ~ Ensembl.Gene.ID, FUN = max, data = structure),
  c("Ensembl.Gene.ID", "Max.CDS.Length")
)

# Put all the calculated data into one table
for (perTranscript in list(exonLength, exonNumber, transcriptStart,
                           transcriptEnd)) {
  structure <- merge(structure, perTranscript, by = "Ensembl.Transcript.ID",
                     all.x = TRUE, sort = FALSE)
}
structure <- merge(structure, maxCDS, by = "Ensembl.Gene.ID",
                   all.x = TRUE, sort = FALSE)

# Length of the transcript (first exon start to last exon end)
structure$Transcript.Length <- ifelse(
  structure$Transcript.End > 0,
  structure$Transcript.End - structure$Transcript.Start + 1,
  structure$Transcript.End
)
# Number of introns
structure$Intron.Number <- ifelse(structure$Exon.Number > 0,
                                  structure$Exon.Number - 1,
                                  structure$Exon.Number)
# Total length of the introns
structure$Intron.Length <- ifelse(
  structure$Transcript.Length > 0,
  structure$Transcript.Length - structure$Exon.Total.Length,
  structure$Transcript.Length
)
# Intron.Length becomes the mean length of introns in the transcript
cat("\n Intron Length is the mean length of introns in the transcript.")
structure$Intron.Length <- ifelse(
  structure$Intron.Number > 0,
  structure$Intron.Length / structure$Intron.Number,
  structure$Intron.Number
)
# Flag whether the transcript has the longest CDS of its gene
structure$Max.CDS.Length <- structure$CDS.Length == structure$Max.CDS.Length

# Keep only the needed columns; there was one row per exon before, so the
# remaining rows contain many duplicates that are removed here.
structure <- unique(structure[, c("Ensembl.Gene.ID", "Ensembl.Transcript.ID",
                                  "CDS.Length", "Intron.Length",
                                  "Intron.Number", "Max.CDS.Length")])

# Merge the two tables (gene properties and gene structure)
total <- merge(total, structure, by = geneTranscriptKey, all.x = TRUE)
tempTotal <- total

# ---------------------------------------------------------------------------
# Choice of the transcript
# ---------------------------------------------------------------------------
# 1. Transcript with available Omega.0 (the longest transcript was used for
#    the calculation of Omega.0).
# 2. If the gene has no Omega.0, the longest transcript.
# 3. If several transcripts have the same CDS.Length, one for which
#    Connectivity is available.
# 4. If still several transcripts, the one with the longest introns.
cat("\n Longest transcript will be chousen.")

# Transcripts with available Omega.0 values
maxOmega <- setNames(
  aggregate(Omega.0 ~ Ensembl.Gene.ID, FUN = max, data = total),
  c("Ensembl.Gene.ID", "Max.Omega")
)
total <- merge(total, maxOmega, by = "Ensembl.Gene.ID", all.x = TRUE)
hasOmega <- !is.na(total$Max.Omega)
total1 <- total[hasOmega, ]  # Omega.0 available for at least one transcript
total2 <- total[!hasOmega, ] # no Omega.0 for any transcript of the gene
total1 <- total1[!is.na(total1$Omega.0), ]
total1 <- total1[, setdiff(names(total1), "Max.Omega")]
total2 <- total2[, setdiff(names(total2), "Max.Omega")]

# The longest transcript
total2 <- total2[which(total2$Max.CDS.Length), ]

# Connectivity data availability, if several transcripts have the same length
maxConnectivity <- setNames(
  aggregate(Connectivity ~ Ensembl.Gene.ID, FUN = max, data = total2),
  c("Ensembl.Gene.ID", "Max.Connectivity")
)
total2 <- merge(total2, maxConnectivity, by = "Ensembl.Gene.ID", all.x = TRUE)
hasConnectivity <- !is.na(total2$Max.Connectivity)
total21 <- total2[hasConnectivity, ]  # Connectivity for >= 1 transcript
total22 <- total2[!hasConnectivity, ] # no Connectivity for any transcript
total21 <- total21[!is.na(total21$Connectivity), ]
total2 <- rbind(total21, total22)

# Maximal intron length for the rest
maxIntron <- setNames(
  aggregate(Intron.Length ~ Ensembl.Gene.ID, FUN = max, data = total2),
  c("Ensembl.Gene.ID", "Max.Intron")
)
total2 <- merge(total2, maxIntron, by = "Ensembl.Gene.ID", all.x = TRUE)
# Rows with unknown intron length become all-NA rows here; they are removed
# by the Ensembl.Gene.ID filter below.
total2 <- total2[total2$Intron.Length == total2$Max.Intron, ]

# First remaining transcript for the rest (15 genes for Mouse)
temp <- split(seq_len(nrow(total2)), total2$Ensembl.Gene.ID)
temp2 <- vapply(temp, function(rows) rows[1], integer(1))
total2 <- total2[temp2, ]
total2 <- total2[!is.na(total2$Ensembl.Gene.ID), ]

# Remove the helper columns by name rather than by position
total2 <- total2[, setdiff(names(total2),
                           c("Max.Intron", "Max.Connectivity",
                             "Max.CDS.Length"))]
total1 <- total1[, setdiff(names(total1), "Max.CDS.Length")]
total <- rbind(total1, total2)

# ---------------------------------------------------------------------------
# Expression: per-tissue averages
# ---------------------------------------------------------------------------

# Bgee data were already normalized (log2), so bring them back to FPKM
if (expDataSource == "Bgee") {
  # Smallest strictly positive value of a vector
  fmin <- function(x) {
    x <- subset(x, x > 0)
    min(x, na.rm = TRUE)
  }
  exprValues <- 2^as.matrix(orgExpression[, -1])
  minExp <- min(apply(exprValues, 2, fmin))
  # Shift so that the smallest positive value becomes 0
  orgExpression[, -1] <- exprValues - minExp
}

# Choose used genes
cat("\n If many replicates for one organ, then the mean of expression is chosen.")
totalExpr <- total[, c("Ensembl.Gene.ID", "Ensembl.Transcript.ID")]

# Regular expression identifying the sample columns of each tissue, per data
# set. Names are the tissue ids used in "Averaged.RPKM.<tissue>".
replicatePatterns <- switch(
  expDataSource,
  Brawand = switch(
    organism,
    Hum = c(fcortex = "_frontal_cortex", pcortex = "prefrontal_cortex",
            tlobe = "temporal_lobe", cerebellum = "Cerebellum",
            heart = "Heart", kidney = "Kidney", liver = "Liver",
            testis = "Testis"),
    Mus = c(brain = "Brain", cerebellum = "Cerebellum", heart = "Heart",
            kidney = "Kidney", liver = "Liver", testis = "Testis")
  ),
  ENCODE = c(cerebellum = "Cbellum", cortex = "Cortex", heart = "Heart",
             kidney = "Kidney", liver = "Liver", lung = "Lung",
             placenta = "Plac", smintestine = "Smint", spleen = "Spleen",
             testis = "Testis", thymus = "Thymus", adrenal = "Adrenal",
             bladder = "Bladder", colon = "Colon", duodenum = "Duod",
             flobe = "Flobe", gfat = "Gfat", lgintestine = "Lgint",
             mamgland = "Mamg", ovary = "Ovary", sfat = "Sfat",
             stomach = "Stom"),
  Bgee = c(liver = "liver", kidney = "kidney", testis = "testis",
           blood = "blood", lung = "lung", colon = "colon",
           hcampus = "hippocampus", cortex = "cortex",
           placenta = "placenta", spleen = "spleen", ovary = "ovary",
           muscle = "muscle", salivary = "salivary", marrow = "marrow",
           skin = "skin", spinal = "spinal", thymus = "thymus",
           adrenal = "adrenal", hypothalamus = "hypothalamus",
           pituitary = "pituitary", duodenum = "duodenum",
           cerebellum = "cerebellum"),
  Fagerberg = c(colon = "colon", kidney = "kidney", liver = "liver",
                pancreas = "pancreas", lung = "lung", prostate = "prostate",
                brain = "brain", stomach = "stomach", spleen = "spleen",
                lymphnode = "lymphnode", appendix = "appendix",
                smint = "smallintestine", adrenal = "adrenal",
                duodenum = "duodenum", fat = "fat",
                endometrium = "endometrium", placenta = "placenta",
                testis = "testis", gbladder = "gallbladder",
                ubladder = "urinarybladde", thyroid = "thyroid",
                esophagus = "esophagus", heart = "heart", skin = "skin",
                ovary = "ovary", bonem = "bonem", sgland = "salivarygland")
)
if (is.null(replicatePatterns)) {
  stop("No replicate patterns defined for organism \"", organism,
       "\" with expDataSource \"", expDataSource, "\"", call. = FALSE)
}

# Expression is joined by gene id only (not by transcript id)
orgExpression <- merge(totalExpr, orgExpression, by = "Ensembl.Gene.ID",
                       all.x = TRUE, sort = FALSE, incomparables = NA)
orgExpression <- average_replicates(orgExpression, replicatePatterns)
orgExpression <- orgExpression[, c("Ensembl.Gene.ID", tissuesRPKMNames)]

# Columns holding the per-tissue expression values
expressionCols <- paste0("Averaged.RPKM.", tissuesNames[seq_len(nTissues)])

# ---------------------------------------------------------------------------
# Expression distributions
# ---------------------------------------------------------------------------

# Raw values on log2 scale
orgRPKM <- na.omit(orgExpression)
orgRPKM[, expressionCols] <- log2(as.matrix(orgRPKM[, expressionCols]))
plot_tissue_densities(
  orgRPKM[, tissuesRPKMNames, drop = FALSE],
  xlab = "RPKM",
  file = paste(folderAnalysis, organism, "TissuesOriginalExpressionRPKM",
               expDataSource, add, ".pdf", sep = "")
)

# Scaled by expNorm and offset by 1.0001 before log2, so zeros stay finite
orgRPKM <- na.omit(orgExpression)
x2 <- orgRPKM[, expressionCols]
x2 <- log2(x2 * expNorm + 1.0001)
orgRPKM[, expressionCols] <- data.frame(x2)
plot_tissue_densities(
  orgRPKM[, tissuesRPKMNames, drop = FALSE],
  xlab = "Normalized expression",
  file = paste(folderAnalysis, organism, "TissuesOriginalExpressionRPKM",
               expDataSource, "Extended", add, ".pdf", sep = "")
)

# Quantile normalization
orgExpression <- na.omit(orgExpression)
x <- orgExpression[, expressionCols]
x <- log2(x * expNorm)
# Scaled values below 1 (negative log2) are set to -Inf before normalization
x[x < 0] <- -Inf
x_m <- as.matrix(x)
x <- normalize.quantiles(x_m)
# ... and entries that are still -Inf afterwards get a small positive value
x[x == -Inf] <- log2(1.0001)
orgExpression[, expressionCols] <- data.frame(x)
plot_tissue_densities(
  x,
  xlab = "Quantile normalized expression",
  file = paste(folderAnalysis, organism, "OriginalExpressionRPKM",
               expDataSource, "ExtendedQN", add, ".pdf", sep = "")
)

# ---------------------------------------------------------------------------
# Per-gene summary statistics over tissues
# ---------------------------------------------------------------------------

cat("\n Mean is calculated taking in account tissues with 0 expression. 2+0+4=2",
    sep = "")
# Mean over tissues, ignoring NA; NA if every value is NA
fmean <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  mean(x, na.rm = TRUE)
}
orgExpression$Mean.Expression <-
  apply(orgExpression[, expressionCols], 1, fmean)

cat("\n Median is calculated taking in account tissues with 0 expression. 2+0+4=2",
    sep = "")
# Median over tissues, ignoring NA; NA if every value is NA
fmedian <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  median(x, na.rm = TRUE)
}
orgExpression$Median.Expression <-
  apply(orgExpression[, expressionCols], 1, fmedian)

# Maximal expression value over all tissues; NA if every value is NA
fmax <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  max(x, na.rm = TRUE)
}
orgExpression$Max.Expression <-
  apply(orgExpression[, expressionCols], 1, fmax)

# Tissue specificity index Tau: xi = xi / max(xi), tau = sum(1 - xi) / (n - 1)
#   x: the n per-tissue expression values followed by their maximum as the
#      LAST element (as supplied by the apply() call below).
# Returns NA if every element is NA.
# The divisor used to be length(x) - 1, which equals n (not n - 1) because of
# the appended maximum; a gene expressed in a single tissue then got
# (n - 1) / n instead of 1.
ftau <- function(x) {
  if (all(is.na(x))) {
    return(NA)
  }
  nValues <- length(x) - 1
  scaled <- 1 - x[seq_len(nValues)] / x[length(x)]
  sum(scaled, na.rm = TRUE) / (nValues - 1)
}
orgExpression$Tau <-
  apply(orgExpression[, c(expressionCols, "Max.Expression")], 1, ftau)

cat("\n Expression data are avalable for: ", nrow(orgExpression), " genes.",
    sep = "")
cat("\n Summary of expression data after normalisation and calculating tau: ")

### Output
# NOTE: the following call continues past the end of the reviewed chunk and is
# therefore reproduced unchanged.
write.table(orgExpression, file=paste(folderAnalysis, organism, "Expression", expDataSource, add,".txt",sep=""),row.names 
= FALSE, col.names=TRUE, quote = FALSE) summary(orgExpression) totalTemp1 <- total write.table(total, file=paste(folderAnalysis, organism, "TotalSum", expDataSource,"Parameters", add, ".txt",sep=""), row.names = FALSE, col.names=TRUE, quote = FALSE) # if(expDataSource != "ENCODE") # { #Add calculated expression values to the main table total <- merge(total,orgExpression,by=c("Ensembl.Gene.ID"), all.x=TRUE, incomparables = NA, sort=FALSE) # } else if (expDataSource == "ENCODE") { # total <- merge(total,orgExpression,by=c("Ensembl.Gene.ID","Ensembl.Transcript.ID"), all.x=TRUE, incomparables = NA, sort=FALSE) # totalTemp2 <- total # #The most expressed transcript will be chosen # total <- total[!is.na(total$CDS.Length),] # #Omega 0 is assumed for gene not for individual transcript # maxOmega <- aggregate(Omega.0 ~ Ensembl.Gene.ID, FUN="max", data=total) # names(maxOmega) <- c("Ensembl.Gene.ID","Omega.0") # total <- total[,-which(names(total) %in% c("Omega.0"))] # total <- merge(total, maxOmega, by="Ensembl.Gene.ID", all.x=TRUE) # maxLRT <- aggregate(LRT ~ Ensembl.Gene.ID, FUN="max", data=total) # names(maxLRT) <- c("Ensembl.Gene.ID","LRT") # total <- total[,-which(names(total) %in% c("LRT"))] # total <- merge(total, maxLRT, by="Ensembl.Gene.ID", all.x=TRUE) # maxP1 <- aggregate(P.1 ~ Ensembl.Gene.ID, FUN="max", data=total) # names(maxP1) <- c("Ensembl.Gene.ID","P.1") # total <- total[,-which(names(total) %in% c("P.1"))] # total <- merge(total, maxP1, by="Ensembl.Gene.ID", all.x=TRUE) # maxPS <- aggregate(Positive.Selection ~ Ensembl.Gene.ID, FUN="max", data=total) # names(maxPS) <- c("Ensembl.Gene.ID","Positive.Selection") # total <- total[,-which(names(total) %in% c("Positive.Selection"))] # total <- merge(total, maxPS, by="Ensembl.Gene.ID", all.x=TRUE) # #Choice of the transcript with maximal median expression # maxMedianE <- aggregate(Median.Expression ~ Ensembl.Gene.ID, FUN="max", data=total) # names(maxMedianE) <- c("Ensembl.Gene.ID","Max.Median.Expression") # 
total <- merge(total, maxMedianE, by="Ensembl.Gene.ID", all.x=TRUE) # total$Max.Median.Expression <- ifelse(total$Median.Expression == total$Max.Median.Expression,TRUE,FALSE) # total <- subset(total,Max.Median.Expression == TRUE) # #Choise of transcript with maximal maximal expression # maxMaxE <- aggregate(Max.Expression ~ Ensembl.Gene.ID, FUN="max", data=total) # names(maxMaxE) <- c("Ensembl.Gene.ID","Max.Max.Expression") # total <- merge(total, maxMaxE, by="Ensembl.Gene.ID", all.x=TRUE) # total$Max.Max.Expression <- ifelse(total$Max.Expression == total$Max.Max.Expression,TRUE,FALSE) # total <- subset(total,Max.Max.Expression == TRUE) # #For the rest, the longest transcript is chousen # minIntron <- aggregate(Intron.Length ~ Ensembl.Gene.ID, FUN="min", data=total) # names(minIntron) <- c("Ensembl.Gene.ID","Min.Intron") # total <- merge(total, minIntron, by="Ensembl.Gene.ID", all.x=TRUE) # total$Min.Intron <- ifelse(total$Intron.Length == total$Min.Intron,TRUE,FALSE) # total <- subset(total,Min.Intron == TRUE) # temp <- split(1:nrow(total),total$Ensembl.Gene.ID) # temp2 <- sapply(temp,function(x){x <- x[1]}) # total <- total[temp2,] # total <- total[!is.na(total$Ensembl.Gene.ID),] # } #File to save the data to all the tissues totalTissues <- total[,c("Ensembl.Gene.ID", "Ensembl.Transcript.ID","CDS.Length", "Intron.Length", "Intron.Number", "Omega", "Omega.0", "LRT", "P.1", "Positive.Selection", "Max.Expression", "Mean.Expression", "Median.Expression", "Tau", "X..GC.content", "Paralogs.Number", "Stage.Number", "Stage.First", "Connectivity", "Phyletic.Age", "Essentiality", tissuesRPKMNames)] #Choose only usefull colums total <- total[,c("Ensembl.Gene.ID", "Ensembl.Transcript.ID", "X..GC.content", "Omega.0", "LRT", "P.1", "Positive.Selection", "CDS.Length", "Intron.Length", "Intron.Number", "Omega", "Max.Expression", "Mean.Expression", "Median.Expression", "Tau", "Paralogs.Number", "Stage.Number", "Stage.First", "Connectivity", "Phyletic.Age", "Essentiality")] 
cat("\n Overall ",nrow(total)," genes."," Summary:",sep="")
summary(total)
cat("\n Overall ",nrow(totalTissues)," genes for each tissue."," Summary:",sep="")
summary(totalTissues)
###Save the results
write.table(total,file=paste(folderAnalysis, organism, "Table", expDataSource, add, ".txt",sep=""),row.names = FALSE, col.names=TRUE, quote = FALSE)
write.table(totalTissues,file=paste(folderAnalysis, organism, "TableTissues", expDataSource, add,".txt",sep=""), row.names = FALSE, col.names=TRUE, quote = FALSE)

#######################
#######################
#######################

#######################################################
#Make partial correlation with glm model for Cytoscape#
#######################################################
#Load the data
# NOTE(review): the table is written above with the `add` suffix but read here
# without it; the two paths only agree while add == "" - confirm this is intended.
data <- read.table(paste(folderAnalysis,organism, "Table", expDataSource, ".txt",sep=""), header=TRUE)
add <- ""
#partial <- FALSE
#corMethod <- "pearson" # #"spearman"
cat("\n Summary of the data in the first step: ", sep="")
summary(data)

# ##Only essential human-mouse orthologs
# data <- data[data$Essentiality==0,]
# data <- data[!is.na(data$Essentiality),]
# ##Only specific genes
# data <- data[data$Tau>0.2,] #0.2 used to define tissue specific genes
# data <- data[!is.na(data$Tau),]
# geneData <- data.frame(H=runif(10000, 160, 190))
# #geneData$High <- runif(10000, 160, 190)
# geneData$LL <- geneData$H - 100 - runif(10000, 1, 3)
# geneData$RL <- geneData$H - 100 - runif(10000, 1, 3)
# geneData$RL <- geneData$LL - runif(10000, 1, 3)
# corMethod="spearman"
# partial <- TRUE
# ###Change Omega 0 to MI score from appris data base, only for human
# data2 <- read.table(paste(folder, "APPRIS_DB.csv",sep=""), header=TRUE, sep=";")
# data2 <- data2[,c("ENSEMBL", "MI.score")]
# colnames(data2) <- c("Ensembl.Gene.ID", "MI.Score")
# data2$MI.Score <- ifelse(as.numeric(as.character(data2$MI.Score))<0, NA, as.numeric(as.character(data2$MI.Score)))
# data <- merge(data, data2, by="Ensembl.Gene.ID", all.x=TRUE,sort=FALSE)
# ##Leave only selected columns
# data <- data[,c("MI.Score", "CDS.Length", "Intron.Length", "Intron.Number", "Median.Expression", "Max.Expression", "Tau", "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")]
# colnames(data) <- c("Omega", "CDS.Length", "Intron.Length", "Intron.Number", "Median.Expression", "Max.Expression", "Tau", "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")
# parametersNames <- c("MI score", "CDS length", "Intron \n length", "Intron \n number", "Median \n expression", "Maximal \n expression", "Tau", "%GC content", "Paralogs \n number", "Stage \n number", "Phyletic age")
# ###Only genes without Omega 0
# data <- data[is.na(data$Omega.0),]
# data <- data[,c("CDS.Length", "Intron.Length", "Intron.Number", "Median.Expression", "Max.Expression", "Tau", "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")] #"Connectivity"
# colnames(data) <- c("CDS.Length", "Intron.Length", "Intron.Number", "Median.Expression", "Max.Expression", "Tau", "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")#
# parametersNames <- c("CDS length", "Intron \n length", "Intron \n number", "Median \n expression", "Maximal \n expression", "Tau", "%GC content", "Paralogs \n number", "Stage \n number", "Phyletic age")

###All parameters
##Leave only needed columns
data <- data[,c("Omega.0", "LRT", "P.1","CDS.Length", "Intron.Length", "Intron.Number", "Median.Expression",
                "Max.Expression", "Tau", "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")]
colnames(data) <- c("Omega", "LRT", "P.1", "CDS.Length", "Intron.Length", "Intron.Number", "Median.Expression",
                    "Max.Expression", "Tau", "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")
# Node labels for Cytoscape, in the same order as the columns of `data`
parametersNames <- c("Omega", "LRT", "P 1", "CDS length", "Intron \n length", "Intron \n number", "Median \n expression",
                     "Maximal \n expression", "Tau", "%GC content", "Paralogs \n number", "Stage \n number", "Phyletic age")

cat("\n To calculate correlation ", corMethod, " correlation was used.",sep="")
if (partial) {
  cat("\n Partial correlation was performed.",sep="")
  part <- "Partial"
} else {
  cat("\n Normal correlation was performed.",sep="")
  part <- "Normal"
}

#Delete all genes with NA and not known parameters
data <- na.omit(data)
data <- data[data$Max.Expression>0.00015,] #Genes that are not expressed in any tissue
cat("\n All the genes with unknown parameters are removed from the analysis. ", nrow(data), " are left for the analysis.", " Summary: ", sep="")
summary(data)

geneData <- data
#Normalization of the data
# Offsets (smallest positive value, or 1) keep zeros finite under log2.
minOmega <- min(geneData$Omega[geneData$Omega>0])
geneData$Omega <- log2(geneData$Omega + minOmega)
minLength <- min(geneData$Intron.Length[geneData$Intron.Length>0])
geneData$Intron.Length <- log2(geneData$Intron.Length + minLength)
minP1 <- min(geneData$P.1[geneData$P.1>0])
geneData$P.1 <- sqrt(sqrt(geneData$P.1 + minP1))
geneData$Intron.Number <- log2(geneData$Intron.Number + 1)
geneData$Paralogs.Number <- log2(geneData$Paralogs.Number + 1)
geneData$Phyletic.Age <- geneData$Phyletic.Age + 1
minTau <- 1 - max(geneData$Tau)      # shift so that the largest Tau becomes 1
geneData$Tau <- log2(geneData$Tau + minTau)
geneData$LRT <- ifelse(geneData$LRT<0, 0, geneData$LRT)
geneData$LRT <- sqrt(sqrt(geneData$LRT))
geneData$CDS.Length <- log2(geneData$CDS.Length)
cat("\n The data are normalised. Log2 of all paremeters is taken, exept GC content.", sep="")
cat("Summary of the data after normalization:",sep="")
summary(geneData)

cat("\n Graphical representation of the data is saved in the file \"Parameters\".", sep="")
dev.new(height=12, width=18)
par(mfrow=c(6,4),cex.main=0.95, bg=my.col[1], fg=my.col[2], col.axis=my.col[2], col.lab=my.col[2], col.main=my.col[2])#
hist(data$Omega[which(data$Omega < quantile(data$Omega,0.95))], main=paste("Omega"),xlab="Omega",breaks=30)
hist(geneData$Omega, main=paste("log2(Omega)"),xlab="log2(Omega)", col=my.col[3],breaks=30)
hist(data$CDS.Length[which(data$CDS.Length < quantile(data$CDS.Length,0.95))], main=paste("CDS length"), xlab="CDS length",breaks=30)
hist(geneData$CDS.Length, main=paste("log2(CDS length)"), xlab="log2(CDS length)", col=my.col[3],breaks=30)
# NOTE(review): the next call is damaged in the file as received - text between
# a "<" and a ">" appears to have been stripped, taking several hist() calls
# with it. It is kept exactly as found; restore the original from version
# control before running this section.
hist(data$Intron.Number[which(data$Intron.Numberquantile(data$LRT,0.95))], main=paste("(LRT)^1/4"), xlab="(LRT)^1/4", col=my.col[3],breaks=30)
hist(data$Median.Expression, main=paste("Median Expression"), xlab="Median Expression",breaks=30)
hist(data$Max.Expression, main=paste("Max Expression"), xlab="Max Expression",breaks=30)
hist(data$X..GC.content, main=paste("%GC Content"), xlab="%GC Content",breaks=30)
hist(geneData$X..GC.content, main=paste("log2(X..GC.content)"), xlab="log2(X..GC.content)", col=my.col[3],breaks=50)
hist(data$Stage.Number, main=paste("Stage Number"), xlab="Stage Number",breaks=30)
hist(data$Phyletic.Age, main=paste("Phyletic Age"), xlab="Phyletic Age",breaks=30)
dev.copy2pdf(device=quartz, file=paste(folderAnalysis, organism,"Parameters", expDataSource, add,".pdf", sep=""),onefile=TRUE)#,paper="A4r"
#dev.off()

########
#Names of the variables used
variableNames <- colnames(geneData)

# Correlation test between two columns of `values`.
#   first, second - names of the two variables to correlate
#   controls      - names of the variables regressed out of both (partial only)
#   usePartial    - TRUE: correlate the residuals of glm(first ~ controls) and
#                   glm(second ~ controls); FALSE: correlate the raw columns
#   method        - passed on to cor.test()
# Returns the object produced by cor.test().
pairCorTest <- function(values, first, second, controls, usePartial, method) {
  if (!usePartial) {
    return(cor.test(values[,first], values[,second], method=method))
  }
  fitFirst <- glm(reformulate(controls, response=first), data=values, na.action=na.exclude)
  fitSecond <- glm(reformulate(controls, response=second), data=values, na.action=na.exclude)
  cor.test(resid(fitFirst), resid(fitSecond), method=method)
}

#Calculating correlations
# Every ordered pair (j, j2) is tested; the mirrored duplicates are removed
# below. stringsAsFactors=TRUE is explicit because the code further down reads
# levels(x$x1), which does not exist on character columns (the default from R 4.0).
corRows <- list()
for (j in variableNames) { #j is the name of variable for which the correlation is calculated
  variablesToUse <- variableNames[variableNames != j] #all other variables
  for (j2 in variablesToUse) {
    variablesToUse2 <- variablesToUse[variablesToUse != j2]
    ct <- pairCorTest(geneData, j, j2, variablesToUse2, partial, corMethod)
    corRows[[length(corRows)+1]] <- data.frame(x1=j, x2=j2, corValue=unname(ct$estimate), pValue=ct$p.value,
                                               significant=as.integer(ct$p.value < 0.0005), #Threshold for significance, corrected for 14 parameters, 91 correlations
                                               stringsAsFactors=TRUE)
  }
}
x <- do.call(rbind, corRows)
row.names(x) <- c(1:nrow(x))

# Keep only the first row of every mirrored pair (a,b)/(b,a). A row is dropped
# when its mirror appears earlier in the table. Unlike x[-rows,], this cannot
# empty the table when there is nothing to remove.
pairKey <- paste(x$x1, x$x2, sep="\r")
mirrorRow <- match(paste(x$x2, x$x1, sep="\r"), pairKey)
x <- x[is.na(mirrorRow) | mirrorRow > seq_len(nrow(x)),]

cat("\n Correlation table",sep="")
print(x, type="latex",file="")
cat("\n Result of the correlation is saved in \"Cor_Original\".",sep="")
write.table(x,file=paste(folderAnalysis, organism, part, corMethod, "Cor", expDataSource, "Original", add, ".txt", sep=""),row.names = FALSE,quote = FALSE)

# Sum of the squared significant correlations that involve Omega
xp <- x[x$x1=="Omega",]
xp <- xp[xp$significant >0,]
xp$var <- xp$corValue*xp$corValue
v <- sum(xp$var)*100
print(paste("The variance of Omega is explained to ",v, "% through used parameters",sep=""))

#Making the file for Cytoscape
x$abs <- abs(x$corValue)*20   # edge width
x$sign <- sign(x$corValue)    # edge colour
x <- x[,c("x1","x2","significant","abs","sign")] #Data to use with cytoscape
cat("\n Result of the correlation for Cytoscape representation is saved in \"Cor_List\".",sep="")
write.table(x, file=paste(folderAnalysis, organism, part, corMethod, "Cor", expDataSource,"List", add, ".txt",sep=""),row.names = FALSE,quote = FALSE)

###
#Draw graph in cytoscape
graphC <- x
cy <- CytoscapeConnection()
# initialize
g <- new ("graphNEL", edgemode = "undirected")
g <- initNodeAttribute (g, "nodeType", "char", "undefined")
g <- initNodeAttribute (g, "label", "char", "undefined")
g <- initEdgeAttribute (g, "edgeType", "char", "undefined")
g <- initEdgeAttribute (g, "significant", "char", "undefined")
g <- initEdgeAttribute (g, "sign", "char", "undefined")
#g <- initEdgeAttribute (g, "label", "char", "undefined")

#add nodes and edges
g <- addNode("info.node", g)
#g <- addNode("title.node", g)
# The factor levels of x1 list every variable in column order, including the
# last one, which no longer occurs in x1 after the duplicates were removed.
parameters <- unique(levels(graphC$x1))
for (p in parameters) {
  g <- addNode(p, g)
}
for (n in seq_len(nrow(graphC))) {
  g <- addEdge(as.character(graphC[n,1]), as.character(graphC[n,2]), g)
}

#add node and edge attributes
nodeData(g, "info.node", "label") = "Information I want"
#nodeData(g, "title.node", "label") = "Information I want manny majniofjfpwef ndiojfpewfjwe mofpewjkfoeqüwpfkjoweü meowpfjkoü"
for (p in parameters) {
  nodeData(g, p, "nodeType") = p
  nodeData(g, p, "label") = p
}
nodeData(g, parameters, "label") = parametersNames
for (n in seq_len(nrow(graphC))) {
  edgeData(g, as.character(graphC[n,1]), as.character(graphC[n,2]), "edgeType") = as.character(graphC[n,4])
  edgeData(g, as.character(graphC[n,1]), as.character(graphC[n,2]), "sign") = graphC[n,5]
  edgeData(g, as.character(graphC[n,1]), as.character(graphC[n,2]), "significant") = graphC[n,3]
  #edgeData(g, as.character(graphC[n,1]), as.character(graphC[n,2]), "label") = round(graphC[n,4]/20*graphC[n,5], digits=2)
}

#create a CytoscapeWindow, after first making sure that no prior window of the same name
cy <- CytoscapeConnection()
setDefaultBackgroundColor(cy, my.col[1])
window.title = 'Correlation'
if (window.title %in% as.character (getWindowList(cy))) deleteWindow (cy, window.title)
cw <- new.CytoscapeWindow (window.title, g)
# set window and network sizes
setWindowSize (cw, 1200, 1200)
fitContent (cw)
setZoom (cw, 0.9 * getZoom (cw))
#send graph to Cytoscape
displayGraph (cw)

#Set default settings for the graph
setDefaultEdgeColor(cw, my.col[2])
lockNodeDimensions(cw, FALSE)
setNodeShapeDirect(cw, parameters, "ellipse")
setNodeFontSizeDirect(cw, parameters, 10)
setNodeColorDirect(cw, parameters, my.col[7])
setNodeWidthDirect(cw, parameters, 85)
setNodeHeightDirect(cw, parameters, 40)
#Legend
setNodeShapeDirect(cw, "info.node", "rect")
setNodeFontSizeDirect(cw, "info.node", 10)
setNodeColorDirect(cw, "info.node", my.col[1])
setNodeWidthDirect(cw, "info.node", 135)
setNodeHeightDirect(cw, "info.node", 135)
setNodeImageDirect(cw,"info.node", "file:/Legend.png")
setNodeBorderColorDirect(cw, "info.node", my.col[1])
setNodeOpacityDirect(cw, "info.node", 0)

#ask Cytoscape to layout the graph
layoutNetwork (cw, 'attribute-circle')
#instruct Cytoscape to use each node's 'label' attribute as the value for the visible label it draws on the node
setNodeLabelRule (cw, 'label')
setEdgeLineWidthRule(cw, "edgeType", as.character(graphC$abs), as.numeric(graphC$abs))
setEdgeColorRule(cw, "sign", c("-1", "1"), c("blue", "red"), mode="lookup")
# Edges flagged as not significant are made fully transparent
setEdgeOpacityRule(cw, "significant", c("1", "0"), c("175", "0"), mode="lookup")
#setEdgeLabelRule(cw, "label")
# now ask Cytoscape to redraw the graph using these rules
redraw (cw)
#saveLayout(cw,'CorrelationLayout13')
#Manually change the order of parameters
restoreLayout(cw, 'CorrelationLayout13')
fitContent (cw)
cytoscapeImageFile <- paste(folderAnalysis, organism, part, corMethod, "Cor", expDataSource, "Cyt", add, ".png", sep="")
saveImage(cw, cytoscapeImageFile, 'png', 2.0)

##########################
##########################
##########################

########################################################
#Organism partial correlation with GLM model for circos#
########################################################
cat("\n Calculating partial correlation with glm model for each tissue separately.",sep="")

#Load the data
tissueTableFile <- paste(folderAnalysis, organism, "TableTissues", expDataSource,".txt",sep="")
dataOrg <- read.table(tissueTableFile, header=TRUE)
add <- ""
cat("\n Summary of the data (", nrow(dataOrg), " genes) in the first step: ", sep="")
summary(dataOrg)
# Drop genes whose maximal expression does not exceed the 0.00015 cut-off
expressedRows <- dataOrg$Max.Expression>0.00015
dataOrg <- dataOrg[expressedRows,]

# ###Only essential human-mouse orthologs
# dataOrg <- dataOrg[dataOrg$Essentiality==1,]
# dataOrg <- dataOrg[!is.na(dataOrg$Essentiality),]
# ###Only specific genes
# dataOrg <- dataOrg[dataOrg$Tau>0.2,]
# dataOrg <- dataOrg[!is.na(dataOrg$Tau),]
# summary(dataOrg)

cat("\n To calculate correlation ", corMethod, " correlation was used.",sep="")
if (!partial) {
  cat("Normal correlation was performed.",sep="")
  part <- "Normal"
} else {
  cat("Partial correlation was performed.",sep="")
  part <- "Partial"
}
cat("\n The analysis is done for ", length(tissuesRPKMNames), " tissues.", sep="")

###All parameters
#Leave only needed columns
sourceColumns <- c("Omega.0", "LRT", "P.1","CDS.Length", "Intron.Length", "Intron.Number",
                   "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")
dataOrg <- dataOrg[,c(sourceColumns, tissuesRPKMNames)]
# Same columns under the names used from here on ("Omega.0" becomes "Omega")
parameterNames <- c("Omega", "LRT", "P.1", "CDS.Length", "Intron.Length", "Intron.Number",
                    "X..GC.content", "Paralogs.Number", "Stage.Number", "Phyletic.Age")
colnames(dataOrg) <- c(parameterNames, tissuesRPKMNames)

geneDataOrg <- na.omit(dataOrg)
cat("\n All the genes with unknown parameters are removed from the analysis. ", nrow(geneDataOrg)," are left for the analysis.", " Summary: ", sep="")
summary(geneDataOrg)

# Transformations of the gene parameters; the offsets keep zeros finite
minOmega <- min(geneDataOrg$Omega[geneDataOrg$Omega>0])
geneDataOrg$Omega <- log2(geneDataOrg$Omega + minOmega)
minLength <- min(geneDataOrg$Intron.Length[geneDataOrg$Intron.Length>0])
geneDataOrg$Intron.Length <- log2(geneDataOrg$Intron.Length + minLength)
geneDataOrg$LRT <- sqrt(sqrt(ifelse(geneDataOrg$LRT<0, 0, geneDataOrg$LRT)))
minP1 <- min(geneDataOrg$P.1[geneDataOrg$P.1>0])
geneDataOrg$P.1 <- sqrt(sqrt(geneDataOrg$P.1 + minP1))
geneDataOrg$Intron.Number <- log2(geneDataOrg$Intron.Number + 1)
geneDataOrg$Paralogs.Number <- log2(geneDataOrg$Paralogs.Number + 1)
geneDataOrg$Phyletic.Age <- geneDataOrg$Phyletic.Age + 1
geneDataOrg$CDS.Length <- log2(geneDataOrg$CDS.Length)

cat("\n Graphical representation of the expression data is saved in the file \"TissuesExpression\".", sep="")
#Run separately in R, cannot draw from LaTeX
dev.new(height=9, width=12)
par(cex.main=0.95, bg=my.col[1], fg=my.col[2], col.axis=my.col[2], col.lab=my.col[2], col.main=my.col[2])
palette(rev(rich.colors(length(tissuesNames)+2)))
# First tissue opens the plot, the remaining tissues are added as lines
firstDensity <- density(geneDataOrg[,tissuesRPKMNames[1]],n=1000)
plot(firstDensity, main = "Expression values among different tissues",xlab="Normalized RPKM",col=1, lwd=3)
for (tissueIdx in 2:length(tissuesRPKMNames)) {
  tissueDensity <- density(geneDataOrg[,tissuesRPKMNames[tissueIdx]],n = 1000)
  lines(tissueDensity, col=tissueIdx, lwd=3)
}
legend("topright",tissuesPrintNames,col=1:length(tissuesRPKMNames),lty="solid", lwd=3)
dev.copy2pdf(device=quartz, file=paste(folderAnalysis, organism,"TissuesExpression", expDataSource, add,".pdf", sep=""),onefile=TRUE)#,paper="A4r"
#dev.off()
cat("\n Overall ",nrow(geneDataOrg)," genes were used for analysis."," Summary:",sep="")
summary(geneDataOrg)

###############
##Calculation correlation

# One row of the correlation table for the pair (first, second) of columns of
# geneDataOrg.
#   controls - names of the variables regressed out of both when `partial` is
#              TRUE (the residuals of the two glm fits are correlated);
#              ignored when `partial` is FALSE
# Uses the globals geneDataOrg, partial and corMethod. stringsAsFactors=TRUE is
# explicit because the code below relies on levels(x$x1), which does not exist
# on character columns (the default from R 4.0).
pairCorRowOrg <- function(first, second, controls) {
  if (partial==TRUE) {
    fitFirst <- glm(reformulate(controls, response=first), data=geneDataOrg, na.action=na.exclude)
    fitSecond <- glm(reformulate(controls, response=second), data=geneDataOrg, na.action=na.exclude)
    ct <- cor.test(resid(fitFirst), resid(fitSecond), method=corMethod)
  } else {
    ct <- cor.test(geneDataOrg[,first], geneDataOrg[,second], method=corMethod)
  }
  data.frame(x1=first, x2=second, corValue=unname(ct$estimate), pValue=ct$p.value,
             significant=as.integer(ct$p.value < 0.0005), stringsAsFactors=TRUE)
}

corRows <- list()
# 1) every tissue against every parameter, controlling for the other parameters
variableNames <- tissuesRPKMNames
for (j in variableNames) { #j is the name of variable for which the correlation is calculated
  variablesToUse <- parameterNames #Names of other variables
  for (j2 in variablesToUse) {
    variablesToUse2 <- variablesToUse[variablesToUse != j2]
    corRows[[length(corRows)+1]] <- pairCorRowOrg(j, j2, variablesToUse2)
  }
}
# 2) every parameter against every other parameter, controlling for the rest
variableNames <- parameterNames
for (j in variableNames) {
  variablesToUse <- variableNames[variableNames != j] #Names of other variables
  for (j2 in variablesToUse) {
    variablesToUse2 <- variablesToUse[variablesToUse != j2]
    corRows[[length(corRows)+1]] <- pairCorRowOrg(j, j2, variablesToUse2)
  }
}
x <- do.call(rbind, corRows)
row.names(x) <- c(1:nrow(x))
# Give x2 the same level set as x1 (tissues followed by parameters)
x$x2 <- factor(x$x2,levels=levels(x$x1))

# Keep only the first row of every mirrored pair (a,b)/(b,a); tissue rows have
# no mirror and are always kept. Unlike x[-rows,], this cannot empty the table
# when there is nothing to remove.
pairKey <- paste(x$x1, x$x2, sep="\r")
mirrorRow <- match(paste(x$x2, x$x1, sep="\r"), pairKey)
x <- x[is.na(mirrorRow) | mirrorRow > seq_len(nrow(x)),]

cat("\n Correlation table",sep="")
print(x, type="latex",file="",append=FALSE)
cat("\n Result of the correlation is saved in \"CorTissues_Original\".",sep="")
write.table(x,file=paste(folderAnalysis, organism, part, corMethod, "CorTissues", expDataSource, "Original", add,".txt",sep=""),row.names = FALSE,quote = FALSE)

##############################
nTissues <- length(tissuesNames)
correctionTerm <- (length(parameterNames)-1)*nTissues
cat("\n Expression data are sorted according to correlation with Omega.",sep="")
#Sorting expression parameters according to correlation with Omega
# regexpr() gives 1 for a name starting with "Averaged.RPKM." and -1 otherwise,
# so a sum of 0 selects rows where exactly one of x1/x2 is an expression column
x.exp <- x[(regexpr("Averaged.RPKM.",x$x1)+regexpr("Averaged.RPKM.",x$x2))==0,]#data frame with expression correlations
x.exp$abs <- abs(x.exp$corValue)
x.exp <- x.exp[with(x.exp, order(x.exp$x2, x.exp$corValue, decreasing=TRUE)),]
row.names(x.exp) <- c(1:nrow(x.exp))
#x.exp$exp.order <- row.names(x.exp)
# Rank the tissues by their correlation with Omega and build the tables that
# are written out for Circos (correlation list, karyotype, expression bands).
# x.exp arrives here sorted by x2 and corValue (both decreasing).
nRow <- nrow(x.exp)
x.exp <- x.exp[with(x.exp,x.exp$x2=="Omega"),]#Parameter used for sorting expression data
# Number the Omega rows nRow-nTissues+1 .. nRow.
# NOTE(review): assumes exactly nTissues rows have x2 == "Omega" - confirm.
x.exp$exp.order <- c((nRow-nTissues+1):nRow)
#x.exp <- x.exp[(nrow(x.exp)-nTissues+1):nrow(x.exp),]
x.exp <- x.exp[,c("x1","exp.order")]
# Attach the order number of each tissue to every row of the correlation table
x2 <- merge(x,x.exp,by=c("x1"), all.x=TRUE, sort=FALSE)
for(i in c(1:nrow(x2)))
{
  if(regexpr("Averaged.RPKM.",x2$x1[i])>0) #If it is expression in the first column
  {
    # Rebuild the name as "Averaged.RPKM.<order>.<tissue>", with the order
    # number zero-padded to two digits so that names sort by it
    resA <- lapply(strsplit(as.character(x2$x1[i]), split=".", fixed=TRUE),function(x){x[1]})
    resR <- lapply(strsplit(as.character(x2$x1[i]), split=".", fixed=TRUE),function(x){x[2]})
    resN <- ifelse(as.integer(x2$exp.order[i])<10, paste("0",x2$exp.order[i],sep=""), x2$exp.order[i])
    resT <- lapply(strsplit(as.character(x2$x1[i]), split=".", fixed=TRUE),function(x){x[3]})
    x2$x1a[i] <- paste(resA,".",resR,".",resN,".",resT,sep="")
  }
}
x2$x1 <- x2$x1a
x2 <- x2[,c("x1","x2","corValue")]
# Bring the renamed tissue names back into the full table.
# NOTE(review): the join key is (x2, corValue), so it presumably relies on
# corValue being unique within each x2 - verify.
x3 <- merge(x,x2,by=c("x2","corValue"), all.x=TRUE, sort=FALSE)
for(i in c(1:nrow(x3)))
{
  if(regexpr("Averaged.RPKM.",x3$x1.x[i])<0) #If it is not an expression in the first column
  {
    # Non-expression rows keep their original name
    x3$x1.y[i] <- as.character(x3$x1.x[i])
  }
}
x3 <- x3[,c("x1.y","x2","corValue","pValue","significant")]
names(x3) <- c("x1","x2","corValue","pValue","significant")
x <- x3
x <- x[with(x, order(x$x1, decreasing=TRUE)),]
# Collect every name that occurs in x1, plus "Omega", as the level set of x1
v.temp <- data.frame(values=NULL)
v.temp <- rbind(v.temp,data.frame(v=x$x1))
v.temp <- rbind(v.temp,data.frame(v="Omega"))
variables <- unique(v.temp)#All variables occurring in x1, plus Omega
variables <- variables[with(variables, order(variables$v)),]
# NOTE(review): levels(variables) is only non-NULL when `variables` is a
# factor, i.e. when data.frame() converts strings to factors - confirm the R
# version this is run with.
x$x1 <- factor(x$x1,levels=levels(variables))
#Finish sorting
xtemp1 <- x   # copy with corValue and pValue, restored after the list is written
x$abs <- abs(x$corValue) # was 20 for cytoscape
x$sign <- sign(x$corValue)
x <- x[,c("x1","x2","significant","abs","sign")] #Table with correlation data
cat("\n Result is saved in \"CorTissues_List\". ",sep="")
write.table(x,file=paste(folderAnalysis, organism, part, corMethod, "CorTissues", expDataSource, "List", add,".txt",sep=""),row.names = FALSE,quote = FALSE)

x <- xtemp1
x <- x[with(x, order(x$x1, decreasing=TRUE)),]
row.names(x) <- c(1:nrow(x))
v.temp <- data.frame(values=NULL)
# Give every row an `order` value taken from the order number embedded in the
# renamed tissue name; rows without an expression variable get 0
for(i in c(1:nrow(x)))
{
  if(regexpr("Averaged.RPKM.",x$x1[i])>0) #If it is expression in the first column
  {
    res <- lapply(strsplit(as.character(x$x1[i]), split=".", fixed=TRUE),function(x){x[3]})
    x$order[i] <- as.integer(res)-1-correctionTerm
    v.temp <- rbind(v.temp,data.frame(variables=x$x1[i]))
  } else if(regexpr("Averaged.RPKM.",x$x2[i])>0) #If it is expression in the second column
  {
    # NOTE(review): this branch tests x2 but splits x1 - looks like it was
    # meant to split x$x2[i]; confirm before relying on it.
    res <- lapply(strsplit(as.character(x$x1[i]), split=".", fixed=TRUE),function(x){x[3]})
    x$order[i] <- as.integer(res)-1
    v.temp <- rbind(v.temp,data.frame(variables=x$x2[i]))
  } else
  {
    x$order[i] <- 0
  }
}

######CORRELATIONS HERE
x$abs <- as.integer(abs(x$corValue)*1000) #Correlation strength, width of the lines
x$sgn <- paste("color=c", sign(x$corValue)+1,x$significant,sep="") #Sign of the correlation (0, 1 or 2) followed by the significance flag, e.g. "color=c11"; names the colour
variables <- levels(x$x1) #All variables used for correlations (overwritten below)
v.temp <- data.frame(values=NULL)
v.temp <- rbind(v.temp,data.frame(v=x$x1))
v.temp <- rbind(v.temp,data.frame(v=x$x2))
variables <- unique(v.temp)#All variables occurring in x1 or x2
variables <- variables[with(variables, order(variables$v)),]
# Coordinate columns, initialised to 0 here
x$x1x <- 0
x$x1y <- 0
x$x2x <- 0
x$x2y <- 0
x.exp <- x[(regexpr("Averaged.RPKM.",x$x1)+regexpr("Averaged.RPKM.",x$x2))==0,]#data frame with expression correlations
x.ne <- x[(regexpr("Averaged.RPKM.",x$x1)+regexpr("Averaged.RPKM.",x$x2))<0,]#data frame with other correlations
#Names of the expression and the non-expression variables
v.temp <- data.frame(values=NULL)
v.temp <- rbind(v.temp,data.frame(variables=x.exp$x1))
variables.exp <- unique(v.temp)#All expression variables
variables.exp <- variables.exp[with(variables.exp, order(variables.exp$variables)),]
variables.ne <- parameterNames

v <- vector(mode="numeric") #Vector with the length for each segment, used later in gaps
#Calculating the maximum needed width over all segments and the length of each segment
maxL=0
for(j in variables)
{
  s <- (sum(x[x$x1==j,]$abs)+sum(x[x$x2==j,]$abs))*2 #Segment should be twice as long as width of all correlations
  if(maxL < s)
  {
    maxL <- s
  }
  v <- append(v, s)
}
#Calculating the maximum of expression segments and the length for each expression segment
v.exp <- vector(mode="numeric") #used later in bands, contains width of each expression segment
maxL.exp=0
for(j in variables.exp)
{
  s <- (sum(x.exp[x.exp$x1==j,]$abs)+sum(x.exp[x.exp$x2==j,]$abs))*4 #was 2 before, now 4 for Poster
  if(maxL.exp < s)
  {
    maxL.exp <- s
  }
  v.exp <- append(v.exp, s)
}

#Table for Karyotype file
kar <- data.frame(variables=variables.ne, maxL)#Not expression variables and segment length. data.frame with "variables" and "maxL"
kar <- rbind(kar,data.frame(variables="Expression",maxL=maxL.exp*length(v.exp))) #Length of the big expression segment: max expression length * number of expression segments
# NOTE(review): the line break between the comment above and this assignment
# was lost in the file as received; "gaps" is read as the assignment target.
gaps <- data.frame(variables, maxL, v)#All parameters with maximal length (maxL) and own length (v)
kar$name <- paste ("chr","\t","-","\t", kar$variables,sep="")#Prefix "chr - " to the variables #Makes an error in LaTeX
kar$x <- 0 #Adding x column with 0, starting points
kar$color <- "chr1" #Adding color parameter
kar <- kar[, c("name", "variables", "x", "maxL", "color")] #putting columns in the right order
kar <- kar[with(kar, order(kar$variables, decreasing=TRUE)),] #putting rows in the right order

#Calculating the exact coordinates for each band of expression
band <- data.frame(variables=NULL, start=NULL, end=NULL)
a.p = 0   # running end position of the previous band
for(i in c(1:length(v.exp)))
{
  band <- rbind(band,data.frame(variables=variables.exp[i],start=a.p+1,end=a.p + maxL.exp))#calculate the coordinates for each segment of expression
  a.p=a.p + maxL.exp
}
band$start[1]=0   # the first band starts at 0 instead of 1
band$c <- paste("band","\t","Expression","\t",band$variables,"\t",band$variables,sep="") #data.frame with variables, start and end point and c("band Expression"+variable names)
band$color <- "chr2" #color column

#Names of parameters for labeling the graph
labels <- data.frame(name="Expression",start=band$start, end=band$end) #data.frame with "Expression" and start and end of each segment
#Creating labels for each segment
for(i in c(1:nrow(labels)))
{
  # Fourth dot-separated field of the renamed variable, i.e. the tissue name
  res <- lapply(strsplit(as.character(band$variables[i]), split=".", fixed=TRUE),function(x){x[4]})
  labels$label[i] <- as.character(res)
}

###########ONTOLOGY###############
##divers >>#chr2
##gastrointestinal system >>#chr4
##central nervous system >>#chr3
##reproductive system >>#chr5
#salivary(salivary gland) > oral region > gastrointestinal system > visceral organ >>#chr4 #sgland == salivary
#stomach > gastrointestinal system > visceral organ >> #chr4
#duodenum > intestine > gastrointestinal system (first section of the small intestine) > visceral organ >>#chr4
#smintestine(small intestine) > intestine > gastrointestinal system > visceral organ >>#chr4
# lgintestine (large intestine) > intestine > gastrointestinal system > visceral organ >> chr4
# colon > large intestine > intestine > gastrointestinal system > visceral organ >> chr4
# appendix > large intestine > intestine > gastrointestinal system > visceral organ >> chr4
# esophagus > gastrointestinal system > visceral organ >> chr4
# cerebellum > forebrain > brain > central nervous system >> chr3
# flobe (frontal lobe) > central nervous system >> chr3
# tlobe (temporal lobe) > central nervous system >> chr3
# cortex > forebrain > brain > central nervous system >> chr3
# fcortex (frontal cortex) > cortex > forebrain > brain > central nervous system >> chr3
# pcortex (prefrontal cortex) > cortex > forebrain > brain > central nervous system >> chr3
# brain > central nervous system >> chr3
# hcampus (hippocampus) > telencephalon > forebrain > brain > central nervous system >> chr3
# spinal (spinal cord) > central nervous system >> chr3
# hypothalamus > diencephalon > forebrain > brain > central nervous system >> chr3
# pituitary (pituitary gland) > diencephalon gland > diencephalon > forebrain > brain > central nervous system >> chr3
# gfat (genital adipose tissue) > seminal vesicle > male reproductive system > reproductive system >> chr5
# ovary > reproductive system >> chr5
# placenta > reproductive system > visceral organ >> chr5
# testis > reproductive system > visceral organ >> chr5
# prostate > reproductive system > visceral organ >> chr5
# endometrium > uterus > reproductive system > visceral organ >> chr5
# uterus > reproductive system > visceral organ >> chr5
# thymus > haemolymphoid system >> chr2
# spleen > haemolymphoid system >> chr2
# lymphnode > haemolymphoid system >> chr2
# adrenal > endocrine system >> chr2
# pancreas > endocrine system >> chr2
# kidney > renal-urinal system > visceral organ >> chr2
# bladder > renal-urinal system > visceral organ >> chr2
# ubladder == bladder
# lung > respiratory system > visceral organ >> chr2
# liver > liver and biliary system > visceral organ >> chr2
# gbladder (gallbladder) > liver and biliary system > visceral organ >> chr2
# heart > cardiovascular system >> chr2
# blood > cardiovascular system >> chr2
# mamgland (mammary gland) > integumental system gland > integumental system >> chr2
# skin > integumental system >> chr2
# sfat (subcutaneous fat pad / subcutaneous adipose tissue) > adipose tissue >> chr2
# fat == sfat
# muscle >> chr2
# marrow (bone marrow) > bone > skeletal system >> chr2
# bonem == marrow
# thyroid > foregut > gut > alimentary system > visceral organ >> chr2

# Colour and print label per tissue. Rules are tried top to bottom and the
# first one that matches a band name wins, so the row order is significant.
# `pattern` is a regular expression searched in the band name; a rule with a
# non-empty `unless` only applies when that second pattern is absent.
tissue_rules <- as.data.frame(
  matrix(
    c(
      # pattern        unless      color   label
      "flobe",         "",         "chr3", "Frontal Lobe",
      "cerebellum",    "",         "chr3", "Cerebellum",
      "brain",         "",         "chr3", "Brain",
      "fcortex",       "",         "chr3", "Frontal Cortex",
      "pcortex",       "",         "chr3", "Prefrontal Cortex",
      "tlobe",         "",         "chr3", "Temporal Lobe",
      ".cortex",       "",         "chr3", "Cortex",
      "hcampus",       "",         "chr3", "Hippocampus",
      "spinal",        "",         "chr3", "Spinal Cord",
      "hypothalamus",  "",         "chr3", "Hypothalamus",
      "pituitary",     "",         "chr3", "Pituitary Gland",
      "colon",         "",         "chr4", "Colon",
      "stomach",       "",         "chr4", "Stomach",
      "smintestine",   "",         "chr4", "Small Intestine",
      "smint",         "",         "chr4", "Small Intestine",
      "duodenum",      "",         "chr4", "Duodenum",
      "lgintestine",   "",         "chr4", "Large Intestine",
      "salivary",      "",         "chr4", "Salivary Gland",
      "sgland",        "",         "chr4", "Salivary Gland",
      "appendix",      "",         "chr4", "Appendix",
      "esophagus",     "",         "chr4", "Esophagus",
      "ovary",         "",         "chr5", "Ovary",
      "gfat",          "",         "chr5", "Genital Fat Pad",
      "testis",        "",         "chr5", "Testis",
      "placenta",      "",         "chr5", "Placenta",
      "prostate",      "",         "chr5", "Prostate",
      "endometrium",   "",         "chr5", "Endometrium",
      "uterus",        "",         "chr5", "Uterus",
      "thymus",        "",         "chr2", "Thymus",
      "sfat",          "",         "chr2", "Fat Pad",
      "fat",           "gfat",     "chr2", "Fat Pad",
      "bladder",       "gbladder", "chr2", "Bladder",
      "ubladder",      "",         "chr2", "Bladder",
      "gbladder",      "",         "chr2", "Gallbladder",
      "mamgland",      "",         "chr2", "Mammary Gland",
      "skin",          "",         "chr2", "Skin",
      "lung",          "",         "chr2", "Lung",
      "adrenal",       "",         "chr2", "Adrenal",
      "heart",         "",         "chr2", "Heart",
      "blood",         "",         "chr2", "Blood",
      "kidney",        "",         "chr2", "Kidney",
      "spleen",        "",         "chr2", "Spleen",
      "liver",         "",         "chr2", "Liver",
      "muscle",        "",         "chr2", "Muscle",
      "marrow",        "",         "chr2", "Bone Marrow",
      "bonem",         "",         "chr2", "Bone Marrow",
      "pancreas",      "",         "chr2", "Pancreas",
      "thyroid",       "",         "chr2", "Thyroid",
      "lymphnode",     "",         "chr2", "Lymph Node"
    ),
    ncol = 4, byrow = TRUE,
    dimnames = list(NULL, c("pattern", "unless", "color", "label"))
  ),
  stringsAsFactors = FALSE
)

band_names <- as.character(band$variables)
unassigned <- rep(TRUE, length(band_names))
for (r in seq_len(nrow(tissue_rules))) {
  matched <- regexpr(tissue_rules$pattern[r], band_names) > 0
  if (nzchar(tissue_rules$unless[r])) {
    matched <- matched & regexpr(tissue_rules$unless[r], band_names) < 0
  }
  hit <- which(matched & unassigned)
  band$color[hit] <- tissue_rules$color[r]
  labels$label[hit] <- tissue_rules$label[r]
  unassigned[hit] <- FALSE # later rules must not override an earlier match
}
########### END ###### ontology ############

band <- band[, c("c", "start", "end", "color")] # columns in output order

# ---- Data to use with circos ----
# Output file name for one of the circos inputs ("Karyotype", "Labels", "Links").
circos_file <- function(kind) {
  paste(folderAnalysis, organism, part, corMethod, "Cor", expDataSource, kind,
        add, ".txt", sep = "")
}

cat("\n Correlation result for representation in circos are saved in \"Karyotype\", \"Labels\" and \"Links\".", sep = "")
# Karyotype file: segment rows first, band rows appended to the same file.
write.table(kar, file = circos_file("Karyotype"), row.names = FALSE,
            col.names = FALSE, quote = FALSE, sep = "\t")
write.table(band, file = circos_file("Karyotype"), row.names = FALSE,
            col.names = FALSE, quote = FALSE, append = TRUE, sep = "\t")
write.table(labels, file = circos_file("Labels"), row.names = FALSE,
            col.names = FALSE, quote = FALSE, sep = "\t")

# Expression segments all get the maximal expression width.
gaps$maxL[which(regexpr("Averaged.RPKM.", gaps$variables) > 0)] <- maxL.exp

temp_x <- x

# ---- Exact positions of the links inside each segment ----
# For every segment the rows of x are walked in order and a running position n
# is advanced: half the link width plus the spacing m, then the link itself
# (start and end are recorded), then the same padding again.
for (k in seq_len(nrow(gaps))) {
  segment <- as.character(gaps$variables[k])
  free_length <- gaps$maxL[k] - gaps$v[k]
  if (regexpr("Averaged.RPKM.", gaps$variables[k]) > 0) {
    # expression segment; m is the spacing between correlations
    m <- free_length / (2 * length(parameterNames))
  } else {
    # non-expression segment
    m <- free_length / (2 * (nTissues + length(parameterNames) - 1))
  }
  n <- 0
  for (i in seq_len(nrow(x))) {
    if (as.character(x$x1[i]) == segment) {
      n <- n + x$abs[i] / 2 + m # spacing
      x$x1x[i] <- n             # beginning of the link
      n <- n + x$abs[i]
      x$x1y[i] <- n             # end of the link
      n <- n + x$abs[i] / 2 + m # spacing
    }
    if (as.character(x$x2[i]) == segment) {
      n <- n + x$abs[i] / 2 + m
      x$x2x[i] <- n
      n <- n + x$abs[i]
      x$x2y[i] <- n
      n <- n + x$abs[i] / 2 + m
    }
  }
}
x$x1x <- as.integer(x$x1x)
x$x1y <- as.integer(x$x1y)
x$x2x <- as.integer(x$x2x)
x$x2y <- as.integer(x$x2y)
# Finish calculating the positions
x.temp <- x

# Shift the links of expression variables to their band inside the big
# "Expression" segment. The first column takes precedence when both columns
# hold an expression variable.
expr_in_x1 <- regexpr("Averaged.RPKM.", x$x1) > 0
expr_in_x2 <- regexpr("Averaged.RPKM.", x$x2) > 0
band_shift <- maxL.exp * x$order
x1_rows <- which(expr_in_x1)
x2_rows <- which(!expr_in_x1 & expr_in_x2)
x$x1x[x1_rows] <- x$x1x[x1_rows] + band_shift[x1_rows]
x$x1y[x1_rows] <- x$x1y[x1_rows] + band_shift[x1_rows]
x$x2x[x2_rows] <- x$x2x[x2_rows] + band_shift[x2_rows]
x$x2y[x2_rows] <- x$x2y[x2_rows] + band_shift[x2_rows]

# Every expression variable is addressed as the single segment "Expression".
x$x1 <- ifelse(expr_in_x1, "Expression", as.character(x$x1))
x$x2 <- ifelse(expr_in_x2, "Expression", as.character(x$x2))

# Link file, to be used together with the karyotype file
x <- x[, c("x1", "x1x", "x1y", "x2", "x2x", "x2y", "sgn")]
write.table(x, file = circos_file("Links"), row.names = FALSE,
            col.names = FALSE, quote = FALSE, sep = "\t")

################################################
# Essentiality and Omega in Violin Plot & T-test#
################################################
data <- read.table(
  paste(folderAnalysis, organism, "Table", expDataSource, ".txt", sep = ""),
  header = TRUE
)
data <- data[data$Max.Expression > 0.00015, ]
data <- data[, c("Omega.0", "LRT", "P.1", "CDS.Length", "Intron.Length",
                 "Intron.Number", "Median.Expression", "Max.Expression", "Tau",
                 "X..GC.content", "Paralogs.Number", "Stage.Number",
                 "Phyletic.Age", "Essentiality")]
colnames(data)[1] <- "Omega" # only "Omega.0" is renamed
data <- na.omit(data)
geneData <- data

# Normalization of the data: the smallest positive Omega is added before the
# log so that zero values stay finite.
minOmega <- min(geneData$Omega[geneData$Omega > 0])
geneData$Omega <- log2(geneData$Omega + minOmega)
# Remaining normalisations of geneData. Columns that can be zero get an offset
# (the smallest positive value, or 1 for counts) before the log / root.
minLength <- min(geneData$Intron.Length[geneData$Intron.Length > 0])
geneData$Intron.Length <- log2(geneData$Intron.Length + minLength)
minP1 <- min(geneData$P.1[geneData$P.1 > 0])
geneData$P.1 <- sqrt(sqrt(geneData$P.1 + minP1))
geneData$Intron.Number <- log2(geneData$Intron.Number + 1)
geneData$Paralogs.Number <- log2(geneData$Paralogs.Number + 1)
geneData$Phyletic.Age <- geneData$Phyletic.Age + 1
# Tau is shifted so that its maximum becomes exactly 1 before the log.
minTau <- 1 - max(geneData$Tau)
geneData$Tau <- log2(geneData$Tau + minTau)
# Negative LRT values are set to 0 before the fourth root.
geneData$LRT <- sqrt(sqrt(pmax(geneData$LRT, 0)))
geneData$CDS.Length <- log2(geneData$CDS.Length)
print(summary(geneData))

# Regress Omega on every other variable except Essentiality and keep the
# residuals. %in% is used because `!=` against a length-2 vector compares
# element-wise with recycling and only works for particular column positions.
variableNames <- colnames(geneData)
variablesToUse <- variableNames[!variableNames %in% c("Omega", "Essentiality")]
fmodel <- paste(
  "geneData$Omega~",
  paste("geneData$", variablesToUse, sep = "", collapse = "+"),
  sep = ""
)
fmx <- glm(fmodel, na.action = na.exclude)
xres <- resid(fmx)

dataPlot <- data.frame(Omega = xres, Essentiality = geneData$Essentiality)
dataP <- as.matrix(dataPlot[, c("Omega", "Essentiality")])

dev.new(height = 8, width = 8)
palette(rainbow(9))
trellis.par.set(list(
  background = list(col = my.col[1]),
  add.text = list(col = my.col[2], cex = 1.5),
  axis.line = list(col = my.col[2]),
  axis.text = list(col = my.col[2], cex = 1.5),
  par.main.text = list(col = my.col[2], cex = 1.3),
  par.xlab.text = list(col = my.col[2], cex = 1.5),
  par.ylab.text = list(col = my.col[2], cex = 1.7),
  plot.line = list(col = my.col[2]),
  dot.line = list(lwd = 1, lty = 2, col = "#4B4B4B")
))
# trellis.par.get()

# Violin plot with a notched box plot on top, one violin per Essentiality
# value. print() is explicit so the plot is also drawn when the script is run
# through source(), where lattice objects are not auto-printed.
print(bwplot(
  as.numeric(dataP[, 1]) ~ dataP[, 2],
  xlab = "", ylab = "residuals of log2(Omega)",
  main = "Distribution of Omega according to essentiality",
  horizontal = FALSE, col = c("#00BFFF"), fill = c("blue"),
  panel = function(x, y, ..., box.ratio, col, pch) {
    panel.violin(x = x, y = y, ..., cut = 0, varwidth = TRUE,
                 box.ratio = 4 * box.ratio, col = col)
    panel.bwplot(x = x, y = y, ..., varwidth = TRUE, box.ratio = .5,
                 pch = "|", notch = TRUE)
  },
  par.settings = list(
    box.rectangle = list(col = my.col[2], lwd = 2),
    plot.symbol = list(pch = ".", cex = 0.1, col = my.col[2]),
    box.umbrella = list(col = my.col[2])
  ),
  scales = list(x = list(rot = 10, labels = c("Not Essential", "Essential")))
))
# NOTE(review): quartz is the macOS graphics device; this call presumably
# fails on other platforms -- confirm if the script must run elsewhere.
dev.copy2pdf(device = quartz,
             file = paste(folderAnalysis, organism, "OmegaEssentialityNewAres.pdf", sep = ""),
             onefile = TRUE) # ,paper="A4r"

# Two-sided and one-sided ("greater") t-tests of the residuals by Essentiality
dataPF <- data.frame(dataPlot)
dataPF$Omega <- as.numeric(dataPF$Omega)
print(t.test(Omega ~ Essentiality, data = dataPF))
print(t.test(Omega ~ Essentiality, data = dataPF, alternative = "greater"))

######################################
### Correlation between correlations ###
######################################
# Correlation lists of the two organisms, joined on the variable pair; pairs
# present in only one of the files are kept (NA for the other one).
dataCorME <- read.table(paste(folderAnalysis, "Mouse.txt", sep = ""), header = TRUE)
dataCorHF <- read.table(paste(folderAnalysis, "Human.txt", sep = ""), header = TRUE)
dataCor <- merge(dataCorME, dataCorHF, by = c("x1", "x2"), all.x = TRUE, all.y = TRUE)
dataCor <- dataCor[, c("x1", "x2", "corValue.x", "corValue.y")]
colnames(dataCor) <- c("x1", "x2", "ENCODE", "Fagerberg")
print(head(dataCor))

# Plot table: x1 recoded to "Omega" / "Others", then the two correlation
# columns. as.matrix() makes every cell a character string, which is why the
# plotting code converts the value columns back with as.numeric().
dataPlot <- dataCor[, c(-2)]
print(head(dataPlot))
v <- colnames(dataPlot[, -1])
dataPlot$x1 <- ifelse(dataPlot$x1 == "Omega", "Omega", "Others")
dataPlot <- as.matrix(dataPlot)

dev.new(height = 12, width = 16)
trellis.par.set(list(
  background = list(col = my.col[1]),
  add.text = list(col = my.col[2], cex = 1.5),
  axis.line = list(col = my.col[2]),
  axis.text = list(col = my.col[2], cex = 1.5),
  par.main.text = list(col = my.col[2], cex = 2.5),
  par.xlab.text = list(col = my.col[2], cex = 1.5),
  par.ylab.text = list(col = my.col[2], cex = 1.7),
  plot.line = list(col = my.col[2]),
  dot.line = list(lwd = 1, lty = 2, col = "#4B4B4B")
))
# trellis.par.get()
# Scatter plot of the third column of dataPlot against its second column,
# grouped by the first column; the two colours are assigned per group. The
# axis labels name the x values as mouse and the y values as human correlation
# values.
# NOTE(review): the three panel.text() annotations (regression equation and
# the two pair labels) are hard-coded text and positions; they presumably
# describe one particular data set -- confirm before reusing with other input.
# NOTE(review): panel.abline(lm(y~x), ...) is not a statement of the panel
# function but an extra unnamed argument of panel.superpose(); the regression
# line over all points presumably appears when that argument is evaluated --
# confirm before restructuring this call.
xyplot(as.numeric(dataPlot[,3]) ~ as.numeric(dataPlot[,2]), groups=dataPlot[,1], col=c("#FF0000F0", "#0000CCF0"),
       panel = function(x, y, ...){
         panel.superpose(x, y, ...,
                         panel.groups=function(x, y, col, col.symbol, ...) {
                           # points of one group, drawn in that group's colour
                           panel.xyplot(x, y, col=col, ...)
                           #panel.abline(lm(y~x), col.line=col.symbol)
                           panel.text(0.75,0.9, "y=0.002 + 0.89*x", col="black")
                           panel.text(-0.47,-0.023, "GC content to Stage number ->", col="black", cex=1)
                           panel.text(0.44,-0.035, "<- GC content to Maximal expression", col="black", cex=1)
                         },
                         panel.abline(lm(y~x), col.line="black", lwd=2, lty=2)
         )},
       xlab="Mouse correlation values", ylab="Human correlation values",
       main=paste("Partial coefficient correlations in human and mouse",sep=""),
       pch="*", cex=6, xlim=c(-1,1), ylim=c(-1,1)
)
# Copy the current device into a PDF inside folderAnalysis.
# NOTE(review): quartz is the macOS graphics device -- confirm the platform.
dev.copy2pdf(device=quartz, file=paste(folderAnalysis, "MusHumanCorrelationCorrelationsOmega.pdf", sep=""),onefile=TRUE)
###########
####################################