#################################################################################################################################################### #################################################################################################################################################### ### R scripts for "Global Analysis of Genetic, Epigenetic and Transcriptional Polymorphisms in Arabidopsis thaliana Using Whole Genome Tiling Arrays" ### by Xu Zhang and Justin Borevitz #################################################################################################################################################### #################################################################################################################################################### ###This is a function for read in CEL intensity and do spatial correction library(affy) library(stats) readcel <- function(cel.files, probe.number, array.size, filter.size ){ mprobe.mean <- matrix(NA, nr=probe.number, nc=length(cel.files) ) for (i in 1:length(cel.files) ) { probe.mean <- log(intensity(read.affybatch (filenames=cel.files [i] ) ) ) mprobe.mean[,i] <- matrix(probe.mean, nc=array.size, byrow=T) [array.size:1,] [probe.ok] } # calculate median probe within array meds <- apply(mprobe.mean,2,median) mprobe.resid <- mprobe.mean - meds filter.size <- 2*trunc(filter.size /2)+1 # force to be odd so the edge will be same length = filter.size/2 fe <- (filter.size-1)/2 # filter edge mprobe.bg <- mprobe.resid # do spatial correction for (i in 1:length(cel.files)) { sp.mat <- matrix(0,nr= array.size,nc=array.size) sp.mat[probe.ok] <- mprobe.resid[,i] sp.mat.sum <- apply(sp.mat,2,filter,rep(1,filter.size)) # from start to end, every 51 probes sum in moving sp.mat.sum <- apply(sp.mat.sum,1,filter,rep(1,filter.size)) sp.mat.count <- apply(probe.ok,2,filter,rep(1,filter.size)) sp.mat.count <- apply(sp.mat.count,1,filter,rep(1,filter.size)) #how many probes have been added up to contribute the sp.mat.sum spatial.corrected <- t(sp.mat.sum/sp.mat.count) # fill in edges, with the background of edge boarder spatial.corrected[1:fe,] <- rep(spatial.corrected[fe+1, ], each = fe) spatial.corrected[(array.size+1 - fe):array.size, ] <- rep(spatial.corrected[array.size - fe, ], each = fe) spatial.corrected[,1:(fe)] <- spatial.corrected[,fe+1] spatial.corrected[,(array.size+1 - fe):array.size] <- spatial.corrected[,array.size - fe] mprobe.bg[,i] <- spatial.corrected[probe.ok] } mprobe.mean <- (mprobe.mean - mprobe.bg)[chrom.order,] } #################################################################################################################################################### ### This script sets the probe positions on arabidopsis 1.0F array, calls readcel function to read in the genomic DNA CEL intensity according to probe ### positions and do spatial correction, then quantile normalizes across samples setwd ("final2") load ("attile1V7anno.RData") source("readcel.R") #This is avaible online library(affy) array.size <- 2560 probe.ok <- matrix(FALSE, nr=array.size, nc=array.size) probe.ok.chrom <- matrix(NA, nr=array.size, nc=array.size) probe.ok.position <- matrix(NA, nr=array.size, nc=array.size) probe.ok[cbind(attile1$xpos+1, attile1$ypos+1)] <- TRUE probe.ok.chrom[cbind(attile1$xpos + 1, attile1$ypos+1)] <-attile1$chr probe.ok.position [cbind(attile1$xpos + 1, attile1$ypos+1)] <- attile1$bpstart probe.ok.chrom.valid <- probe.ok.chrom[probe.ok] probe.ok.position.valid <- probe.ok.position [probe.ok] chrom.order <- order(probe.ok.chrom.valid, probe.ok.position.valid) setwd ("../CEL/mDNA") cel.files <- list.celfiles() mprobe.mean <- readcel (cel.files=cel.files, probe.number=nrow(attile1), array.size=2560, filter.size=81) setwd ("../../final2") save(mprobe.mean, file="mDNA.sc.RData", compress=T) load ("attile1V7anno.RData") load ("mDNA.sc.RData") mDNA.nq <- normalize.quantiles(mprobe.mean) attile.ccgg <- attile1[ grep("ccgg", as.character(attile1$seq) ), ] write.table (attile.ccgg, "table.csv") attile.ccgg <- read.table("table.csv", header=T) save( attile.ccgg, file="attile.ccgg.RData", compress=T) mDNA.ccgg <- mDNA.nq[ grep("ccgg", as.character(attile1$seq) ), ] save( mDNA.ccgg, file="mDNA.ccgg.RData", compress=T) q("no") ###################################################################################################################################################### ### This script detect SFPs between Col and Van setwd ("final2") load ("mDNA.sc.RData") library(affy) mDNA.nq <- normalize.quantiles(mprobe.mean) mDNA.nq <- mDNA.nq[, 1:16] library (siggenes) cl<- c( rep(1, 8), rep(0, 8) ) system.time( sam.out <- sam(mDNA.nq, cl, rand=123, p0=0.95)) save (sam.out, file="mDNA.SFP.samout.RData", compress=T) summary(sam.out, seq(0.5, 1, 0.05) ) plot(sam.out, 0.95) q("no") ##################################################################################################################################################### ##################################################################################################################################################### ### This script models probe intensity for enzyme-digested genomic DNA samples between Col and Van by genotype*enz+plant(random) load ("mDNA.ccgg.RData") mDNA.ccgg <- mDNA.ccgg[, 1:16] add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz plant <- as.factor( rep(LETTERS[1:8], each = 2) ) fac.mat <- data.frame(add, enz, addenz) library(nlme) for (i in 1:nrow(mDNA.ccgg) ) { y <- mDNA.ccgg[i,] fit <- try( lme ( y ~ add*enz, random =~1|plant ), TRUE) while( mode(fit)=="character") {y <- jitter(y, amount=0); fit <- lme( y~ add*enz, random=~1|plant) } mDNA.ccgg [i,] <- fit$coef$fixed [-1] %*% t(fac.mat) + fit$coef$fixed[1] +fit$resid[,1] if( i/1000 == trunc(i/1000) ) cat (i, "\n") } methyl.main <- list() fit <- lsfit(fac.mat, t(mDNA.ccgg) ) methyl.main [[1]] <- fit$coef methyl.main [[2]] <- fit$resid names(methyl.main) <- c("coef", "resid") save(methyl.main, file="methyl.main2.RData", compress=T) save(mDNA.ccgg, file="corrected.mDNA.ccgg2.RData", compress=T) q("no") #################################################################################################################################################### ### This is permutation for genotype, enzyme, and genotype*enzyme effects k <- 1 #1 for genotype effect, 2 for enzyme effect, 3 for gentoype*enzyme effect setwd ("final2") load ("corrected.mDNA.ccgg2.RData") load ("samp.matrix16.RData") add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz plant <- as.factor( rep(LETTERS[1:8], each = 2) ) fac.mat <- data.frame(add, enz, addenz) rfit <- lsfit(fac.mat[, -k], t(mDNA.ccgg) ) predict <- t( as.matrix(fac.mat[,-k]) %*% rfit$coef [-1,] ) + rfit$coef[1,] resid <- t(rfit$resid) nperm <- 1000 matr <- matrix(NA, nr=nrow(mDNA.ccgg), nc=nperm) perm.dstat <- list( coef=matr, std=matr) denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) for (iperm in 1:nperm) { perm.dat <- predict + resid[, samp.matrix[iperm,] ] pfit <- lsfit(fac.mat, t(perm.dat) ) perm.dstat[[1]][,iperm] <- pfit$coef[k+1,] perm.dstat[[2]][,iperm] <- sqrt( colSums ( (pfit$resid)^2) / 12) / denom[k] if (iperm/100==trunc(iperm/100)) cat(iperm, "\n") } save(perm.dstat, file =paste("perm2.dstat", k, ".RData", sep=""), compress=T) q("no") #################################################################################################################################################### ### Calculate p value based on permutation for genotype, enzyme, and genotype*enzyme effects k <- 1 #1 for genotype effect, 2 for enzyme effect, 3 for gentoype*enzyme effect setwd ("final2") load ("methyl.main2.RData") load ( paste ("perm2.dstat", k, ".RData", sep="") ) load ("corrected.mDNA.ccgg2.RData") add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) s0 <- quantile( perm.dstat[[2]], 0.05) std <- sqrt( colSums( methyl.main[[2]]^2)/12 ) / denom[k] main <- methyl.main[[1]][k+1,] / (std + s0) names(main) <- rownames(mDNA.ccgg) perm.mat <- perm.dstat[[1]]/(perm.dstat[[2]]+s0) vec <- rep(NA, length(main) ) pval <- list(vec, vec) main.sort <- sort(main) for( i in 1:length(main) ) { pval[[1]][i] <- sum( perm.mat <= main.sort[i])/ (1000*length(main) ) if (i/100 == trunc(i/100) ) cat(i, "\n") } names(pval[[1]]) <- names(main.sort) main.sort <- sort(main, decreasing=T) for (i in 1:length(main) ) { pval[[2]][i] <- sum( perm.mat >= main.sort[i])/ (1000*length(main) ) if (i/100 == trunc(i/100) ) cat(i, "\n") } names(pval[[2]]) <- names(main.sort) methyl.summary <- pval save(methyl.summary, file=paste("methyl2.summary", k, ".RData", sep=""), compress=T) cut <- c(0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1) print( t( sapply(cut, function(x) c( length(methyl.summary[[1]][methyl.summary[[1]]<= x]) , length(methyl.summary[[2]][methyl.summary[[2]]<= x]) ) )) ) q("no") #################################################################################################################################################### #################################################################################################################################################### ### Analysis of genic distribution of enzyme and genotyep*enzyme effects k <- 3 #2 for genotype, 3 for genotyep*enzyme effect setwd ("final2") load ("methyl.main2.RData") load (paste("perm2.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) - 24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/12 ) / denom[k] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) #this is for enzyme effect #main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:3583] #0.03 #hpa <- names(main.sort)[1:4522] #0.05 #hpa <- names(main.sort)[1:6324] #0.1 #this is for genotype*enzyme effect main.sort <- sort(main) #hpa <- names(main.sort)[1:407] #0.01 #hpa <- names(main.sort)[1:944] #0.03 hpa <- names(main.sort)[1:1515] #0.05 main.sort <- sort(main, decreasing=T) #hpa <- c(hpa, names(main.sort)[1:1062]) #0.01 #hpa <- c(hpa, names(main.sort)[1:2389]) #0.03 hpa <- c(hpa, names(main.sort)[1:3700]) #0.05 cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe tab <- matrix(NA, nr=3, nc=8) rownames(tab) <- c("enz", "total", "percentage") #rownames(tab) <- c("addenz", "total", "percentage") colnames(tab) <- c("cDNA", "CD", "intron", "utr5", "utr3", "promoter", "downstream", "intergenic") setwd ("../ftp") ### for cdna cdna <- read.delim("TAIR7_cdna_20070425", header=F) cdna <- as.character( cdna[1:nrow(cdna), ] ) cdna <- cdna[ grep(">", cdna, fixed=T) ] # 33122 gene <- matrix( unlist(strsplit(cdna, " | ", fixed=T) ), nr=4)[1,] # 33122 coord <- matrix( unlist(strsplit(cdna, " | ", fixed=T) ), nr=4)[4,] # 33122 gene <- gene[ grep("AT[12345]G", gene) ] #32843 coord <- coord [ grep("chr[12345]", coord) ] #32843 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene <- gene[isoform=="1"] # 27838 coord <- coord[isoform=="1"] # 27838 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #cdna has no reverse pos summary(pos[,2]-pos[,1]) p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,1] <- c(p1, p2) ### for cd cd <- read.delim("TAIR7_cds_20070425", header=F) cd <- as.character( cd[1:nrow(cd), ] ) cd <- cd[ grep(">", cd, fixed=T) ] #31921 gene <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[1,] #31921 coord <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[4,] #31921 gene <- gene[ grep("AT[12345]G", gene) ] #31711 coord <- coord [ grep("chr[12345]", coord) ] #31711 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene.cd <- gene[isoform=="1"] #26784 coord <- coord[isoform=="1"] #26784 strand.cd <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr.cd <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos.cd <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #cd has no reverse pos summary(pos.cd[,2]-pos.cd[,1]) intron <- read.delim("TAIR7_intron_20070227", header=F) intron <- as.character( intron[1:nrow(intron),] ) intron <- intron[ grep(">", intron, fixed=T) ] # 148558 gene <- matrix( unlist( strsplit(intron, " | ", fixed=T) ), nr=3)[1,] #148558 coord <- matrix( unlist( strsplit(intron, " | ", fixed=T) ), nr=3)[3,] #148558 gene <- gene[ grep("AT[12345]G", gene) ] # 148516 coord <- coord [ grep("chr[12345]", coord) ] # 148516 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] isoform <- matrix( unlist(strsplit(isoform, "-", fixed=T) ), nr=2)[1,] gene <- gene[isoform=="1"] # 117219 coord <- coord[isoform=="1"] # 117219 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #intron has no reverse pos summary(pos[,2]-pos[,1]) gene.exon <- pos.exon <- chr.exon <- strand.exon <- c() gene.list <- intersect(gene.cd, gene ) for (i in 1:length(gene.list) ) { index1 <- which( gene == gene.list[i]) intron <- pos[index1, ] index2 <- which(gene.cd == gene.list[i]) stap <- pos.cd[index2, ] if(strand.cd[index2]=="REVERSE") ifelse(length(intron)==2, intron <- intron[2:1], intron <- intron[ order(intron[,1]), ]) ifelse( length(intron)==2, intron <- intron[intron[1]>stap[1]&intron[1]stap[1]&intron[2]stap[1] & intron[,1]stap[1] &intron[,2] 1) {exon <- cbind(intron[,1]-1, intron[,2]+1); gene.exon <- c(gene.exon, rep(gene.cd[index2], n+1) ); pos.exon <- c(pos.exon, stap[1], c(t(exon) ), stap[2] ); chr.exon <- c(chr.exon, rep( chr.cd[index2],n+1) ); strand.exon <- c(strand.exon, rep( strand.cd[index2],n+1) ) } if( i/1000==trunc(i/1000) ) cat(i, "\n") } gene <- gene.exon chr <- chr.exon strand <- strand.exon pos <- matrix(as.numeric(pos.exon), byrow=T, nc=2) summary(pos[,2]-pos[,1]) p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,2] <- c(p1, p2) ### for 5'UTR utr5 <- read.delim("TAIR7_5_utr_20070227", header=F) utr5 <- as.character( utr5[1:nrow(utr5),] ) utr5 <- utr5[ grep(">", utr5, fixed=T) ] # 22998 gene <- matrix( unlist(strsplit(utr5, " | ", fixed=T) ), nr=3)[1,] #22998 utr <- matrix( unlist(strsplit(utr5, " | ", fixed=T) ), nr=3)[2,] #22998 coord <- matrix( unlist(strsplit(utr5, " | ", fixed=T) ), nr=3)[3,] #22998 utr <- utr[ grep("AT[12345]G", gene) ] #22994 gene <- gene[ grep("AT[12345]G", gene) ] #22994 coord <- coord [ grep("chr[12345]", coord) ] #22994 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene <- gene[isoform=="1"] #18258 utr <- utr[isoform=="1"] #18258 coord <- coord[isoform=="1"] #18258 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #utr5 has no reverse pos utr <- gsub( "[", "", utr, fixed=T) utr <- gsub( "]", "", utr, fixed=T) utr <- sapply(utr, function(x) unlist( strsplit(x, " ") ) ) utr <- lapply(utr, function(x) matrix( as.numeric( unlist( strsplit(x, "-", fixed=T) ) ), byrow=T, nc=2) ) len <- unlist( lapply(utr, function(x)length(x)/2) ) strand <- rep.int(strand, len) chr <- rep.int(chr, len) gene <- rep.int(gene, len) posm <- c() for (i in 1:length(utr) ) { pos1 <- pos[i,1] + utr[[i]][,1] -1 pos2 <- pos[i,1] + utr[[i]][,2] -1 posm <- c(posm, rbind(pos1, pos2) ) } pos <- matrix(posm, byrow=T, nc=2) summary(pos[,2]-pos[,1]) p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,4] <- c(p1, p2) ### for 3'UTR utr3 <- read.delim("TAIR7_3_utr_20070227", header=F) utr3 <- as.character( utr3[1:nrow(utr3),] ) utr3 <- utr3[ grep(">", utr3, fixed=T) ] #24016 gene <- matrix( unlist(strsplit(utr3, " | ", fixed=T) ), nr=3)[1,] utr <- matrix( unlist(strsplit(utr3, " | ", fixed=T) ), nr=3)[2,] coord <- matrix( unlist(strsplit(utr3, " | ", fixed=T) ), nr=3)[3,] utr <- utr[ grep("AT[12345]G", gene) ] #24016 gene <- gene[ grep("AT[12345]G", gene) ] #24016 coord <- coord [ grep("chr[12345]", coord) ] #24016 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene <- gene[isoform=="1"] #19219 utr <- utr[isoform=="1"] #19219 coord <- coord[isoform=="1"] #19219 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #utr3 has no reverse pos utr <- gsub( "[", "", utr, fixed=T) utr <- gsub( "]", "", utr, fixed=T) utr <- sapply(utr, function(x) unlist( strsplit(x, " ") ) ) utr <- lapply(utr, function(x) matrix( as.numeric( unlist( strsplit(x, "-", fixed=T) ) ), byrow=T, nc=2) ) utr.len <- lapply(utr, function(x) x[,2]-x[,1]+1 ) utr.sta <- lapply(utr, function(x) c(1, (x[,1]-x[1,1]+1)[-1] ) ) len <- unlist( lapply(utr, function(x)length(x)/2) ) strand <- rep.int(strand, len) chr <- rep.int(chr, len) gene <- rep.int(gene, len) posm <- c() for (i in 1:length(utr) ) { pos1 <- pos[i,1] + utr.sta[[i]] -1 pos2 <- pos1 + utr.len[[i]] -1 posm <- c(posm, rbind(pos1, pos2) ) } pos <- matrix(posm, byrow=T, nc=2) summary(pos[,2]-pos[,1]) p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,5] <- c(p1, p2) ### for intron intron <- read.delim("TAIR7_intron_20070227", header=F) intron <- as.character( intron[1:nrow(intron),] ) intron <- intron[ grep(">", intron, fixed=T) ] # 148558 gene <- matrix( unlist( strsplit(intron, " | ", fixed=T) ), nr=3)[1,] coord <- matrix( unlist( strsplit(intron, " | ", fixed=T) ), nr=3)[3,] gene <- gene[ grep("AT[12345]G", gene) ] #148516 coord <- coord [ grep("chr[12345]", coord) ] #148516 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] isoform <- matrix( unlist(strsplit(isoform, "-", fixed=T) ), nr=2)[1,] gene <- gene[isoform=="1"] # 117219 coord <- coord[isoform=="1"] # 117219 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #intron has no reverse pos summary(pos[,2]-pos[,1]) p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,3] <- c(p1, p2) ### for intergenic intergenic <- read.delim("TAIR7_intergenic_20070227", header=F) intergenic <- as.character( intergenic[1:nrow(intergenic),] ) intergenic <- intergenic[ grep(">", intergenic, fixed=T) ] #30413 coord <- matrix( unlist( strsplit(intergenic, " | ", fixed=T) ), nr=2)[2,] coord <- coord[ grep("chr[12345]" , coord) ] #30149 coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #intergenic has no reverse pos, and always forward direction regardless flanking genes summary(pos[,2]-pos[,1]) p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,8] <- c(p1, p2) ### for promoter 1000 promoter <- read.delim("TAIR7_upstream_1000_20070405", header=F) promoter <- as.character( promoter[1:nrow(promoter),]) promoter <- promoter[ grep(">", promoter, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #promoter has no reverse pos p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,6] <- c(p1, p2) ### for downstream1000 downstream <- read.delim("TAIR7_downstream_1000_20070405", header=F) downstream <- as.character( downstream[1:nrow(downstream), ] ) downstream <- downstream[ grep(">", downstream, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #downstream has no reverse pos p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) p1 <- p1 + length(probe) probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,1] & ccgg[,2] < pos[i,2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,7] <- c(p1, p2) tab[3,] <- tab[1,]/tab[2,] print(tab) q("no") #################################################################################################################################################### ### analysis of enzyme and genotype*enzyme effect in position quantiles of gene (includes CD and intron) k <- 3 #2 for enzyme effect, 3 for genotype*enzyme effect setwd ("final2") load ("methyl.main2.RData") load (paste("perm2.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) - 24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/12 ) / denom[k] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) #main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:3583] #0.03 #hpa <- names(main.sort)[1:4522] #0.05 #hpa <- names(main.sort)[1:6324] #0.1 main.sort <- sort(main) #hpa <- names(main.sort)[1:407] #0.01 #hpa <- names(main.sort)[1:944] #0.03 hpa <- names(main.sort)[1:1515] #0.05 main.sort <- sort(main, decreasing=T) #hpa <- c(hpa, names(main.sort)[1:1062]) #0.01 #hpa <- c(hpa, names(main.sort)[1:2389]) #0.03 hpa <- c(hpa, names(main.sort)[1:3700]) #0.05 cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe setwd ("../ftp") cd <- read.delim("TAIR7_cds_20070425", header=F) cd <- as.character( cd[1:nrow(cd), ] ) cd <- cd[ grep(">", cd, fixed=T) ] #31921 gene <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[1,] #31921 coord <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[4,] #31921 gene <- gene[ grep("AT[12345]G", gene) ] #31711 coord <- coord [ grep("chr[12345]", coord) ] #31711 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene <- gene[isoform=="1"] #26784 coord <- coord[isoform=="1"] #26784 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #cd has no reverse pos ### for all cd size pos <- apply(pos, 1, function(x) { a <- rep( (x[2] - x[1])/10, 10); a <- cumsum(a); c(x[1], x[1] + a) }) pos <- t(pos) #for each cd divided to 10 parts tab <- matrix(NA, nr=3, nc=10) #rownames(tab) <- c("enz", "total", "percentage") rownames(tab) <- c("addenz", "total", "percentage") colnames(tab) <- paste( "CDquant", 1:10, sep="" ) for (k in 1:10) { p1 <- p2 <- 0 for (i in 1:nrow(pos) ) { ifelse( strand[i] == "FORWARD", probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,k] & cg[,2] < pos[i,k+1] ), probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,11-k] & cg[,2] < pos[i,11-k+1] ) ) p1 <- p1 + length(probe) ifelse( strand[i] == "FORWARD", probe <- which(ccgg[,1] == chr[i] & ccgg[,2] > pos[i,k] & ccgg[,2] < pos[i,k+1] ), probe <- which (ccgg[,1]== chr[i] & ccgg[,2] > pos[i,11-k] & ccgg[,2] < pos[i,11-k+1] ) ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,k] <- c(p1, p2) } tab[3,] <- tab[1,]/tab[2,] print(tab) q("no") ### for seperate cd size cd.len <- pos[,2]-pos[,1]+1 #cut <- c(1000, 2000, 3000) size <- list() size[[1]] <- which( cd.len <= 1000) size[[2]] <- which( cd.len > 1000 & cd.len <= 2000) size[[3]] <- which( cd.len >2000 & cd.len <= 3000) size[[4]] <- which( cd.len >3000) pos <- apply( pos, 1, function(x) { a <- rep( (x[2] - x[1])/10, 10); a <- cumsum(a); c(x[1], x[1] + a) }) pos <- t(pos) for (j in 1:length(size)) { index <- size[[j]] gene.temp <- gene[index] strand.temp <- strand[index] chr.temp <- chr[index] pos.temp <- pos[index,] tab <- matrix(0, nr=3, nc=10) #rownames(tab) <- c("enz", "total", "percentage") rownames(tab) <- c("addenz", "total", "percentage") colnames(tab) <- paste("quant.i", 1:10, sep="") for (k in 1:10){ p1 <- p2 <- 0 for (i in 1:nrow(pos.temp) ) { ifelse( strand.temp[i] == "FORWARD", probe <- which (cg[,1] == chr.temp[i] & cg[,2] >= pos.temp[i,k] & cg[,2]= pos.temp[i,11-k] & cg[,2] < pos.temp[i,11-k+1] ) ) p1 <- p1+length(probe) ifelse( strand.temp[i] == "FORWARD", probe <- which (ccgg[,1] ==chr.temp[i] & ccgg[,2]>= pos.temp[i,k] & ccgg[,2]= pos.temp[i,11-k] & ccgg[,2] < pos.temp[i,11-k+1] ) ) p2 <- p2+length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,k] <- c(p1, p2) } tab[3,] <- tab[1,]/tab[2,] print(tab) } q("no") #################################################################################################################################################### ### analysis of enzyme effect and genotype*enzyem effect in promoter intervals k <- 3 #2 for enzyme effect, 3 for genotype*enzyme effect setwd ("final2") load ("methyl.main2.RData") load (paste("perm2.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) -24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/12 ) / denom[k] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) #main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:3583] #0.03 #hpa <- names(main.sort)[1:4522] #0.05 #hpa <- names(main.sort)[1:6324] #0.1 main.sort <- sort(main) #hpa <- names(main.sort)[1:407] #0.01 #hpa <- names(main.sort)[1:944] #0.03 hpa <- names(main.sort)[1:1515] #0.05 main.sort <- sort(main, decreasing=T) #hpa <- c(hpa, names(main.sort)[1:1062]) #0.01 #hpa <- c(hpa, names(main.sort)[1:2389]) #0.03 hpa <- c(hpa, names(main.sort)[1:3700]) #0.05 cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe bin <- c( seq(100, 1000, 100), 2000, 3000) len <- c( rep(100, 10), rep(1000, 2) ) tab <- matrix(NA, nr=3, nc=12) #rownames(tab) <- c("enz", "total", "percentage") rownames(tab) <- c("addenz", "total", "percentage") colnames(tab) <- paste("prom", bin, sep="") setwd ("../ftp") promoter <- read.delim("TAIR7_upstream_500_20070405", header=F) promoter <- as.character( promoter[1:nrow(promoter),]) promoter <- promoter[ grep(">", promoter, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #promoter has no reverse pos posf <- matrix( sapply( bin, function(x) pos[,2] - x), nc=12) posr <- matrix( sapply( bin, function(x) pos[,1] + x), nc=12) for (k in 1:12) { p1 <- p2 <- 0 for (i in 1:length(gene) ) { ifelse( strand[i]=="FORWARD", posm <- c(posf[i,k], posf[i,k]+len[k]), posm <- c(posr[i,k]-len[k], posr[i,k]) ) ##for bining #ifelse(strand[i]=="FORWARD",posm <- c(posf[i,k], pos[i,2]), posm <- c(pos[i,1],posr[i,k]) ) #for cumulative probe <- which (cg[,1] == chr[i] & cg[,2] >= posm[1] & cg[,2] < posm[2] ) p1 <- p1 + length(probe) probe <- which (ccgg[,1]== chr[i] & ccgg[,2] >= posm[1] & ccgg[,2] < posm[2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,k] <- c(p1,p2) } tab[3,] <- tab[1,]/tab[2,] print(tab) q("no") #################################################################################################################################################### ### analysis of enzyme effect and genotype*enzyme efffect in downstream intervals k <- 3 #2 for enzyme effect, 3 for genotype*enzyme effect setwd ("final2") load ("methyl.main2.RData") load (paste("perm2.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) -24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/12 ) / denom[k] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) #main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:3583] #0.03 #hpa <- names(main.sort)[1:4522] #0.05 #hpa <- names(main.sort)[1:6324] #0.1 main.sort <- sort(main) #hpa <- names(main.sort)[1:407] #0.01 #hpa <- names(main.sort)[1:944] #0.03 hpa <- names(main.sort)[1:1515] #0.05 main.sort <- sort(main, decreasing=T) #hpa <- c(hpa, names(main.sort)[1:1062]) #0.01 #hpa <- c(hpa, names(main.sort)[1:2389]) #0.03 hpa <- c(hpa, names(main.sort)[1:3700]) #0.05 cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe bin <- c( seq(100, 1000, 100), 2000, 3000) len <- c( rep(100, 10), rep(1000, 2) ) tab <- matrix(NA, nr=3, nc=12) #rownames(tab) <- c("enz", "total", "percentage") rownames(tab) <- c("addenz", "total", "percentage") colnames(tab) <- paste("down", bin, sep="") setwd ("../ftp") downstream <- read.delim("TAIR7_downstream_500_20070405", header=F) downstream <- as.character( downstream[1:nrow(downstream), ] ) downstream <- downstream[ grep(">", downstream, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #downstream has no reverse pos posf <- matrix( sapply( bin, function(x) pos[,1] + x), nc=12) posr <- matrix( sapply( bin, function(x) pos[,2] - x), nc=12) for (k in 1:12) { p1 <- p2 <- 0 for (i in 1:length(gene) ) { ifelse( strand[i]=="FORWARD", posm <- c(posf[i,k]-len[k], posf[i,k]), posm <- c(posr[i,k], posr[i,k]+len[k]) ) #ifelse( strand[i]=="FORWARD", posm <- c(pos[i,1], posf[i,k]), posm <- c(posr[i,k], pos[i,2]) ) probe <- which (cg[,1] == chr[i] & cg[,2] >= posm[1] & cg[,2] < posm[2] ) p1 <- p1 + length(probe) probe <- which (ccgg[,1]== chr[i] & ccgg[,2] >= posm[1] & ccgg[,2] < posm[2] ) p2 <- p2 + length(probe) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab[1:2,k] <- c(p1,p2) } tab[3,] <- tab[1,]/tab[2,] print(tab) q("no") #################################################################################################################################################### ### genomic distribution of enzyme effect and genotype*enzyme effect k <- 3 #2 for enzyme effect, 3 for genotype*enzyme effect setwd ("final2") load ("methyl.main2.RData") load (paste("perm2.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) - 24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/12 ) / denom[k] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) #main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:3583] #0.03 #hpa <- names(main.sort)[1:4522] #0.05 #hpa <- names(main.sort)[1:6324] #0.1 main.sort <- sort(main) #hpa <- names(main.sort)[1:407] #0.01 #hpa <- names(main.sort)[1:944] #0.03 hpa <- names(main.sort)[1:1515] #0.05 main.sort <- sort(main, decreasing=T) #hpa <- c(hpa, names(main.sort)[1:1062]) #0.01 #hpa <- c(hpa, names(main.sort)[1:2389]) #0.03 hpa <- c(hpa, names(main.sort)[1:3700]) #0.05 cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe cen.pos <- matrix( c(15088487, 15089486, 3607927, 3608926, 13810404, 13811403, 3956019, 3957018, 11742255, 11743254), byrow=T, nc=2) chr.len <- c(30432563, 19705359, 23470805, 18585042, 26992728) left.arm <- cbind( rep(1,5), cen.pos[,1]) right.arm <- cbind( cen.pos[,2], chr.len) bin <- 1000000 cut.left <- apply(left.arm, 1, function(x) { a <- trunc( (x[2]-x[1])/bin ); a <- rep(bin, a); a <-cumsum(a); c(x[2], x[2]-a) } ) cut.right <- apply(right.arm, 1, function(x) { a <- trunc( (x[2]-x[1])/bin ); a <- rep(bin, a); a <- cumsum(a); c(x[1], x[1]+a) } ) count.left <- count.right <- list() for (k in 1:5) { left <- cut.left[[k]] right <- cut.right[[k]] tab <- matrix(NA, nr=3, nc=length(left)-1) #rownames(tab) <- c("enz", "total", "percentage") rownames(tab) <- c("addenz", "total", "percentage") colnames(tab) <- paste("left", k, "-", 1:(length(left)-1), sep="") p1 <- p2 <- 0 for (i in 1:(length(left)-1) ) { probe <- which (cg[,1] == k & cg[,2] > left[i+1] & cg[,2] <= left[i] ) p1 <- p1 + length(probe) probe <- which (ccgg[,1]== k & ccgg[,2] > left[i+1] & ccgg[,2] <= left[i] ) p2 <- p2 + length(probe) tab[1:2,i] <- c(p1, p2) } tab[3,] <- tab[1,]/tab[2,] count.left[[k]] <- tab tab <- matrix(NA, nr=3, nc=length(right)-1) #rownames(tab) <- c("enz", "total", "percentage") rownames(tab) <- c("addenz", "total", "percentage") colnames(tab) <- paste("right", k, "-", 1:(length(right)-1), sep="") p1 <- p2 <- 0 for (i in 1:(length(right)-1) ) { probe <- which (cg[,1] == k & cg[,2] >= right[i] & cg[,2] < right[i+1] ) p1 <- p1 + length(probe) probe <- which (ccgg[,1]== k & ccgg[,2] >= right[i] & ccgg[,2] < right[i+1] ) p2 <- p2 + length(probe) tab[1:2,i] <- c(p1, p2) } tab[3,] <- tab[1,]/tab[2,] count.right[[k]] <- tab } centr.count <- list(cut.left, cut.right, count.left, count.right) #save(centr.count, file="review/centr.count.enz2.RData", compress=T) save(centr.count, file="review/centr.count.addenz2.RData", compress=T) q("no") ###plotting pdf ("fromCentromere.pdf", height=4, width=4) { par( mfrow=c(5,1), omi=c(0.3, 0, 0.2, 0.62), mai=c(0, 0.82, 0, 0), yaxp=c(0, 0.8, 1), tck=-0.04 ) for (k in 1:5) { load ("centr.count.enz2.RData") xl <- centr.count[[1]][[k]] xr <- centr.count[[2]][[k]] xl <- (xl+500000)[2:length(xl)] xr <- (xr+500000)[1:(length(xr)-1)] yl <- centr.count[[3]][[k]][3,]*100 yr <- centr.count[[4]][[k]][3,]*100 if (k == 1) {plot(xl, yl, xlim=c(0, 30432563 ), ylim= c(2, 37), ylab="", xlab="", col=heat.colors(18)[10], pch=15, cex=0.8, xaxt="n", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topleft", c("enzyme", "genotype x enzyme"), col=c( heat.colors(18)[10], "green"), pch=15, cex=0.8, bty="n"); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n") } if (k ==3) {plot(xl, yl, xlim=c(0, 30432563 ), ylim= range( c(2, 37) ), ylab="percentage", xlab="", col=heat.colors(18)[10], pch=15, cex=0.8, xaxt="n", cex.axis=0.8, mgp=c(1.5, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n")} if (k ==5) {plot(xl, yl, xlim=c(0, 30432563 ), ylim= range( c(2, 37) ), ylab="", xlab="", col=heat.colors(18)[10], pch=15, cex=0.8, cex.axis=0.8, mgp=c(3, 0.1, 0) ) ; legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n"); mtext( "bp", at=15000000, line=-7.4, cex=0.7 ) } if (k ==2 | k==4) {plot(xl, yl, xlim=c(0, 30432563 ), ylim= range( c(2, 37) ), ylab="", xlab="", col=heat.colors(18)[10], pch=15, cex=0.8, xaxt="n", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n")} lines(xl, yl, col=heat.colors(18)[10], lwd=1) points(xr, yr, col=heat.colors(18)[10], pch=15, cex=0.8) lines(xr,yr, col=heat.colors(18)[10], lwd=1) load ("centr.count.addenz2.RData") xl <- centr.count[[1]][[k]] xr <- centr.count[[2]][[k]] xl <- (xl+500000)[2:length(xl)] xr <- (xr+500000)[1:(length(xr)-1)] yl <- centr.count[[3]][[k]][3,]*100 yr <- centr.count[[4]][[k]][3,]*100 if (k ==3) points(xl, yl, xlim=c(0, 30432563 ), ylim= range( c(2, 37) ), ylab="", xlab="", col="green", pch=15, cex=0.8, xaxt="n" ) if (k ==5) points(xl, yl, xlim=c(0, 30432563 ), ylim= range( c(2, 37) ), ylab="", xlab="", col="green", pch=15, cex=0.8) if (k ==1 | k ==2 | k==4) points(xl, yl, xlim=c(0, 30432563 ), ylim= range( c(2, 37) ), ylab="", xlab="", col="green", pch=15, cex=0.8, xaxt="n" ) lines(xl, yl, col="green", lwd=1) points(xr, yr, col="green", pch=15, cex=0.8) lines(xr,yr, col="green", lwd=1) } } dev.off() #################################################################################################################################################### ### lowess smoothing of enzyme effect and genotpe*enzyme effect d scores k <- 3 #2 for enzyme effect, 3 for genotype*enzyme effect setwd ("final2") load ("methyl.main2.RData") load (paste("perm2.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) - 24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1), each=8) enz <- rep( c(1,-1), 8) addenz <- add*enz denom <- rep( sqrt( sum( (add-mean(add) )^2) ), 3) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/12 ) / denom[k] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) main <- cbind(ccgg, main) #save (main, file="review/main.enz.RData") save (main, file="review/main.addenz.RData") q("no") ### shuffling 1kb setwd ("final2") #load ("review/main.enz.RData") load ("review/main.addenz.RData") chr.len <- c(30432563, 19705359, 23470805, 18585042, 26992728) for( k in 1:5) { mainm <- main[ main[,1]==k, ] tab <- trunc( chr.len[k]/1000 ) +1 tab <- seq(1, tab, 1) tab <- cbind( 1000*(tab-1)+1, 1000*tab) vec <- list() for (i in 1:nrow(tab) ) {probe <- which(mainm[,2]>= tab[i,1] & mainm[,2] <= tab[i,2] ); vec[[i]] <- mainm[probe, 3]; if(i/100==trunc(i/100) ) cat(k, i, sep="\t", "\n")} main [ main[,1]==k, 3] <- unlist( sample(vec) ) } suff <- main #save(suff, file="review/suff.enz.RData", compress=T) save(suff, file="review/suff.addenz.RData", compress=T) q("no") ### plot setwd ("final2") pdf ("review/lowess.enzyme.pdf", height=4, width=4) { par( mfrow=c(5,1), omi=c(0.3, 0, 0.2, 0.62), mai=c(0, 0.82, 0, 0), yaxp=c(0, 0.8, 1), tck=-0.04 ) load ("review/main.enz.RData") load ("review/suff.enz.RData") for (k in 1:5) { x1 <- main[,2][ main[,1]==k] y1 <- main[,3][ main[,1]==k] x2 <- suff[,2][ suff[,1]==k] y2 <- suff[,3][ suff[,1]==k] if (k == 1) {plot( lowess(x1, y1, f=0.01), xlim=c(0, 30432563 ), ylim=c(-1.0, 0.9), ylab="", xlab="", col=heat.colors(18)[10], xaxt="n", pch=".", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topleft", c("enzyme effect", "shuffled enzyme effect"), col=c( heat.colors(18)[10], "black"), pch=15, bty="n", cex=0.8); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n") } if (k ==3) {plot( lowess (x1, y1,f=0.01), xlim=c(0, 30432563 ), ylim=c(-1.0, 0.9), ylab="lowess(d)", cex=0.8, xlab="", col=heat.colors(18)[10], xaxt="n", pch=".", cex.axis=0.8, mgp=c(1.5, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n") } if (k ==2 | k==4 ) {plot(lowess(x1, y1, f=0.01), xlim=c(0, 30432563 ), ylim=c(-1.0, 0.9), ylab="", xlab="", col=heat.colors(18)[10], xaxt="n", pch=".", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n") } if (k==5) {plot ( lowess (x1, y1,f=0.01), xlim=c(0, 30432563 ), ylim=c(-1.0, 0.9), ylab="", cex.lab=0.8, xlab="", col=heat.colors(18)[10], pch=".", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n"); mtext( "bp", at=15000000, line=-7.4, cex=0.7 ) } points( lowess(x2, y2, f=0.01), col="black", pch="." ) } } dev.off() pdf ("review/lowess.addenz.pdf", height=4, width=4) { par( mfrow=c(5,1), omi=c(0.3, 0, 0.2, 0.62), mai=c(0, 0.82, 0, 0), yaxp=c(0, 0.8, 1), tck=-0.04 ) load ("review/main.addenz.RData") load ("review/suff.addenz.RData") for (k in 1:5) { x1 <- main[,2][ main[,1]==k] y1 <- main[,3][ main[,1]==k] x2 <- suff[,2][ suff[,1]==k] y2 <- suff[,3][ suff[,1]==k] if (k == 1) {plot( lowess(x1, y1, f=0.01), xlim=c(0, 30432563 ), ylim=c(-0.12, 0.52), ylab="", xlab="", col="green", xaxt="n", pch=".", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topleft", c("genotype x enzyme", "shuffled genotype x enzyme"), col=c("green", "black"), pch=15, bty="n", cex=0.8); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n") } if (k ==3) {plot( lowess (x1, y1,f=0.01), xlim=c(0, 30432563 ), ylim=c(-0.12, 0.52), ylab="lowess(d)", cex=0.8, xlab="", col="green", xaxt="n", pch=".", cex.axis=0.8, mgp=c(1.5, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n") } if (k ==2 | k==4 ) {plot(lowess(x1, y1, f=0.01), xlim=c(0, 30432563 ), ylim=c(-0.12, 0.52), ylab="", xlab="", col="green", xaxt="n", pch=".", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n") } if (k==5) {plot ( lowess (x1, y1,f=0.01), xlim=c(0, 30432563 ), ylim=c(-0.12, 0.52), ylab="", cex.lab=0.8, xlab="", col="green", pch=".", cex.axis=0.8, mgp=c(3, 0.1, 0) ); legend("topright", c("chr1", "chr2", "chr3", "chr4", "chr5")[k], bty="n"); mtext( "bp", at=15000000, line=-7.4, cex=0.7 ) } points( lowess(x2, y2, f=0.01), col="black", pch="." ) } } dev.off() #################################################################################################################################################### ### plotting of Figures ### Figure3A a <- scan("plotGeno.txt", what="a") a <- sub("%", "", a) a <- as.numeric(a) a <- matrix(a, byrow=T, nr=2) pdf ("enz.geno.pdf") barplot(a, beside=T, ylab="percentage", col= c(heat.colors(18)[10], "green"), names.arg=c("cd", "intron", "utr5", "utr3", "up1k", "down1k", "interg"), space=c(0,1.2), legend.text=c("genotype", "genotype x enzyme")) dev.off() ### Figure3C a <- scan("plotQuantile.txt", what="a", nline=4) a <- sub("%", "", a) a <- matrix( as.numeric(a), nr=10) b <- seq(10, 100, 10) pdf ("enz.quantile.size.pdf") plot ( 0, 0, xlim=c(10, 100), ylim=range(a), pch=".", col="transparent", xlab="genic region %", ylab="CmCGG/CCGG %") for (i in 1:ncol(a)) { points(b, a[,i], col=heat.colors(18) [4+2*i], pch=15) lines(b, a[,i], col=heat.colors(18) [4+2*i], lwd=2) } legend ("topleft", c("<1kb", "1kb-2kb", "2kb-3kb", ">3kb"), col=heat.colors(18)[ seq(6, 12, 2)], pch=15 , bty="n") dev.off() a <- scan("plotQuantile.txt", skip=5, what="a") a <- sub("%", "", a) a <- matrix( as.numeric(a), nr=10) b <- seq(10, 100, 10) pdf ("addenz.quantile.size.pdf") plot ( 0, 0, xlim=c(10, 100), ylim=range(a), pch=".", col="transparent", xlab="genenic region %", ylab="CmCGG polymorphism/CCGG %") for (i in 1:ncol(a)) { points(b, a[,i], col=heat.colors(18) [4+2*i], pch=15) lines(b, a[,i], col=heat.colors(18) [4+2*i], lwd=2) } legend ("topleft", c("<1kb", "1kb-2kb", "2kb-3kb", ">3kb"), col=heat.colors(18)[ seq(6, 12, 2)], pch=15 , bty="n") dev.off() ### Figure3B a1 <- scan("plotGene.txt", nlines=5, what="a") a2 <- scan("plotGene.txt", skip=6, what="a") a <- cbind(a1, a2) a <- apply(a, 2, function(x) as.numeric( sub("%", "", x) ) ) a [1:12,] <- a[12:1,] b <- c(-2500, -1500, seq(-950, -50, 100), 50, seq(180, 1620, 160), 1800, seq(1950, 2850, 100), 3400, 4400) #utr5 100bp, utr3 200bp, gene 1600bp pdf ("Gene.pdf") plot(b, a[,1], xlim=c(-3000, 4900), ylim=c(-0.2, 16), ylab="percentage", xlab="relative position to gene", col=heat.colors(18)[10], type="p", pch=15, xaxt="n", cex.axis=0.8) lines(b,a[,1], col=heat.colors(18)[10],lwd=2) points(b, a[,2], col="green", pch=15) lines(b, a[,2], col="green", lwd=2) lines( c(100,1700), c(0,0), lwd=8, col="blue", lend=2) lines( c(-3000, 0), c(0,0), lwd=2, col="black", lend=2) lines( c(1900, 4900), c(0,0), lwd=2, col="black", lend=2) lines( c(0, 100), c(0,0), lwd=8, col="red", lend=2) lines( c(1700, 1900), c(0,0), lwd=8, col="red", lend=2) legend ("topleft", c("enzyme", "genotype x enzyme"), col=c( heat.colors(18)[10], "green"), bty="n", pch=15) axis( side=1, at= c(-2000, -1000, 100, 900, 1700, 2900, 3900), labels = c("-2kb", "-1kb", "0%", "50%", "100%", "1kb", "2kb"), tick = TRUE, cex.axis=0.8) dev.off() #################################################################################################################################################### #################################################################################################################################################### ### These scripts model (add+dom+mat)*enz+plant(random) effect of Col, Van and reciprocal hybrid lines load ("mDNA.ccgg.RData") add <- rep( c(1,-1,0, 0), each=8) #contrast add, dom and mat so that each is independently tested dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz plant <- as.factor( rep(LETTERS[1:16], each = 2) ) fac.mat <- data.frame(add, dom, mat, enz, addenz=add*enz, domenz=dom*enz, matenz=mat*enz) library(nlme) #get the mDNA.ccgg removed off random effect for (i in 1:nrow(mDNA.ccgg) ) { y <- mDNA.ccgg[i,] fit <- try( lme ( y ~ (add+dom+mat)*enz, random =~1|plant ), TRUE) while( mode(fit)=="character") {y <- jitter(y, amount=0); fit <- lme( y~ (add+dom+mat)*enz, random=~1|plant) } mDNA.ccgg [i, ] <- fit$coef$fixed [-1] %*% t(fac.mat) + fit$coef$fixed[1] +fit$resid[,1] #var.total [i,] <- ( c(residual=1, exp(coef (fit$modelStruct) ) )* fit$sigma )^2 if( i/100 == trunc(i/100) ) cat (i, "\n") } methyl.main <- list() fit <- lsfit(fac.mat, t(mDNA.ccgg) ) methyl.main [[1]] <- fit$coef methyl.main [[2]] <- fit$resid names(methyl.main) <- c("coef", "resid") save(methyl.main, file="methyl.main.RData", compress=T) q("no") #################################################################################################################################################### ### permutation by fit lme, correct random effect, leave one out, permutating the residual and lsfit for full model ### generate permutation matrix samp.matrix <- matrix(NA, nr=1000, nc=32) for(i in 1:1000) samp.matrix[i,] <- sample(32) save(samp.matrix, file="samp.matrix.RData", compress=T) q("no") k <- 7 #1 additive, 2 dominant, 3 maternal, 4 enzyme, 5 add*enz, 6 dom*enz, 7 mat*enz setwd ("final2") load ("mDNA.ccgg.RData") load ("samp.matrix.RData") add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz #plant <- as.factor( rep(LETTERS[1:16], each = 2) ) fac.mat <- data.frame(add, dom, mat, enz, addenz=add*enz, domenz=dom*enz, matenz=mat*enz) #library(nlme) #get the mDNA.ccgg removed off random effect #for (i in 1:nrow(mDNA.ccgg) ) #{ # y <- mDNA.ccgg[i,] # fit <- try( lme ( y ~ (add+dom+mat)*enz, random =~1|plant ), TRUE) # while( mode(fit)=="character") {y <- jitter(y, amount=0); fit <- lme( y~ (add+dom+mat)*enz, random=~1|plant) } # mDNA.ccgg [i, ] <- fit$coef$fixed [-1] %*% t(fac.mat) + fit$coef$fixed[1] +fit$resid[,1] #if( i/100 == trunc(i/100) ) cat (i, "\n") #} load ("corrected.mDNA.ccgg.RData") rfit <- lsfit(fac.mat[, -k], t(mDNA.ccgg) ) predict <- t( as.matrix(fac.mat[,-k]) %*% rfit$coef [-1,] ) + rfit$coef[1,] resid <- t(rfit$resid) nperm <- 1000 matr <- matrix(NA, nr=nrow(mDNA.ccgg), nc=nperm) perm.dstat <- list( coef=matr, std=matr) denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) for (iperm in 1:nperm) { perm.dat <- predict + resid[, samp.matrix[iperm,] ] pfit <- lsfit(fac.mat, t(perm.dat) ) perm.dstat[[1]][,iperm] <- pfit$coef[k+1,] perm.dstat[[2]][,iperm] <- sqrt( colSums ( (pfit$resid)^2) / 24) / denom[k] if (iperm/100==trunc(iperm/100)) cat(iperm, "\n") } save(perm.dstat, file =paste("perm.dstat", k, ".RData", sep=""), compress=T) q("no") #################################################################################################################################################### ### p value based on permutation k <- 4 #1 additive, 2 dominant, 3 maternal, 4 enzyme, 5 add*enz, 6 dom*enz, 7 mat*enz setwd ("final2") load ("methyl.main.RData") load ( paste ("perm.dstat", k, ".RData", sep="") ) load ("mDNA.ccgg.RData") add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) s0 <- quantile( perm.dstat[[2]], 0.05) std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) names(main) <- rownames(mDNA.ccgg) perm.mat <- perm.dstat[[1]]/(perm.dstat[[2]]+s0) vec <- rep(NA, length(main) ) pval <- list(vec, vec) main.sort <- sort(main) for( i in 1:length(main) ) { pval[[1]][i] <- sum( perm.mat <= main.sort[i])/ (1000*length(main) ) if (i/100 == trunc(i/100) ) cat(i, "\n") } names(pval[[1]]) <- names(main.sort) main.sort <- sort(main, decreasing=T) for (i in 1:length(main) ) { pval[[2]][i] <- sum( perm.mat >= main.sort[i])/ (1000*length(main) ) if (i/100 == trunc(i/100) ) cat(i, "\n") } names(pval[[2]]) <- names(main.sort) library(qvalue) qval <- lapply(pval, function(x) qvalue(x, pi0.method="bootstrap")$qvalues ) pi0 <- lapply(pval, function(x) qvalue(x, pi0.method="bootstrap")$pi0 ) pi0 methyl.summary <- list(pval=pval, qval=qval) save(methyl.summary, file=paste("methyl.summary", k, ".RData", sep=""), compress=T) cut <- c(0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1) print( t( sapply(cut, function(x) c( length(methyl.summary[["pval"]][[1]][methyl.summary[["pval"]][[1]]<= x]) , length(methyl.summary[["qval"]][[1]][methyl.summary[["qval"]][[1]]<= x]), length(methyl.summary[["pval"]][[2]][methyl.summary[["pval"]][[2]]<= x]), length(methyl.summary[["qval"]][[2]][methyl.summary[["qval"]][[2]]<= x]) ) ) ) ) q("no") #################################################################################################################################################### ### QQ plot setwd ("final2") load ("methyl.main.RData") add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) bitmap ("review/qqplot.png", width=24, height=12) par (mfrow=c(2, 4), mai=c(1.2, 1, 0.82, 0.42) ) for (k in 1:7) { load ( paste ("perm.dstat", k, ".RData", sep="") ) s0 <- quantile( perm.dstat[[2]], 0.05) std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) main <- sort(main) perm.mat <- perm.dstat[[1]]/(perm.dstat[[2]]+s0) for (i in 1:ncol(perm.mat) ) perm.mat[,i] <- sort(perm.mat[,i]) perm.mat <- rowMeans(perm.mat) plot(perm.mat, main, xlab="null d", ylab="d", pch=16, cex=0.5, col="orange", cex.axis=1.6, cex.lab=2) legend( "top", c("additive", "dominant", "maternal", "enzyme", "add x enz", "dom x enz", "mat x enz")[k], bty="n", cex=2) } dev.off() #################################################################################################################################################### #################################################################################################################################################### ### These scripts analyze the correlation between enzyme effect and absolute gene expression level ### mean for gene expression setwd ("final") load ("attile.nonSFP.exon.RData") load ("mRNA.sc.nq.RData") mRNA <- mRNA.nq[which(rownames(mRNA.nq) %in% rownames(attile.nonSFP.exon) ),] mRNA <- mRNA - rowMeans(mRNA) gene <- names(table( as.character(attile.nonSFP.exon$gene) ) ) mea <- rep(NA, length(gene) ) for (i in 1:length(gene)) {probe <- which( as.character(attile.nonSFP.exon$gene) == gene[i] ) gmean <- mRNA[probe, ] mea [i] <- mean(gmean) if (i/1000 == trunc(i/1000) ) cat(i, "\n") } names(mea) <- gene save(mea, file="../final2/gene.expression.mea.RData") q("no") ### for promoter regions k <- 4 setwd ("final2") load ("methyl.main.RData") load (paste("perm.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart)-24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:2762] #hpa <- names(main.sort)[1:4095] hpa <- names(main.sort)[1:5016] #hpa <- names(main.sort)[1:6876] cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe load ("gene.expression.mea.RData") quant <- seq(0.05, 1, 0.05) quante <- sapply(quant, function(x) quantile(mea, x) ) quante <- cbind( c(0, quante[1: (length(quante)-1)]), quante) quante <- apply(quante, 1, function(x) mea[ which(mea > x[1] & mea <= x[2] ) ] ) setwd ("../ftp") promoter <- read.delim("TAIR7_upstream_500_20070405", header=F) promoter <- as.character( promoter[1:nrow(promoter),]) promoter <- promoter[ grep(">", promoter, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #promoter has no reverse pos mgene <- tgene <- c() for (i in 1:length(gene) ) { mgene <- c(mgene, rep(gene[i], length (which (cg[,1] == chr[i] & cg[,2] >= pos[i,1] & cg[,2] <= pos[i,2] ) ) ) ) tgene <- c(tgene, rep(gene[i], length (which (ccgg[,1] == chr[i] & ccgg[,2] >= pos[i,1] & ccgg[,2] <= pos[i,2] ) ) ) ) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab <- matrix(NA, nr=3, length(quante) ) rownames(tab) <- c("enz", "total", "percentage") colnames(tab) <- quant for (i in 1:length(quante) ) { egene <- names(quante[[i]]) tab [1,i] <- length( which(mgene %in% egene ) ) tab [2,i] <- length( which(tgene %in% egene ) ) } tab[3,] <- tab[1,]/tab[2,] print(tab) for (i in 1:length(quante) ) { egene <- names(quante[[i]]) tab [1,i] <- length( table ( mgene[ which(mgene %in% egene ) ] ) ) tab [2,i] <- length( table ( tgene[ which(tgene %in% egene ) ] ) ) } tab[3,] <- tab[1,]/tab[2,] print(tab) ### for genic regions (cdna) k <- 4 setwd ("final2") load ("methyl.main.RData") load (paste("perm.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart)-24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:2762] #hpa <- names(main.sort)[1:4095] hpa <- names(main.sort)[1:5016] #hpa <- names(main.sort)[1:6876] cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe load ("gene.expression.mea.RData") quant <- seq(0.05, 1, 0.05) quante <- sapply(quant, function(x) quantile(mea, x) ) quante <- cbind( c(0, quante[1: (length(quante)-1)]), quante) quante <- apply(quante, 1, function(x) mea[ which(mea > x[1] & mea <= x[2] ) ] ) setwd ("../ftp") cdna <- read.delim("TAIR7_cdna_20070425", header=F) cdna <- as.character( cdna[1:nrow(cdna), ] ) cdna <- cdna[ grep(">", cdna, fixed=T) ] # 33122 gene <- matrix( unlist(strsplit(cdna, " | ", fixed=T) ), nr=4)[1,] # 33122 coord <- matrix( unlist(strsplit(cdna, " | ", fixed=T) ), nr=4)[4,] # 33122 gene <- gene[ grep("AT[12345]G", gene) ] #32843 coord <- coord [ grep("chr[12345]", coord) ] #32843 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene <- gene[isoform=="1"] # 27838 coord <- coord[isoform=="1"] # 27838 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #cdna has no reverse pos mgene <- tgene <- c() for (i in 1:length(gene) ) { mgene <- c(mgene, rep( gene[i], length( which (cg[,1] == chr[i] & cg[,2] >= pos[i,1] & cg[,2] <= pos[i,2] ) ) ) ) tgene <- c(tgene, rep( gene[i], length( which (ccgg[,1] == chr[i] & ccgg[,2] >= pos[i,1] & ccgg[,2] <= pos[i,2] ) ) ) ) if (i/1000== trunc(i/1000) ) cat(i, "\n") } tab <- matrix(NA, nr=3, length(quante) ) rownames(tab) <- c("enz", "total", "percentage") colnames(tab) <- quant for (i in 1:length(quante) ) { egene <- names(quante[[i]]) tab [1,i] <- length( which(mgene %in% egene ) ) tab [2,i] <- length( which(tgene %in% egene ) ) } tab[3,] <- tab[1,]/tab[2,] print(tab) for (i in 1:length(quante) ) { egene <- names(quante[[i]]) tab [1,i] <- length( table ( mgene[ which(mgene %in% egene ) ] ) ) tab [2,i] <- length( table ( tgene[ which(tgene %in% egene ) ] ) ) } tab[3,] <- tab[1,]/tab[2,] print(tab) ### for intron, exon, utr5, utr3 ### for downstream region k <- 4 setwd ("final2") load ("methyl.main.RData") load (paste("perm.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart)-24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:2762] #hpa <- names(main.sort)[1:4095] hpa <- names(main.sort)[1:5016] #hpa <- names(main.sort)[1:6876] cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe load ("gene.expression.mea.RData") quant <- seq(0.05, 1, 0.05) quante <- sapply(quant, function(x) quantile(mea, x) ) quante <- cbind( c(0, quante[1: (length(quante)-1)]), quante) quante <- apply(quante, 1, function(x) mea[ which(mea > x[1] & mea <= x[2] ) ] ) setwd ("../ftp") downstream <- read.delim("TAIR7_downstream_500_20070405", header=F) downstream <- as.character( downstream[1:nrow(downstream), ] ) downstream <- downstream[ grep(">", downstream, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #downstream has no reverse pos mgene <- tgene <- c() for (i in 1:length(gene) ) { mgene <- c(mgene, rep(gene[i], length (which (cg[,1] == chr[i] & cg[,2] >= pos[i,1] & cg[,2] <= pos[i,2] ) ) ) ) tgene <- c(tgene, rep(gene[i], length (which (ccgg[,1] == chr[i] & ccgg[,2] >= pos[i,1] & ccgg[,2] <= pos[i,2] ) ) ) ) if(i/1000==trunc(i/1000) ) cat(i, "\n") } tab <- matrix(NA, nr=3, length(quante) ) rownames(tab) <- c("enz", "total", "percentage") colnames(tab) <- quant for (i in 1:length(quante) ) { egene <- names(quante[[i]]) tab [1,i] <- length( which(mgene %in% egene ) ) tab [2,i] <- length( which(tgene %in% egene ) ) } tab[3,] <- tab[1,]/tab[2,] print(tab) for (i in 1:length(quante) ) { egene <- names(quante[[i]]) tab [1,i] <- length( table ( mgene[ which(mgene %in% egene ) ] ) ) tab [2,i] <- length( table ( tgene[ which(tgene %in% egene ) ] ) ) } tab[3,] <- tab[1,]/tab[2,] print(tab) ### plotting enzyme effect and absolute gene expression correlation a <- scan ("plotME.txt", what="a", nlines=6) a <- sub("%", "", a) a <- as.numeric(a) a <- matrix(a, nr=20) am <- a for (i in 2:19) a[i,] <- (am[i-1,] +am[i,] + am[i+1,])/3 b <- seq( 5, 100, 5) pdf ("ME.mean.gene.pdf") plot ( 0, 0, xlim=c(5, 100), ylim=range(a), pch=".", col="transparent", xlab="expression level %", ylab="CCmGG genes/CCGG genes %") for (i in 1:ncol(a)) { points(b, a[,i], col=heat.colors(18) [3+2*(i-1)], pch=15) lines(b, a[,i], col=heat.colors(18) [3+2*(i-1)], lwd=2) } legend("topleft", c("CD", "intron", "utr5", "utr3", "promoter 500", "downstream 500"), col=heat.colors(18)[ seq(3, 15, 2) ], pch=15, bty="n" ) dev.off() a <- scan ("plotME.txt", what="a", skip=7) a <- sub("%", "", a) a <- as.numeric(a) a <- matrix(a, nr=20) am <- a for (i in 2:19) a[i,] <- (am[i-1,] +am[i,] + am[i+1,])/3 b <- seq( 5, 100, 5) pdf ("ME.mean.count.pdf") plot ( 0, 0, xlim=c(5, 100), ylim=range(a), pch=".", col="transparent", xlab="expression level %", ylab="CCmGG sites/CCGG sites %") for (i in 1:ncol(a)) { points(b, a[,i], col=heat.colors(18) [3+2*(i-1)], pch=15) lines(b, a[,i], col=heat.colors(18) [3+2*(i-1)], lwd=2) } legend("topleft", c("CD", "intron", "utr5", "utr3", "promoter 500", "downstream 500"), col=heat.colors(18)[ seq(3, 15, 2) ], pch=15, bty="n" ) dev.off() #################################################################################################################################################### #################################################################################################################################################### ### These scripts analyze the correlation between additive*enzyme effect of CCGG methylation and additive effect of gene expression ### for promoter regions k <- 5 setwd ("final2") load ("methyl.main.RData") load (paste("perm.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) -24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) main.sort <- sort(main) #hpa <- names(main.sort)[1:514] #hpa <- names(main.sort)[1:1164] hpa <- names(main.sort)[1:1801] main.sort <- sort(main, decreasing=T) #hpa <- c(hpa, names(main.sort)[1:1268]) #hpa <- c(hpa, names(main.sort)[1:2781]) hpa <- c(hpa, names(main.sort)[1:4170]) cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe setwd ("../final") load ("expr.main.RData") load ("Main.expr.perm1.RData") s0 <- quantile( expr.perm [[2]], 0.5) expr <- expr.main[[1]][,1]/ (expr.main[[1]][,2] + s0) names(expr) <- rownames(expr.main[[1]]) setwd ("../ftp") promoter <- read.delim("TAIR7_upstream_500_20070405", header=F) promoter <- as.character( promoter[1:nrow(promoter),]) promoter <- promoter[ grep(">", promoter, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #promoter has no reverse pos bin <- seq(100, 1000, 100) len <- rep(100, 10) posf <- matrix( sapply( bin, function(x) pos[,2] - x), nc=10) posr <- matrix( sapply( bin, function(x) pos[,1] + x), nc=10) methyl <- expression <- list() for (k in 1:10) { probes <- gene.name <- c() for (i in 1:length(gene) ) { ifelse( strand[i]=="FORWARD", posm <- c(posf[i,k], posf[i,k]+len[k]), posm <- c(posr[i,k]-len[k], posr[i,k]) ) ##for bining probe <- which (cg[,1] == chr[i] & cg[,2] >= posm[1] & cg[,2] <= posm[2] ) if (length(probe)>0 ) {probes <- c(probes, probe); gene.name <- c(gene.name, rep(gene[i], length(probe) ) )} if(i/1000==trunc(i/1000) ) cat(i, "\n") } mainm <- main[ match( rownames(cg) [probes], names(main) ) ] methyl[[k]] <- tapply(mainm, gene.name, function(x) sum(x)/length(x) ) expression[[k]] <- expr[ match(names(methyl[[k]]), names(expr) ) ] } methyl.expr <- list(methyl, expression) save(methyl.expr, file="../final2/review/ME.add005.promoter.RData", compress=T) q("no") ### see genic region (cd) setwd ("../ftp") cd <- read.delim("TAIR7_cds_20070425", header=F) cd <- as.character( cd[1:nrow(cd), ] ) cd <- cd[ grep(">", cd, fixed=T) ] #31921 gene <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[1,] #31921 coord <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[4,] #31921 gene <- gene[ grep("AT[12345]G", gene) ] #31711 coord <- coord [ grep("chr[12345]", coord) ] #31711 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene <- gene[isoform=="1"] #26784 coord <- coord[isoform=="1"] #26784 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #cd has no reverse pos probes <- gene.name <- c() for (i in 1:length(gene) ) { probe <- which (cg[,1]== chr[i] & cg[,2] > pos[i,1] & cg[,2] < pos[i,2] ) if (length(probe)>0 ) {probes <- c(probes, probe); gene.name <- c(gene.name, rep(gene[i], length(probe) ) )} if(i/1000==trunc(i/1000) ) cat(i, "\n") } mainm <- main[ match( rownames(cg) [probes], names(main) ) ] methyl <- tapply(mainm, gene.name, function(x) sum(x)/length(x) ) expression <- expr[ match(names(methyl), names(expr) ) ] methyl.expr <- list(methyl, expression) save(methyl.expr, file="../final2/review/ME.add005.cd.RData", compress=T) q("no") ### downstream setwd ("../ftp") bin <- seq(100, 1000, 100) len <- rep(100, 10) downstream <- read.delim("TAIR7_downstream_500_20070405", header=F) downstream <- as.character( downstream[1:nrow(downstream), ] ) downstream <- downstream[ grep(">", downstream, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(downstream, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #downstream has no reverse pos posf <- matrix( sapply( bin, function(x) pos[,1] + x), nc=10) posr <- matrix( sapply( bin, function(x) pos[,2] - x), nc=10) methyl <- expression <- list() for (k in 1:10) { probes <- gene.name <- c() for (i in 1:length(gene) ) { ifelse( strand[i]=="FORWARD", posm <- c(posf[i,k]-len[k], posf[i,k]), posm <- c(posr[i,k], posr[i,k]+len[k]) ) probe <- which (cg[,1] == chr[i] & cg[,2] >= posm[1] & cg[,2] <= posm[2] ) if (length(probe)>0 ) {probes <- c(probes, probe); gene.name <- c(gene.name, rep(gene[i], length(probe) ) )} if(i/1000==trunc(i/1000) ) cat(i, "\n") } mainm <- main[ match( rownames(cg) [probes], names(main) ) ] methyl[[k]] <- tapply(mainm, gene.name, function(x) sum(x)/length(x) ) expression[[k]] <- expr[ match(names(methyl[[k]]), names(expr) ) ] } methyl.expr <- list(methyl, expression) save(methyl.expr, file="../final2/review/ME.add005.downstream2.RData", compress=T) q("no") ### fitting correlation load ( "ME.add005.promoter.RData") fit <- list() for (k in 1:10) {express <- methyl.expr[[2]][[k]] [methyl.expr[[1]][[k]] >0] methyl <- methyl.expr[[1]][[k]] [methyl.expr[[1]][[k]] >0] fit[[k]] <- summary( lm( express~methyl, data=data.frame( express, methyl ) ) )} fit <- list() for (k in 1:10) {express <- methyl.expr[[2]][[k]] [methyl.expr[[1]][[k]] <0] methyl <- methyl.expr[[1]][[k]] [methyl.expr[[1]][[k]] <0] fit[[k]] <- summary( lm( express~methyl, data=data.frame( express, methyl ) ) )} ### plot fitting line ### for promoter pdf("ME.promoter.pdf", height=8, width=8) { par( mfrow=c(3,3) ) load ("ME.add005.promoter.RData") for (k in 2:10 ) { express <- methyl.expr[[2]][[k]][ methyl.expr[[1]][[k]]>0] methyl <- methyl.expr[[1]][[k]][ methyl.expr[[1]][[k]]>0] methyl <- methyl [express >-5] express <- express[express >-5] fit <- lm( express~methyl, data=data.frame( express, methyl) ) p <- paste("p=", round( summary(fit)$coef[2,4], 5), sep="" ) r <- paste( "r=", round( sqrt( summary(fit)$r.squared), 2), sep="") plot(methyl, express, "p", xlab="differential methylation d scores", ylab="differential expression d scores") abline(fit$coef[1], fit$coef[2]) ifelse(fit$coef[2]>0, text <- paste( "y=", round(fit$coef[1], 2), "+", round(fit$coef[2],2), "x", sep=""), text <- paste( "y=", round(fit$coef[1], 2), round(fit$coef[2],2), "x", sep="") ) legend( "topright", c(text, p, r ), bty="n") legend( "bottomright", paste("-", seq(200, 1000, 100), "bp", sep="") [k-1], bty="n") } } dev.off() ### for downstream pdf("ME.downstream.pdf", height=8, width=8) { par( mfrow=c(3,3) ) load ("ME.add005.downstream.RData") for (k in 2:10 ) { express <- methyl.expr[[2]][[k]][ methyl.expr[[1]][[k]]>0] methyl <- methyl.expr[[1]][[k]][ methyl.expr[[1]][[k]]>0] methyl <- methyl [express >-5] express <- express[express >-5] fit <- lm( express~methyl, data=data.frame( express, methyl) ) p <- paste("p=", round( summary(fit)$coef[2,4], 5), sep="" ) r <- paste( "r=", round( sqrt( summary(fit)$r.squared), 2), sep="") plot(methyl, express, "p", xlab="differential methylation d scores", ylab="differential expression d scores") abline(fit$coef[1], fit$coef[2]) ifelse(fit$coef[2]>0, text <- paste( "y=", round(fit$coef[1], 2), "+", round(fit$coef[2],2), "x", sep=""), text <- paste( "y=", round(fit$coef[1], 2), round(fit$coef[2],2), "x", sep="") ) legend( "topright", c(text, p, r ), bty="n") legend( "bottomright", paste("+", seq(200, 1000, 100), "bp",sep="") [k-1], bty="n") } } dev.off() ### for cd pdf("ME.cd.pdf") { load ("ME.add005.cd.RData") express <- methyl.expr[[2]][ methyl.expr[[1]]>1] methyl <- methyl.expr[[1]][ methyl.expr[[1]]>1] methyl <- methyl [express >-5] express <- express[express >-5] fit <- lm( express~methyl, data=data.frame( express, methyl) ) p <- paste("p=", round( summary(fit)$coef[2,4], 5), sep="" ) r <- paste( "r=", round( sqrt( summary(fit)$r.squared), 2), sep="") plot(methyl, express, "p", xlab="differential methylation d scores", ylab="differential expression d scores") abline(fit$coef[1], fit$coef[2]) ifelse(fit$coef[2]>0, text <- paste( "y=", round(fit$coef[1], 2), "+", round(fit$coef[2],2), "x", sep=""), text <- paste( "y=", round(fit$coef[1], 2), round(fit$coef[2],2), "x", sep="") ) legend( "topright", c(text, p, r ), bty="n") } dev.off() ### for promoter-cd-downstream pdf("ME.pdc.pdf", height=3, width=9) { par( mfrow=c(1,3) ) load ("ME.add005.promoter.RData") express <- methyl.expr[[2]][[1]][ methyl.expr[[1]][[1]]>0] methyl <- methyl.expr[[1]][[1]][ methyl.expr[[1]][[1]]>0] fit <- lm( express~methyl, data=data.frame( express, methyl) ) p <- paste("p=", signif( summary(fit)$coef[2,4], 2), sep="" ) r <- paste( "r=", signif( sqrt( summary(fit)$r.squared), 2), sep="") par( mai=c(0.6, 0.6, 0.25, 0.2) ) plot(methyl, express, "p", xlab="", ylab="differential expression d score") abline(fit$coef[1], fit$coef[2]) ifelse(fit$coef[2]>0, text <- paste( "y=", round(fit$coef[1], 2), "+", round(fit$coef[2],2), "x", sep=""), text <- paste( "y=", round(fit$coef[1], 2), round(fit$coef[2],2), "x", sep="") ) legend( "topright", c(text, p, r ), bty="n") legend( "bottomright", "upstream 100bp", bty="n") load ("ME.add005.cd.RData") express <- methyl.expr[[2]][ methyl.expr[[1]]>1] methyl <- methyl.expr[[1]][ methyl.expr[[1]]>1] methyl <- methyl [express >-5] express <- express[express >-5] fit <- lm( express~methyl, data=data.frame( express, methyl) ) p <- paste("p=", signif( summary(fit)$coef[2,4], 2), sep="" ) r <- paste( "r=", signif( sqrt( summary(fit)$r.squared), 2), sep="") plot(methyl, express, "p", xlab="differential methylation d scores", ylab="") abline(fit$coef[1], fit$coef[2]) ifelse(fit$coef[2]>0, text <- paste( "y=", round(fit$coef[1], 2), "+", round(fit$coef[2],2), "x", sep=""), text <- paste( "y=", round(fit$coef[1], 2), round(fit$coef[2],2), "x", sep="") ) legend( "topright", c(text, p, r ), bty="n") legend( "bottomright", "genic region", bty="n") load ("ME.add005.downstream2.RData") express <- methyl.expr[[2]][[1]][ methyl.expr[[1]][[1]]>0] methyl <- methyl.expr[[1]][[1]][ methyl.expr[[1]][[1]]>0] fit <- lm( express~methyl, data=data.frame( express, methyl) ) p <- paste("p=", signif( summary(fit)$coef[2,4], 2), sep="" ) r <- paste( "r=", signif( sqrt( summary(fit)$r.squared), 2), sep="") p <- "p=0.0005" plot(methyl, express, "p", xlab="", ylab="") abline(fit$coef[1], fit$coef[2]) ifelse(fit$coef[2]>0, text <- paste( "y=", round(fit$coef[1], 2), "+", round(fit$coef[2],2), "x", sep=""), text <- paste( "y=", round(fit$coef[1], 2), round(fit$coef[2],2), "x", sep="") ) legend( "topright", c(text, p, r ), bty="n") legend( "bottomright", "downstream 100bp", bty="n") } dev.off() #################################################################################################################################################### #################################################################################################################################################### ### These scripts do parametric analysis of gene set enrichment k <- 5 setwd ("final2") load ("methyl.main.RData") load ( paste("perm.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart) ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. s0 <- quantile( perm.dstat[[2]], 0.05) std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) setwd ("../ftp") cd <- read.delim("TAIR7_cds_20070425", header=F) cd <- as.character( cd[1:nrow(cd), ] ) cd <- cd[ grep(">", cd, fixed=T) ] #31921 gene <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[1,] #31921 coord <- matrix( unlist( strsplit(cd, " | ", fixed=T) ), nr=4)[4,] #31921 gene <- gene[ grep("AT[12345]G", gene) ] #31711 coord <- coord [ grep("chr[12345]", coord) ] #31711 genem <- sub( ">", "", gene, fixed=T) gene <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[1,] isoform <- matrix( unlist(strsplit(genem, ".", fixed=T) ), nr=2)[2,] gene <- gene[isoform=="1"] #26784 coord <- coord[isoform=="1"] #26784 strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #cd has no reverse pos setwd("../final2") probes <- gene.name <- c() for (i in 1:nrow(pos) ) { probe <- which(ccgg[,1] == chr[i] & ccgg[,2] >= pos[i,1] & ccgg[,2] <= pos[i,2] ) if( length(probe) >0 ) { probes <- c(probes, probe); gene.name <- c(gene.name, rep(gene[i], length(probe) ) ) } if(i/1000==trunc(i/1000) ) cat(i, "\n") } main <- main[probes] main <- tapply(main, gene.name, function(x) sum(x)/length(x) ) setwd ("../ftp") go <- c("proc", "func", "comp") for (j in 1:length(go) ) { load ( paste("go.", go[j], ".RData", sep="") ) gene.list <- intersect( names(main), rownames(go.mat) ) go.mat <- go.mat[ which(rownames(go.mat) %in% gene.list), ] go.mat <- go.mat[, colSums(go.mat) >= 15] go.mat <- go.mat[rowSums(go.mat) >0,] gene.list <- intersect(names(main), rownames(go.mat) ) main <- main[ match(gene.list, names(main) ) ] go.mat <- go.mat[ match(gene.list, rownames(go.mat) ), ] vec <- rep(NA, ncol(go.mat) ) names(vec) <- colnames(go.mat) z <- p <- vec mu <- mean(main ) std <- sqrt( var(main) ) for (i in 1:ncol(go.mat) ) { hit <- rownames(go.mat)[ go.mat[,i]==1 ] d <- main [ which(names(main) %in% hit ) ] sm <- mean (d) n <- length(hit) z[i] <- (sm-mu)/ (std /sqrt(n) ) ifelse( z[i] <=0, p[i] <- -pnorm(z[i] ), p[i] <- pnorm(z[i], lower.tail=F) ) } page <- cbind(z, p) save(page, file=paste("page.methyl.", go[j], k, "cd.RData", sep=""), compress=T ) } q("no") ### for promoter setwd ("../ftp") promoter <- read.delim("TAIR7_upstream_500_20070405", header=F) promoter <- as.character( promoter[1:nrow(promoter),]) promoter <- promoter[ grep(">", promoter, fixed=T) ] #32041 gene <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[1,] coord <- matrix( unlist( strsplit(promoter, " | ", fixed=T) ), nr=2)[2,] gene <- gene[ grep("AT[12345]G", gene) ] #31762 coord <- coord [ grep("chr[12345]", coord) ] #31762 gene <- sub( ">", "", gene, fixed=T) strand <- matrix( unlist( strsplit(coord, " ") ), nr=2) [2,] coord <- matrix( unlist( strsplit(coord, " ") ), nr=2) [1,] chr <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [1,] chr <- as.numeric( sub("chr", "", chr) ) pos <- matrix( unlist( strsplit(coord, ":", fixed=T) ), nr=2) [2,] pos <- matrix( as.numeric( unlist( strsplit( pos, "-", fixed=T) ) ), byrow=T, nc=2) #promoter has no reverse pos ### match GO ID to annotation setwd ("final2/go") load ("find.RData") go <- c("proc", "func", "comp") for (k in 4:7) { for( j in 1:3) { load ( paste("page.methyl.", go[j], k, ".pro.RData", sep="") ) page <- page [ order( abs( page[,2] ) ), ] term <- find[ match( rownames(page), names(find) ) ] print <- data.frame( page[,2], term) write.csv ( rbind( print[ print[,1]>0 & print[,1] < 0.1, ], print[ print[,1]<0 & print[,1]>-0.1, ]), "page.methyl.pro.csv", quote=F, append=T) } } #################################################################################################################################################### #################################################################################################################################################### ### These scripts do the comparison between our enzyme methylome data with ChIP-chip data #################################################################################################################################################### ### Joe's data setwd ("final2/comparison") block <- vector("list", length=5) for (chrom in 1:5) { post <- matrix(scan( paste("chr", chrom, ".txt", sep=""), what=1), byrow=T, nc=9) post <- post[, c(1, 6) ] post[,2] <- post[,2]/10000 L <- nrow(post) seg <- post[,2] > 0.5 bound <- diff(seg) meth <- c() for(i in 1:(L - 1)) { if(bound [i] == 1) meth <- c(meth, i+1) if(bound [i] == -1) meth <- c(meth, i) } if( meth[1]== -1) meth <- c(1, meth) if ( meth[ length(meth) ] == 1) meth <- c(meth, L) seg <- matrix(meth, ncol=2, byrow=T) #converting the probe indices into basepairs seg <- cbind( post[,1][seg[,1]], post[,1][seg[,2]] ) #minimum run 50bp seg <- seg [ which(seg [,2]-seg[,1]+1 >=50), ] block[[chrom]] <- seg } save (block, file="joe.block.gap.RData", compress=T) ### setwd ("final2") k <- 4 #k <- 5 load ("methyl.main.RData") load (paste("perm.dstat", k, ".RData", sep="") ) load ("attile.ccgg.RData") load ("mDNA.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart)-24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) rownames(ccgg) <- rownames(attile.ccgg) add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(mDNA.ccgg) main.sort <- sort(main, decreasing=T) #hpa <- names(main.sort)[1:2762] #hpa <- names(main.sort)[1:4095] #hpa <- names(main.sort)[1:5016] hpa <- names(main.sort)[1:6876] #main.sort <- sort(main) #hpa <- names(main.sort)[1:514] #hpa <- names(main.sort)[1:1164] #hpa <- names(main.sort)[1:1801] #main.sort <- sort(main, decreasing=T) #hpa <- c(hpa, names(main.sort)[1:1268]) #hpa <- c(hpa, names(main.sort)[1:2781]) #hpa <- c(hpa, names(main.sort)[1:4170]) cg <- ccgg[ which ( rownames(ccgg) %in% hpa ), ] #pos for the hpa high probe save(cg, ccgg, file="comparison/enz.RData", compress=T) #save(cg, ccgg, file="comparison/addenz.RData", compress=T) ### setwd ("final2/comparison") load ("joe.block.gap.RData") load ("enz.RData") enz <- vector("list", length=5) for (chrom in 1:5) { cg.chr <- cg[ cg[,1]== chrom, ] b.chr <- block[[chrom]] #the block will be from bpstart of left boundary to bpend of right boundary b.chr[,2] <- b.chr[,2]+24 enz.chr <- rep(NA, nrow(b.chr) ) for (i in 1:nrow(b.chr) ) enz.chr [i] <- length( which( cg.chr[,2] >= b.chr[i,1] & cg.chr[,2] <= b.chr[i,2] ) ) enz[[chrom]] <- cbind( (b.chr[,1]+b.chr[,2])/2, enz.chr) } load ("addenz.RData") addenz <- vector("list", length=5) for (chrom in 1:5) { cg.chr <- cg[ cg[,1]== chrom, ] b.chr <- block[[chrom]] #the block will be from bpstart of left boundary to bpend of right boundary b.chr[,2] <- b.chr[,2]+24 addenz.chr <- rep(NA, nrow(b.chr) ) for (i in 1:nrow(b.chr) ) addenz.chr [i] <- length( which( cg.chr[,2] >= b.chr[i,1] & cg.chr[,2] <= b.chr[i,2] ) ) addenz[[chrom]] <- cbind( (b.chr[,1]+b.chr[,2])/2, addenz.chr) } count <- vector("list", length=5) for (chrom in 1:5) { ccgg.chr <- ccgg[ ccgg[,1]== chrom, ] b.chr <- block[[chrom]] #the block will be from bpstart of left boundary to bpend of right boundary b.chr[,2] <- b.chr[,2]+24 count.chr <- rep(NA, nrow(b.chr) ) for (i in 1:nrow(b.chr) ) count.chr [i] <- length( which( ccgg.chr[,2] >= b.chr[i,1] & ccgg.chr[,2] <= b.chr[i,2] ) ) count [[chrom]] <- cbind( (b.chr[,1]+b.chr[,2])/2, count.chr) } lapply(enz, function(x) table(x[,2]) ) b <- lapply(enz, function(x) sum( x[,2][x[,2] !=0] ) ) b <- sum (unlist(b)) lapply(count, function(x) table(x[,2]) ) a <- lapply(count, function(x) sum( x[,2][x[,2] !=0] ) ) a <- sum (unlist(a)) b; a; b/a c <- lapply(enz, function(x) nrow( x[which(x[,2]!=0),]) ) sum(unlist(c)) ### see distribution difference of enzyme effect within sizes of hmm segments ctdis <- matrix( rep(numeric(), 5), nc=5 ) for (chrom in 1:5) {enz.ct <- enz[[chrom]][,2] addenz.ct <- addenz[[chrom]][,2] count.ct <- count[[chrom]][,2] b.chr <- block[[chrom]] ct <- cbind(b.chr, enz.ct, addenz.ct, count.ct) ct <- ct[ which(ct[,5] !=0), ] ctdis <- rbind(ctdis, ct) } enz <- (ctdis[,2]-ctdis[,1]+1)[ ctdis[,3]!=0 ] nenz <- (ctdis[,2]-ctdis[,1]+1)[ ctdis[,3]==0 ] addenz <- (ctdis[,2]-ctdis[,1]+1)[ ctdis[,4]!=0 ] naddenz <- (ctdis[,2]-ctdis[,1]+1)[ ctdis[,4]==0 ] range1 <- range( c(enz, nenz) ) range2 <- range( c(addenz, naddenz) ) pdf ("hmm.dis.pdf") par( mfrow=c(2,2) ) hist(enz, breaks=30, xlim=range1, ylim=c(0, 400), xlab="ChIP-chip segment size", ylab="# segments", main="" ) legend("topright", "constitutive", bty="n") hist(nenz, breaks=30, xlim=range1, ylim=c(0, 400), xlab="ChIP-chip segment size", ylab="# segments", main="" ) legend("topright", "non-constitutive", bty="n") hist(addenz, breaks=30, xlim=range2, ylim=c(0, 600), xlab="ChIP-chip segment size", ylab="# segments", main="" ) legend("topright", "polymorphic", bty="n") hist(naddenz, breaks=30, xlim=range2, ylim=c(0, 600), xlab="ChIP-chip segment size", ylab="# segments", main="" ) legend("topright", "non-polymorphic", bty="n") dev.off() plot( jitter( rep(1, length(enz)), factor=2), enz, xlim=c(0, 3), ylim=range, pch=".", cex=0.2) points (jitter( rep(2, length(nenz)), factor=2), nenz, pch=".", cex=0.2) #################################################################################################################################################### ### Henikoff's data setwd ("final2/comparison") a <- scan("GSM138296_wiggle.txt", what="a") a <- matrix( a[- c(1:14)], byrow=T, nc=4) a[,1] <- sub("chr","", a[,1]) b <- scan("GSM138297_wiggle.txt", what="a") b <- matrix( b[- c(1:14)], byrow=T, nc=4) ng <- cbind( as.numeric(a[,1]), as.numeric(a[,2]), as.numeric(a[,3]), (as.numeric(a[,4]) + as.numeric(b[,4]) )/2 ) rm(a, b); gc() ng <- ng[ which(ng[,4]>=1.28),] ### setwd ("comparison") load ("ng.RData") load ("enz.RData") enz.count <- vector("list", length=5) for (chrom in 1:5) {ng.chr <- ng[ which(ng[,1]==chrom),] cg.chr <- cg[ which(cg[,1]==chrom),] enz.count.chr <- rep(NA, nrow(ng.chr) ) for (i in 1:nrow(ng.chr) ) enz.count.chr [i] <- length( which( cg.chr[,2] >= ng.chr[i,2] & cg.chr[,2] <= ng.chr[i,3] ) ) enz.count[[chrom]] <- enz.count.chr } a <- lapply(enz.count, function(x) sum(x) ); a <- sum(unlist(a)) b <- nrow(cg) c <- lapply(enz.count, function(x) length (which(x!=0)) );c <- sum( unlist(c)) load ("addenz.RData") addenz.count <- vector("list", length=5) for (chrom in 1:5) {ng.chr <- ng[ which(ng[,1]==chrom),] cg.chr <- cg[ which(cg[,1]==chrom),] addenz.count.chr <- rep(NA, nrow(ng.chr) ) for (i in 1:nrow(ng.chr) ) addenz.count.chr [i] <- length( which( cg.chr[,2] >= ng.chr[i,2] & cg.chr[,2] <= ng.chr[i,3] ) ) addenz.count[[chrom]] <- addenz.count.chr } c <- lapply(addenz.count, function(x) sum(x) ) c <- sum(unlist(c)) d <- nrow(cg) load ("../attile.ccgg.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart)-24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) ccgg.count <- vector("list", length=5) for (chrom in 1:5) {ng.chr <- ng[ which(ng[,1]==chrom),] ccgg.chr <- ccgg[ which(ccgg[,1]==chrom),] ccgg.count.chr <- rep(NA, nrow(ng.chr) ) for (i in 1:nrow(ng.chr) ) ccgg.count.chr [i] <- length( which( ccgg.chr[,2] >= ng.chr[i,2] & ccgg.chr[,2] <= ng.chr[i,3] ) ) ccgg.count[[chrom]] <- ccgg.count.chr } e <- lapply(ccgg.count, function(x) sum(x) ) e <- sum(unlist(e)) #################################################################################################################################################### ### the effect of relative CCGG position within probes setwd ("final2") load ("attile.ccgg.RData") load ("methyl.main.RData") add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. k <- 4 load (paste("perm.dstat", k, ".RData", sep="") ) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main.enz <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main.enz) <- rownames(attile.ccgg) k <- 5 load (paste("perm.dstat", k, ".RData", sep="") ) s0 <- quantile( perm.dstat[[2]], 0.05) rm(perm.dstat) gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main.addenz <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main.addenz) <- rownames(attile.ccgg) seq <- as.character(attile.ccgg$seq) multi <- gregexpr("ccgg", seq) ccgg <- regexpr("ccgg", seq) multi <- unlist( lapply(multi, function(x)length(x)) ) which <- which( multi == 1) ccgg <- ccgg[ which ] ccgg <- ccgg + 1 enz.pos <- cbind( ccgg, main.enz[which] ) addenz.pos <- cbind( ccgg, main.addenz[which] ) pdf ("comparison/ccgg.position.pdf", width=9, height=3) par(mfrow=c(1,3) ) hist( ccgg, breaks=25, main="", xlab="position of cCgg within feature", ylab="# features") plot( jitter(enz.pos[,1], factor=2), enz.pos[,2], pch=".", cex=0.2, xlab="position of cCgg within feature", ylab="constitutive methylation d scores") plot( jitter(addenz.pos[,1], factor=2), addenz.pos[,2], pch=".", cex=0.2, xlab="position of cCgg within feature", ylab="polymorphic methylation d scores") dev.off() #multi # 1 2 3 #53509 1002 8 #the enz effect and addenz effect overlap 1044/54519 features. #################################################################################################################################################### ### distribution of CCGG probes within genome setwd ("final2") load ("attile.ccgg.RData") load ("attile1V7anno.RData") seq <- as.character(attile.ccgg$seq) bp <- as.numeric( attile.ccgg$bpstart)-24 ccgg <- regexpr("ccgg", seq) ccgg <- bp+ccgg #so the second c is what we test ccgg <- cbind( as.numeric(attile.ccgg$chr), ccgg) mid <- as.numeric( attile1$bpstart) -24 + 12 mid <- cbind( as.numeric(attile1$chr), mid) setwd ("comparison") chr.len <- c(30432563, 19705359, 23470805, 18585042, 26992728) size <- 100000 pdf ("CCGG.dist.pdf", width=9, height=6) par(mfrow=c(2,3) ) for (chrom in 1:5) { a <- ccgg[,2][ccgg[,1]==chrom] b <- mid[,2][ mid[,1]==chrom] page <- trunc( chr.len[chrom] /size ) counta <- countb <- rep(NA, page) for (i in 1:page ) { counta[i] <- length( which ( a > (i-1)*size & a <= i*size ) ) countb[i] <- length( which ( b > (i-1)*size & b <= i*size ) )} plot( seq(50000, page*size, 100000), (counta/countb)*100, "l", xlim=c(0, chr.len[1]), ylim=c(0, 12), xlab="bp", ylab="percent CCGG site" ) legend("bottomright", paste("chr", chrom, sep=""), bty="n") } dev.off() #################################################################################################################################################### ### effect of fragment size setwd ("ftp") fr <- vector("list", length=5) for (chrom in 1:5) {a <- scan ( paste("ATH1_chr", chrom, ".1con.01222004", sep=""), what="a") a <- a[-1] a <- paste(a, collapse="") fr[[chrom]] <- gregexpr("CCGG", a) } save(fr, file="../final2/comparison/fragment.RData", compress=T) q("no") ### setwd ("final2/comparison") load ("fragment.RData") load ("../attile1V7anno.RData") load ("../mDNA.sc.RData") library(affy) mDNA.nq <- normalize.quantiles(mprobe.mean) colm <- rowMeans( mDNA.nq[, seq(2,8,2)] ) colh <- rowMeans( mDNA.nq[, seq(1,8,2)] ) ; rm(mDNA.nq); gc() chr <- as.numeric(attile1$chr) bpend <- as.numeric(attile1$bpstart) bpstart <- bpend-24 chr.len <- c(30432563, 19705359, 23470805, 18585042, 26992728) fr.len <- pr.mean.m <- pr.mean.hm <- c() for (chrom in 1:5) {bps <- bpstart[chr==chrom] bpe <- bpend[chr==chrom] com <- colm[chr==chrom] coh <- colh[chr==chrom] pos <- unlist(fr[[chrom]]) pos <- pos+1 #so the second cytosine pos <- cbind(pos[-length(pos)], pos[-1]) len <- medm <- medhm <- rep(NA, nrow(pos) ) for (i in 1:nrow(pos) ) { len [i] <- pos[i,2]-pos[i,1]+1 m <- which( bps> pos[i,1] & bpe < pos[i,2]) medm [i] <- mean( com[m] ) medhm[i] <- mean( coh[m]-com[m] ) if( i/1000==trunc(i/1000) ) cat(i, "\n") } fr.len <- c(fr.len, len) pr.mean.m <- c(pr.mean.m, medm) pr.mean.hm <- c(pr.mean.hm, medhm) } save(fr.len, pr.mean.m, pr.mean.hm, file="plot.fragment.RData", compress=T) load ("plot.fragment.RData") pdf ("fragment.pdf", width=12, height=6) par ( mfrow=c(1, 2) ) nas <- which( ( is.na(pr.mean.m) | is.na(pr.mean.hm) ) ) fr.len <- fr.len[-nas] pr.mean.m <- pr.mean.m[-nas] pr.mean.hm <- pr.mean.hm[-nas] quant <- quantile(fr.len, seq(0.05, 1, 0.05) ) cut <- cut(fr.len, breaks=c(0, quant), labels=seq(5,100,5) ) cut <- as.numeric( as.character(cut) ) plot ( jitter(cut, factor=2), pr.mean.m, pch=".", cex=0.2, xlab="quantiles of fragment length", ylab="mean probe intensity" ) plot ( jitter(cut, factor=2), pr.mean.hm, pch=".", cex=0.2, xlab="quantiles of fragment length", ylab="mean probe intensity difference" ) dev.off() ### setwd ("final2/comparison") load ("fragment.RData") load ("../attile.ccgg.RData") chr <- as.numeric(attile.ccgg$chr) bp <- as.numeric( attile.ccgg$bpstart) - 24 seq <- as.character(attile.ccgg$seq) ccgg <- regexpr("ccgg", seq) bp <- bp+ccgg #so the second c is what we test ind <- c(1:length(bp) ) len <- all.ind <- c() for (chrom in 1:5) { pos <- unlist(fr[[chrom]]) pos <- pos+1 #so the second cytosine bp.chr <- bp[chr==chrom] ind.chr <- ind[ chr==chrom] if(chrom==4) {bp.chr <- bp.chr[ - c(1, length(bp.chr) ) ]; ind.chr <- ind.chr[-c(1, length(ind.chr)) ]} all.ind <- c(all.ind, ind.chr) index <- which(pos %in% bp.chr) len <- c(len, pos[index+1]-pos[index-1] + 1) } add <- rep( c(1,-1,0, 0), each=8) dom <- rep( c(0, 0, 1, 1), each=8) mat <- rep( c(0, 0, -1, 1), each=8) enz <- rep( c(1,-1), 16) addenz <- add*enz domenz <- dom*enz matenz <- mat*enz denom <- c( sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (add-mean(add) )^2) ), sqrt( sum( (dom-mean(dom) )^2) ), sqrt( sum( (mat-mean(mat) )^2) ) ) #denom for intercept and 7 coefs. load ("../methyl.main.RData") pdf ("frag.dscore.pdf", width=8, height=4) par(mfrow=c(1,2)) k <- 4 load (paste("../perm.dstat", k, ".RData", sep="") ) s0 <- quantile( perm.dstat[[2]], 0.05); rm(perm.dstat); gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) main <- main[all.ind] quant <- quantile(len, seq(0.05, 1, 0.05) ) cut <- cut(len, breaks=c(0, quant), labels=seq(5,100,5) ) cut <- as.numeric( as.character(cut) ) plot ( jitter(cut, factor=2), main, pch=".", cex=0.2, xlab="quantiles of fragment length", ylab="constitutive methylation d scores" ) k <- 5 load (paste("../perm.dstat", k, ".RData", sep="") ) s0 <- quantile( perm.dstat[[2]], 0.05); rm(perm.dstat); gc() std <- sqrt( colSums( methyl.main[[2]]^2)/24 ) / denom[k+1] main <- methyl.main[[1]][k+1,] / (std + s0) #dstatistic of probes names(main) <- rownames(attile.ccgg) main <- main[all.ind] quant <- quantile(len, seq(0.05, 1, 0.05) ) cut <- cut(len, breaks=c(0, quant), labels=seq(5,100,5) ) cut <- as.numeric( as.character(cut) ) plot ( jitter(cut, factor=2), main, pch=".", cex=0.2, xlab="quantiles of fragment length", ylab="polymorphic methylation d scores" ) dev.off() #################################################################################################################################################### ####################################################################################################################################################