# Import SeqMonk data.
# Reads were mapped to the SK1 genome with non-unique reads allowed (first file)
# and to SGD1.01 with unique reads only (second file).
# Note that read counts are provided as log2 values.
data.full <- read.delim("100bp trf4 vs wt log quant 12-6-14.txt",
                        header = TRUE, sep = "\t",
                        colClasses = c("character", "numeric", "numeric",
                                       "character", "character",
                                       "numeric", "numeric"))
data.unique <- read.delim("100bp trf4 vs wt log quant 12-6-14 unique.txt",
                          header = TRUE, sep = "\t",
                          colClasses = c("character", "numeric", "numeric",
                                         "character", "character",
                                         "numeric", "numeric"))

# Sort out the chromosome definitions: roman-numeral names become class "roman",
# which compares transparently against integers (e.g. chr != 12 below).
data.full$chr <- as.roman(data.full$Chromosome)

# Combine the two datasets into a single usable data frame
data.full$wt.unique <- data.unique$wt
data.full$trf4.unique <- data.unique$trf4

# Filter out the rDNA on chromosome XII.
# rDNA 37-1 is 458432-451575, 37-2 is 467569-460712.
data <- data.full[data.full$chr != 12 |
                    (data.full$Start < 451500 |
                       (data.full$Start >= 458500 & data.full$Start < 460700) |
                       data.full$Start >= 467600), ]

# Normalise the wt signals to have the same total counts as trf4, keeping a
# copy of the originals. Counts are log2, so de-log, scale, and re-log.
mnorm <- sum(2^data$wt) / sum(2^data$trf4)
data$wt.original <- data$wt
data$wt <- log(2^data$wt.original / mnorm, base = 2)

unorm <- sum(2^data$wt.unique) / sum(2^data$trf4.unique)
data$wt.unique.original <- data$wt.unique
# BUG FIX: the original divided by mnorm here, silently reusing the
# non-unique normalisation factor; the unique-read counts must be scaled
# by their own factor, unorm.
data$wt.unique <- log(2^data$wt.unique.original / unorm, base = 2)

#############################################################################
# Include these lines to run the analysis on the unique mapping reads only
data$wt <- data$wt.unique
data$trf4 <- data$trf4.unique
# Tag appended to the output filename (renamed from `unique`, which
# shadowed base::unique).
unique.tag <- " unique"
#############################################################################

# Calculate the trf4-to-wt difference.
# Subtract rather than divide because the values are log2.
data$dif <- data$trf4 - data$wt

# This cut-off is applied to the log2 difference
CUT.cutoff <- 2

# Define a set of regions (cuts_a) containing only 100bp segments in which
# the trf4 read count exceeds the wt read count by the cut-off.
cuts_a <- data[data$dif > CUT.cutoff & !is.na(data$chr), ]

# Print out the number of segments
nrow(data)
nrow(cuts_a)

####################################################
# Find the real start and end of each CUT by merging adjacent segments.
# Segments whose starts lie within test.range bp of each other are treated
# as parts of a single feature (the comment in the original claimed 200bp /
# 2 segments, but the value actually used was 300bp — kept as 300 here).
test.range <- 300

# BEWARE: scope-breaking assignment (<<-) carries running state through
# sapply; `a` holds the real start of the CUT currently being extended.
# NOTE(review): the merge tests coordinates only and does not check that
# consecutive segments share a chromosome — confirm chromosome boundaries
# can never produce coordinates within test.range of each other.
a <- cuts_a$Start[1]
cuts_a$real_start[1] <- a
cuts_a$real_start[2:nrow(cuts_a)] <- sapply(2:nrow(cuts_a), function(x) {
  if (cuts_a$Start[x] <= cuts_a$Start[x - 1] + test.range) {
    a                                 # adjacent: inherit the running start
  } else {
    a <<- cuts_a$Start[x]             # gap: this segment opens a new CUT
    cuts_a$Start[x]
  }
})

# Find the real ends the same way, scanning backwards from the last segment
a <- cuts_a$End[nrow(cuts_a)]
cuts_a$real_end[nrow(cuts_a)] <- a
cuts_a$real_end[seq(nrow(cuts_a) - 1, 1, by = -1)] <-
  sapply(seq(nrow(cuts_a) - 1, 1, by = -1), function(x) {
    if (cuts_a$End[x] >= cuts_a$End[x + 1] - test.range) {
      a
    } else {
      a <<- cuts_a$End[x]
      cuts_a$End[x]
    }
  })

# Average the read counts across each CUT.
# `a` counts segments, `b` accumulates the sum (again via <<-).
# The mean is returned on the final segment of each CUT; all other rows
# get -1 as a sentinel.
cuts_a$wt_mean <- sapply(seq_len(nrow(cuts_a)), function(x) {
  if (cuts_a$Start[x] == cuts_a$real_start[x]) {
    a <<- 1
    b <<- cuts_a$wt[x]
  } else {
    a <<- a + 1
    b <<- b + cuts_a$wt[x]
  }
  if (cuts_a$End[x] == cuts_a$real_end[x]) b / a else -1
})
cuts_a$trf4_mean <- sapply(seq_len(nrow(cuts_a)), function(x) {
  if (cuts_a$Start[x] == cuts_a$real_start[x]) {
    a <<- 1
    b <<- cuts_a$trf4[x]
  } else {
    a <<- a + 1
    b <<- b + cuts_a$trf4[x]
  }
  if (cuts_a$End[x] == cuts_a$real_end[x]) b / a else -1
})

# Filter down to cuts_b: one row per CUT (the row carrying the means)
cuts_b <- cuts_a[cuts_a$End == cuts_a$real_end, ]

# Chuck out columns that are no longer meaningful
# (the original also assigned cuts_b$dif and immediately NULLed it —
# that dead assignment is dropped here).
cuts_b$Probe <- NULL
cuts_b$Chromosome <- NULL
cuts_b$Start <- NULL
cuts_b$End <- NULL
cuts_b$wt <- NULL
cuts_b$trf4 <- NULL
cuts_b$dif <- NULL

# Add columns that are meaningful for whole CUTs
cuts_b$size <- cuts_b$real_end - cuts_b$real_start + 1
cuts_b$difference <- cuts_b$trf4_mean - cuts_b$wt_mean

############################################
# Each CUT now has a single row in cuts_b; print some summary data
"cut difference cut off"
2^CUT.cutoff
"cuts were merged over a range of"
test.range
"giving this number of cuts"
nrow(cuts_b)

# Keep only CUTs of 3 or more consecutive 100bp segments (the size cut-off)
min.size <- 300
cuts_c <- cuts_b[cuts_b$size >= min.size, ]
"apply size cut off"
min.size
"giving this number of cuts"
nrow(cuts_c)

# Discard CUTs whose trf4 signal is below the noise floor (log2 counts)
noise.cutoff <- 6
cuts_d <- cuts_c[cuts_c$trf4_mean > noise.cutoff, ]
"noise cut off of minimum counts in trf4"
noise.cutoff
"leaves"
nrow(cuts_d)

# Finally write out the list of CUTs
fname <- paste0("meiotic cuts mindiff=", 2^CUT.cutoff,
                " minsize=", min.size,
                " noise=", noise.cutoff,
                unique.tag, ".txt")
write.table(cuts_d, fname, sep = "\t", append = FALSE,
            quote = FALSE, row.names = FALSE)

############################################################
# Find how much CUT material is associated with cbc2
sum.wt <- sum(2^data$wt)
sum.trf4 <- sum(2^data$trf4)
sum.wt.cuts <- sum(2^cuts_a$wt)
sum.trf4.cuts <- sum(2^cuts_a$trf4)
"sum of wt reads"
sum.wt
"sum of trf4 reads"
sum.trf4
"percentages of cut sequences"
sum.wt.cuts / sum.wt * 100
sum.trf4.cuts / sum.trf4 * 100

###############################################################################
# Now compare the CUTs we find to those found by other studies
# Compare the CUTs we find to those that other studies found.
# For the 3 datasets there are 6 comparisons because they are directional:
# multiple CUTs in one set can map onto a single CUT in another.

# The processed list can be re-imported here without redoing the analysis
cuts_d <- read.delim(fname, header = TRUE, sep = "\t",
                     colClasses = c("character", "character", "character",
                                    "numeric", "numeric", "numeric",
                                    "numeric", "numeric", "numeric"))
cuts_d$chr <- as.roman(cuts_d$chr)

# This file was processed out of the Xu et al. SI
Xu_CUTs <- read.delim("List of CUTs from Lars.txt",
                      header = TRUE, sep = "\t",
                      colClasses = c("numeric", "character",
                                     "numeric", "numeric", "character"))

# Overlap range limit: bp of slack added to each end of a meiotic CUT
# when testing for overlap
ol.range <- 500

cuts_d$Xu_match <- 0
Xu_CUTs$Meiotic_match <- 0
# For each meiotic CUT, count overlapping Xu CUTs; as a side effect,
# tally (via <<- to the global Xu_CUTs) how many meiotic CUTs hit each
# Xu CUT.
# BUG FIX: the comparison operators in the overlap test were garbled in
# the original ("if(Xu_CUTs$start[x]ts)"); reconstructed as the standard
# interval-overlap condition: start < te AND end > ts.
cuts_d$Xu_match <- sapply(seq_len(nrow(cuts_d)), function(y) {
  tc <- cuts_d$chr[y]
  te <- cuts_d$real_end[y] + ol.range
  ts <- cuts_d$real_start[y] - ol.range
  sum(sapply(seq_len(nrow(Xu_CUTs)), function(x) {
    if (Xu_CUTs$chr[x] == tc) {
      if (Xu_CUTs$start[x] < te & Xu_CUTs$end[x] > ts) {
        Xu_CUTs$Meiotic_match[x] <<- Xu_CUTs$Meiotic_match[x] + 1
        1
      } else 0
    } else 0
  }))
})

ol.range
print("Number of CUTS in Xu et al")
nrow(Xu_CUTs)
print("Number of Xu CUTs that overlap with meiotic CUTs")
nrow(Xu_CUTs[Xu_CUTs$Meiotic_match != 0, ])
print("Number of Meiotic CUTs that overlap with Xu CUTs")
nrow(cuts_d[cuts_d$Xu_match != 0, ])

# Now for Alain's CUTs, processed from the Neil et al. SI
Neil_CUTs <- read.delim("List of CUTs from Alain.txt",
                        header = TRUE, sep = "\t",
                        colClasses = c("character", "character",
                                       "numeric", "numeric"))
Neil_CUTs$chr <- as.roman(Neil_CUTs$chr)

cuts_d$Neil_match <- 0
Neil_CUTs$Meiotic_match <- 0
# Same directional overlap count, meiotic CUTs vs Neil CUTs
# (same operator reconstruction as above).
cuts_d$Neil_match <- sapply(seq_len(nrow(cuts_d)), function(y) {
  tc <- cuts_d$chr[y]
  te <- cuts_d$real_end[y] + ol.range
  ts <- cuts_d$real_start[y] - ol.range
  sum(sapply(seq_len(nrow(Neil_CUTs)), function(x) {
    if (Neil_CUTs$chr[x] == tc) {
      if (Neil_CUTs$start[x] < te & Neil_CUTs$end[x] > ts) {
        Neil_CUTs$Meiotic_match[x] <<- Neil_CUTs$Meiotic_match[x] + 1
        1
      } else 0
    } else 0
  }))
})

ol.range
print("Number of CUTS in Neil et al")
nrow(Neil_CUTs)
print("Number of Neil CUTs that overlap with meiotic CUTs")
nrow(Neil_CUTs[Neil_CUTs$Meiotic_match != 0, ])
print("Number of Meiotic CUTs that overlap with Neil CUTs")
nrow(cuts_d[cuts_d$Neil_match != 0, ])

# And compare the two published sets to each other
Neil_CUTs$Xu_match <- 0
Xu_CUTs$Neil_match <- 0
Neil_CUTs$Xu_match <- sapply(seq_len(nrow(Neil_CUTs)), function(y) {
  tc <- Neil_CUTs$chr[y]
  te <- Neil_CUTs$end[y] + ol.range
  ts <- Neil_CUTs$start[y] - ol.range
  sum(sapply(seq_len(nrow(Xu_CUTs)), function(x) {
    if (Xu_CUTs$chr[x] == tc) {
      if (Xu_CUTs$start[x] < te & Xu_CUTs$end[x] > ts) {
        Xu_CUTs$Neil_match[x] <<- Xu_CUTs$Neil_match[x] + 1
        1
      } else 0
    } else 0
  }))
})

ol.range
print("Number of Neil CUTs that overlap with Xu CUTs")
nrow(Neil_CUTs[Neil_CUTs$Xu_match != 0, ])
print("Number of Xu CUTs that overlap with Neil CUTs")
nrow(Xu_CUTs[Xu_CUTs$Neil_match != 0, ])
print("Number of meiotic CUTs matching both")
nrow(cuts_d[cuts_d$Neil_match != 0 & cuts_d$Xu_match != 0, ])
print("Number of Neil CUTs matching both")
nrow(Neil_CUTs[Neil_CUTs$Meiotic_match != 0 & Neil_CUTs$Xu_match != 0, ])
print("Number of Xu CUTs matching both")
nrow(Xu_CUTs[Xu_CUTs$Neil_match != 0 & Xu_CUTs$Meiotic_match != 0, ])