# Import SeqMonk data.
# Reads were mapped to the SK1 genome with non-unique reads allowed (first file)
# and to SGD1.01 with unique reads only (second file).
# Note that read counts are provided as log2 values.
data.full <- read.delim("100bp trf4 vs wt log quant 12-6-14.txt",
                        header = TRUE, sep = "\t",
                        colClasses = c("character", "numeric", "numeric",
                                       "character", "character",
                                       "numeric", "numeric"))
data.unique <- read.delim("100bp trf4 vs wt log quant 12-6-14 unique.txt",
                          header = TRUE, sep = "\t",
                          colClasses = c("character", "numeric", "numeric",
                                         "character", "character",
                                         "numeric", "numeric"))

# Sort out the chromosome definitions: roman-numeral names become class "roman",
# which compares transparently against integers (e.g. chr != 12 below).
data.full$chr <- as.roman(data.full$Chromosome)

# Combine the two datasets into a single usable data frame
data.full$wt.unique <- data.unique$wt
data.full$trf4.unique <- data.unique$trf4

# Filter out the rDNA on chromosome XII.
# rDNA 37-1 is 458432-451575, 37-2 is 467569-460712.
data <- data.full[data.full$chr != 12 |
                    (data.full$Start < 451500 |
                       (data.full$Start >= 458500 & data.full$Start < 460700) |
                       data.full$Start >= 467600), ]

# Normalise the wt signals to have the same total counts as trf4, keeping a
# copy of the originals. Counts are log2, so de-log, scale, and re-log.
mnorm <- sum(2^data$wt) / sum(2^data$trf4)
data$wt.original <- data$wt
data$wt <- log(2^data$wt.original / mnorm, base = 2)

unorm <- sum(2^data$wt.unique) / sum(2^data$trf4.unique)
data$wt.unique.original <- data$wt.unique
# BUG FIX: the original divided by mnorm here, silently reusing the
# non-unique normalisation factor; the unique-read counts must be scaled
# by their own factor, unorm.
data$wt.unique <- log(2^data$wt.unique.original / unorm, base = 2)

#############################################################################
# Include these lines to run the analysis on the unique mapping reads only
data$wt <- data$wt.unique
data$trf4 <- data$trf4.unique
# Tag appended to the output filename (renamed from `unique`, which
# shadowed base::unique).
unique.tag <- " unique"
#############################################################################

# Calculate the trf4-to-wt difference.
# Subtract rather than divide because the values are log2.
data$dif <- data$trf4 - data$wt

# This cut-off is applied to the log2 difference
CUT.cutoff <- 2

# Define a set of regions (cuts_a) containing only 100bp segments in which
# the trf4 read count exceeds the wt read count by the cut-off.
cuts_a <- data[data$dif > CUT.cutoff & !is.na(data$chr), ]

# Print out the number of segments
nrow(data)
nrow(cuts_a)

####################################################
# Find the real start and end of each CUT by merging adjacent segments.
# Segments whose starts lie within test.range bp of each other are treated
# as parts of a single feature (the comment in the original claimed 200bp /
# 2 segments, but the value actually used was 300bp — kept as 300 here).
test.range <- 300

# BEWARE: scope-breaking assignment (<<-) carries running state through
# sapply; `a` holds the real start of the CUT currently being extended.
# NOTE(review): the merge tests coordinates only and does not check that
# consecutive segments share a chromosome — confirm chromosome boundaries
# can never produce coordinates within test.range of each other.
a <- cuts_a$Start[1]
cuts_a$real_start[1] <- a
cuts_a$real_start[2:nrow(cuts_a)] <- sapply(2:nrow(cuts_a), function(x) {
  if (cuts_a$Start[x] <= cuts_a$Start[x - 1] + test.range) {
    a                                 # adjacent: inherit the running start
  } else {
    a <<- cuts_a$Start[x]             # gap: this segment opens a new CUT
    cuts_a$Start[x]
  }
})

# Find the real ends the same way, scanning backwards from the last segment
a <- cuts_a$End[nrow(cuts_a)]
cuts_a$real_end[nrow(cuts_a)] <- a
cuts_a$real_end[seq(nrow(cuts_a) - 1, 1, by = -1)] <-
  sapply(seq(nrow(cuts_a) - 1, 1, by = -1), function(x) {
    if (cuts_a$End[x] >= cuts_a$End[x + 1] - test.range) {
      a
    } else {
      a <<- cuts_a$End[x]
      cuts_a$End[x]
    }
  })

# Average the read counts across each CUT.
# `a` counts segments, `b` accumulates the sum (again via <<-).
# The mean is returned on the final segment of each CUT; all other rows
# get -1 as a sentinel.
cuts_a$wt_mean <- sapply(seq_len(nrow(cuts_a)), function(x) {
  if (cuts_a$Start[x] == cuts_a$real_start[x]) {
    a <<- 1
    b <<- cuts_a$wt[x]
  } else {
    a <<- a + 1
    b <<- b + cuts_a$wt[x]
  }
  if (cuts_a$End[x] == cuts_a$real_end[x]) b / a else -1
})
cuts_a$trf4_mean <- sapply(seq_len(nrow(cuts_a)), function(x) {
  if (cuts_a$Start[x] == cuts_a$real_start[x]) {
    a <<- 1
    b <<- cuts_a$trf4[x]
  } else {
    a <<- a + 1
    b <<- b + cuts_a$trf4[x]
  }
  if (cuts_a$End[x] == cuts_a$real_end[x]) b / a else -1
})

# Filter down to cuts_b: one row per CUT (the row carrying the means)
cuts_b <- cuts_a[cuts_a$End == cuts_a$real_end, ]

# Chuck out columns that are no longer meaningful
# (the original also assigned cuts_b$dif and immediately NULLed it —
# that dead assignment is dropped here).
cuts_b$Probe <- NULL
cuts_b$Chromosome <- NULL
cuts_b$Start <- NULL
cuts_b$End <- NULL
cuts_b$wt <- NULL
cuts_b$trf4 <- NULL
cuts_b$dif <- NULL

# Add columns that are meaningful for whole CUTs
cuts_b$size <- cuts_b$real_end - cuts_b$real_start + 1
cuts_b$difference <- cuts_b$trf4_mean - cuts_b$wt_mean

############################################
# Each CUT now has a single row in cuts_b; print some summary data
"cut difference cut off"
2^CUT.cutoff
"cuts were merged over a range of"
test.range
"giving this number of cuts"
nrow(cuts_b)

# Keep only CUTs of 3 or more consecutive 100bp segments (the size cut-off)
min.size <- 300
cuts_c <- cuts_b[cuts_b$size >= min.size, ]
"apply size cut off"
min.size
"giving this number of cuts"
nrow(cuts_c)

# Discard CUTs whose trf4 signal is below the noise floor (log2 counts)
noise.cutoff <- 6
cuts_d <- cuts_c[cuts_c$trf4_mean > noise.cutoff, ]
"noise cut off of minimum counts in trf4"
noise.cutoff
"leaves"
nrow(cuts_d)

# Finally write out the list of CUTs
fname <- paste0("meiotic cuts mindiff=", 2^CUT.cutoff,
                " minsize=", min.size,
                " noise=", noise.cutoff,
                unique.tag, ".txt")
write.table(cuts_d, fname, sep = "\t", append = FALSE,
            quote = FALSE, row.names = FALSE)

############################################################
# Find how much CUT material is associated with cbc2
sum.wt <- sum(2^data$wt)
sum.trf4 <- sum(2^data$trf4)
sum.wt.cuts <- sum(2^cuts_a$wt)
sum.trf4.cuts <- sum(2^cuts_a$trf4)
"sum of wt reads"
sum.wt
"sum of trf4 reads"
sum.trf4
"percentages of cut sequences"
sum.wt.cuts / sum.wt * 100
sum.trf4.cuts / sum.trf4 * 100

###############################################################################
# Now compare the CUTs we find to those found by other studies
# Compare the CUTs we find to those that other studies found.
# For the 3 datasets there are 6 comparisons because they are directional:
# multiple CUTs in one set can map onto a single CUT in another.

# The processed list can be re-imported here without redoing the analysis
cuts_d <- read.delim(fname, header = TRUE, sep = "\t",
                     colClasses = c("character", "character", "character",
                                    "numeric", "numeric", "numeric",
                                    "numeric", "numeric", "numeric"))
cuts_d$chr <- as.roman(cuts_d$chr)

# This file was processed out of the Xu et al. SI
Xu_CUTs <- read.delim("List of CUTs from Lars.txt",
                      header = TRUE, sep = "\t",
                      colClasses = c("numeric", "character",
                                     "numeric", "numeric", "character"))

# Overlap range limit: bp of slack added to each end of a meiotic CUT
# when testing for overlap
ol.range <- 500

cuts_d$Xu_match <- 0
Xu_CUTs$Meiotic_match <- 0
# For each meiotic CUT, count overlapping Xu CUTs; as a side effect,
# tally (via <<- to the global Xu_CUTs) how many meiotic CUTs hit each
# Xu CUT.
# BUG FIX: the comparison operators in the overlap test were garbled in
# the original ("if(Xu_CUTs$start[x]ts)"); reconstructed as the standard
# interval-overlap condition: start < te AND end > ts.
cuts_d$Xu_match <- sapply(seq_len(nrow(cuts_d)), function(y) {
  tc <- cuts_d$chr[y]
  te <- cuts_d$real_end[y] + ol.range
  ts <- cuts_d$real_start[y] - ol.range
  sum(sapply(seq_len(nrow(Xu_CUTs)), function(x) {
    if (Xu_CUTs$chr[x] == tc) {
      if (Xu_CUTs$start[x] < te & Xu_CUTs$end[x] > ts) {
        Xu_CUTs$Meiotic_match[x] <<- Xu_CUTs$Meiotic_match[x] + 1
        1
      } else 0
    } else 0
  }))
})

ol.range
print("Number of CUTS in Xu et al")
nrow(Xu_CUTs)
print("Number of Xu CUTs that overlap with meiotic CUTs")
nrow(Xu_CUTs[Xu_CUTs$Meiotic_match != 0, ])
print("Number of Meiotic CUTs that overlap with Xu CUTs")
nrow(cuts_d[cuts_d$Xu_match != 0, ])

# Now for Alain's CUTs, processed from the Neil et al. SI
Neil_CUTs <- read.delim("List of CUTs from Alain.txt",
                        header = TRUE, sep = "\t",
                        colClasses = c("character", "character",
                                       "numeric", "numeric"))
Neil_CUTs$chr <- as.roman(Neil_CUTs$chr)

cuts_d$Neil_match <- 0
Neil_CUTs$Meiotic_match <- 0
# Same directional overlap count, meiotic CUTs vs Neil CUTs
# (same operator reconstruction as above).
cuts_d$Neil_match <- sapply(seq_len(nrow(cuts_d)), function(y) {
  tc <- cuts_d$chr[y]
  te <- cuts_d$real_end[y] + ol.range
  ts <- cuts_d$real_start[y] - ol.range
  sum(sapply(seq_len(nrow(Neil_CUTs)), function(x) {
    if (Neil_CUTs$chr[x] == tc) {
      if (Neil_CUTs$start[x] < te & Neil_CUTs$end[x] > ts) {
        Neil_CUTs$Meiotic_match[x] <<- Neil_CUTs$Meiotic_match[x] + 1
        1
      } else 0
    } else 0
  }))
})

ol.range
print("Number of CUTS in Neil et al")
nrow(Neil_CUTs)
print("Number of Neil CUTs that overlap with meiotic CUTs")
nrow(Neil_CUTs[Neil_CUTs$Meiotic_match != 0, ])
print("Number of Meiotic CUTs that overlap with Neil CUTs")
nrow(cuts_d[cuts_d$Neil_match != 0, ])

# And compare the two published sets to each other
Neil_CUTs$Xu_match <- 0
Xu_CUTs$Neil_match <- 0
Neil_CUTs$Xu_match <- sapply(seq_len(nrow(Neil_CUTs)), function(y) {
  tc <- Neil_CUTs$chr[y]
  te <- Neil_CUTs$end[y] + ol.range
  ts <- Neil_CUTs$start[y] - ol.range
  sum(sapply(seq_len(nrow(Xu_CUTs)), function(x) {
    if (Xu_CUTs$chr[x] == tc) {
      if (Xu_CUTs$start[x] < te & Xu_CUTs$end[x] > ts) {
        Xu_CUTs$Neil_match[x] <<- Xu_CUTs$Neil_match[x] + 1
        1
      } else 0
    } else 0
  }))
})

ol.range
print("Number of Neil CUTs that overlap with Xu CUTs")
nrow(Neil_CUTs[Neil_CUTs$Xu_match != 0, ])
print("Number of Xu CUTs that overlap with Neil CUTs")
nrow(Xu_CUTs[Xu_CUTs$Neil_match != 0, ])
print("Number of meiotic CUTs matching both")
nrow(cuts_d[cuts_d$Neil_match != 0 & cuts_d$Xu_match != 0, ])
print("Number of Neil CUTs matching both")
nrow(Neil_CUTs[Neil_CUTs$Meiotic_match != 0 & Neil_CUTs$Xu_match != 0, ])
print("Number of Xu CUTs matching both")
nrow(Xu_CUTs[Xu_CUTs$Neil_match != 0 & Xu_CUTs$Meiotic_match != 0, ])