# Written by Erin Becker on July 6th, 2013. 
# This script looks for enriched gene clusters in odd GC regions extracted from GFF files using 
# the Supplementary_Code_4_GC_analysis.ipynb ipython notebook.

# First we calculate the frequency of each cluster represented in the odd GC regions.
# Input path to text file containing cluster number assignments of features in abnormal G+C regions.
# Every line of the file is converted to a number, i.e. one cluster number per line.
odd_GC_clusters = 
  readLines("")
odd_GC_clusters = as.numeric(odd_GC_clusters)
# Distinct cluster numbers in ascending order; sort() drops any NA that
# as.numeric() produced for a non-numeric line.
unique_odd_GC_clusters = sort(unique(odd_GC_clusters))
# Count the occurrences of every distinct cluster in one pass: match() maps each
# entry to the position of its cluster number and tabulate() counts the positions
# (NA positions are ignored). Unlike sapply(), the result is an integer vector
# even when the input is empty.
odd_GC_clusters_freq = 
  tabulate(match(odd_GC_clusters, unique_odd_GC_clusters),
           nbins = length(unique_odd_GC_clusters))

# Next we calculate the frequency of each cluster in the genomes as a whole.
# Input path to text file containing all possible cluster numbers for protein set. 
# Every line of the file is converted to a number, i.e. one cluster number per line.
all_clusters = 
  readLines("")
all_clusters = as.numeric(all_clusters)
# Distinct cluster numbers in ascending order; sort() drops any NA that
# as.numeric() produced for a non-numeric line.
unique_all_clusters = sort(unique(all_clusters))
# Single-pass count per distinct cluster: match() gives the position of each
# entry's cluster number and tabulate() counts those positions (NA is ignored).
# The result is always an integer vector, including for empty input, where
# sapply() would have returned list().
all_clusters_freq = 
  tabulate(match(all_clusters, unique_all_clusters),
           nbins = length(unique_all_clusters))

# Then we build a dataframe containing this data: one row per cluster with its
# number, its count over all genomes and its count in the odd GC set.
# odd_GC_freq starts as a zero vector of the right length, so that building the
# frame also works when there are no clusters at all.
df = data.frame(cluster_num = unique_all_clusters, overall_freq = all_clusters_freq, 
                odd_GC_freq = numeric(length(unique_all_clusters)))

# Match up the overall cluster frequency with frequency in the odd GC set and add to dataframe. 
# One match() lookup finds the row of every odd-GC cluster. Clusters missing from
# the odd GC set keep a count of 0, and odd-GC clusters that have no row in df
# are skipped.
row_of_cluster = match(unique_odd_GC_clusters, df$cluster_num)
present = which(!is.na(row_of_cluster))
if (length(present) > 0) {
  df$odd_GC_freq[row_of_cluster[present]] = odd_GC_clusters_freq[present]
}

# Express each count as a percentage of its own set, then take the ratio.
# NOTE(review): if the odd GC set is empty, odd_percent is 0/0 and
# fold_enrichment is NaN for every row.
df$overall_percent = df$overall_freq / length(all_clusters) * 100
df$odd_percent = df$odd_GC_freq / length(odd_GC_clusters) * 100
df$fold_enrichment = df$odd_percent / df$overall_percent

# Next step, add "consensus annotation" to df for each cluster. 
# Cluster numbers above 17591 are singletons that were not included in any cluster by Tribe-MCL. 
# Some singletons do have cluster numbers (17591 or below); these are excluded from the
# enrichment analysis.

# First, subset to have only rows of df where overall_freq is > 1.
non_singletons = df[df$overall_freq > 1, ]

# Cluster numbers of the singletons that were nevertheless given a cluster number.
singleton_cluster_nums = df$cluster_num[df$overall_freq == 1 & df$cluster_num <= 17591]

# Input path to text file containing annotations in ascending order of cluster number. 
# Line i of the file is treated as the annotation of cluster number i.
annotations = 
  readLines("")

# Remove singleton lines from annotations.
# A logical mask is used instead of annotations[-singleton_cluster_nums]: with no
# singletons the negative index is zero-length and would drop EVERY annotation.
is_singleton_line = seq_along(annotations) %in% singleton_cluster_nums
annotations2 = annotations[!is_singleton_line]

# Fail loudly on misaligned input; a shorter annotation vector whose length
# divides the row count would otherwise be recycled silently.
if (length(annotations2) != nrow(non_singletons)) {
  stop("Number of annotations (", length(annotations2),
       ") does not match the number of non-singleton clusters (",
       nrow(non_singletons), ")", call. = FALSE)
}
non_singletons$annotation = annotations2

# Write dataframe to file
# Fields are separated by tabs (sep = "\t"), so the output is tab-delimited text
# whatever extension the output path carries. An empty path makes write.table()
# print to the console.
write.table(non_singletons, 
            file = "", 
            sep = "\t", row.names = TRUE)

# Some possible enrichment cutoffs: keep the clusters whose fold enrichment is at
# least the cutoff and write each subset to its own file.
# which() skips rows whose fold_enrichment is NA/NaN; a bare logical index would
# turn each such row into an all-NA row in the output.
over_four = non_singletons[which(non_singletons$fold_enrichment >= 4), ]
write.table(over_four, 
            file = "",
            sep = "\t", row.names = TRUE)

over_six = non_singletons[which(non_singletons$fold_enrichment >= 6), ]
write.table(over_six, 
            file = "",
            sep = "\t", row.names = TRUE)

over_eight = non_singletons[which(non_singletons$fold_enrichment >= 8), ]
write.table(over_eight, 
            file = "",
            sep = "\t", row.names = TRUE)

over_twelve = non_singletons[which(non_singletons$fold_enrichment >= 12), ]
write.table(over_twelve, 
            file = "",
            sep = "\t", row.names = TRUE)