# Written by Erin Becker on July 6th, 2013.
# This script looks for enriched gene clusters in odd GC regions extracted
# from GFF files using the Supplementary_Code_4_GC_analysis.ipynb ipython
# notebook.

# Count how many entries of `assignments` belong to each cluster listed in
# `cluster_ids`.
#   assignments: numeric vector of cluster numbers, one per feature.
#   cluster_ids: vector of distinct cluster numbers to count.
# Returns an integer vector aligned with `cluster_ids`. NA entries and
# values absent from `cluster_ids` are not counted. Runs in a single pass
# and returns integer(0) (not list()) for empty input.
count_cluster_members <- function(assignments, cluster_ids) {
  tabulate(match(assignments, cluster_ids), nbins = length(cluster_ids))
}

# First we calculate the frequency of each cluster represented in the odd GC
# regions.
# Input path to text file containing cluster number assignments of features
# in abnormal G+C regions.
odd_GC_clusters <- readLines("")
odd_GC_clusters <- as.numeric(odd_GC_clusters)
unique_odd_GC_clusters <- sort(unique(odd_GC_clusters))
odd_GC_clusters_freq <- count_cluster_members(
  odd_GC_clusters,
  unique_odd_GC_clusters
)

# Next we calculate the frequency of each cluster in the genomes as a whole.
# Input path to text file containing all possible cluster numbers for
# protein set.
all_clusters <- readLines("")
all_clusters <- as.numeric(all_clusters)
unique_all_clusters <- sort(unique(all_clusters))
all_clusters_freq <- count_cluster_members(all_clusters, unique_all_clusters)

# Then we build a dataframe containing this data.
# The odd GC frequency is counted directly against the overall cluster list,
# so clusters that never occur in the odd GC set get 0 and odd GC clusters
# missing from the overall list are ignored. It is stored as double, as
# before.
df <- data.frame(
  cluster_num = unique_all_clusters,
  overall_freq = all_clusters_freq,
  odd_GC_freq = as.numeric(
    count_cluster_members(odd_GC_clusters, unique_all_clusters)
  )
)

# Percentages are relative to the total number of lines read from each input
# file; fold enrichment is the ratio of the two percentages.
# NOTE(review): an empty odd GC input makes odd_percent (and therefore
# fold_enrichment) NaN.
df$overall_percent <- df$overall_freq / length(all_clusters) * 100
df$odd_percent <- df$odd_GC_freq / length(odd_GC_clusters) * 100
df$fold_enrichment <- df$odd_percent / df$overall_percent

# Next step, add "consensus annotation" to df for each cluster.
# Cluster numbers above 17591 are singletons that were not included in any
# cluster by Tribe-MCL.
# Some singletons do have cluster numbers (under 17591), these should be
# excluded from enrichment analysis.
# First, subset to have only rows of df where overall_freq is > 1.
# Keep only clusters that have more than one member overall.
non_singletons <- df[df$overall_freq > 1, ]

# Cluster numbers of one-member clusters that still carry a cluster number
# at or below 17591 (the stated upper bound of real Tribe-MCL clusters).
singleton_cluster_nums <- df[df$overall_freq == 1, ]
singleton_cluster_nums <-
  singleton_cluster_nums[singleton_cluster_nums$cluster_num <= 17591, ]
singleton_cluster_nums <- singleton_cluster_nums$cluster_num

# Input path to text file containing annotations in ascending order of
# cluster number.
# NOTE(review): the code assumes line i of this file annotates cluster
# number i -- confirm against the annotation file.
annotations <- readLines("")

# Pick the annotation line of every retained cluster by its cluster number.
# The earlier form, annotations[-singleton_cluster_nums], returned an empty
# vector when there were no singletons (negative indexing with a zero-length
# vector selects nothing), which made the assignment below fail.
annotation_idx <- non_singletons$cluster_num
idx_invalid <- is.na(annotation_idx) |
  annotation_idx < 1 |
  annotation_idx > length(annotations) |
  annotation_idx != floor(annotation_idx)
if (any(idx_invalid)) {
  stop(
    "Cluster numbers of multi-member clusters must be whole numbers ",
    "between 1 and the number of annotation lines.",
    call. = FALSE
  )
}
annotations2 <- annotations[annotation_idx]
non_singletons$annotation <- annotations2

# Write dataframe to file.
# Fill in the output path. The table is written tab-delimited (sep = "\t")
# with row names, even though an earlier note asked for a .csv file.
write.table(non_singletons, file = "", sep = "\t", row.names = TRUE)

# Some possible enrichment cutoffs.
# which() drops rows whose fold enrichment is NA/NaN instead of emitting
# all-NA rows for them.
over_four <- non_singletons[which(non_singletons$fold_enrichment >= 4), ]
write.table(over_four, file = "", sep = "\t", row.names = TRUE)

over_six <- non_singletons[which(non_singletons$fold_enrichment >= 6), ]
write.table(over_six, file = "", sep = "\t", row.names = TRUE)

over_eight <- non_singletons[which(non_singletons$fold_enrichment >= 8), ]
write.table(over_eight, file = "", sep = "\t", row.names = TRUE)

over_twelve <- non_singletons[which(non_singletons$fold_enrichment >= 12), ]
write.table(over_twelve, file = "", sep = "\t", row.names = TRUE)