/******************************************************************************/
/*                                                                            */
/* Author:  Kareem Carr                                                       */
/*                                                                            */
/* Purpose: To compute feature vectors of a protein                           */
/*                                                                            */
/* Usage:  The directory in which the executable is stored is assumed to con- */
/*         tain protein sequences stored in text files. The program computes  */
/*         feature vectors for these protein sequences and stores them in a   */
/*         csv file with the name of the text file adjacent to the 60 numbers */
/*         which comprise the feature vector.                                 */
/*                                                                            */
/* Input:  The program uses text files (*.txt) containing the fasta sequence. */
/*                                                                            */
/* Output: A list of feature vectors stored in the file 'parameteroutput.csv' */
/*                                                                            */
/******************************************************************************/

/* NOTE(review): the reviewed copy of this file listed nine #include lines
   whose header names were not legible. The headers below are the ones the
   visible code demonstrably needs (opendir/readdir, sqrt, ifstream/ofstream,
   cout, string, vector); restore any others from a pristine copy. */
#include <dirent.h>
#include <sys/types.h>

#include <cmath>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

/* Function: gettextfiles (string directory, vector<string> &filelist)

   Collects the names of all files with a *.txt extension.

   directory - path of the directory to scan.
   filelist  - output; the name (not the full path) of every matching entry
               is appended to it. Existing contents are left untouched.

   If the directory cannot be opened, a message is printed to standard
   output and filelist is returned unchanged. Entries are appended in the
   order readdir() reports them, which is not guaranteed to be sorted. */
void gettextfiles (string directory, vector<string> &filelist)
{
    const string extension = ".txt";

    DIR *directorypointer = opendir(directory.c_str());
    if (directorypointer == NULL)
    {
        std::cout << "There was a failure to open directory:" << directory << "." << std::endl;
        /* NOTE(review): returning here is an assumption; the original
           statements on this error path were not legible. Continuing
           would hand a NULL stream to readdir(). */
        return;
    }

    /* readdir() yields NULL once the directory is exhausted (or on error),
       so each entry is only dereferenced after that check. */
    struct dirent *information = readdir(directorypointer);
    while (information != NULL)
    {
        const string filename = string(information->d_name);

        /* Match the extension at the END of the name only, so that e.g.
           "a.txt.bak" is not mistaken for a text file. */
        if (filename.size() >= extension.size() &&
            filename.compare(filename.size() - extension.size(),
                             extension.size(), extension) == 0)
        {
            filelist.push_back(filename);
        }

        information = readdir(directorypointer);
    }

    closedir(directorypointer);
}

/* Function: getdata(string filename, string &data)
   Returns a string containing the contents of a specified file.
   It assumes the file is shorter than 20,000 cha-
   racters. For longer files, increase the number.
*/ void getdata(string filename, string &data) { data.clear(); char buffer[20000]; std::ifstream infile (filename.c_str()); if (infile.bad()) { std::cout<<"Unable to open file:"< filelist = vector(); gettextfiles(directory,filelist); long double n[20]; long double T[20]; long double D[20]; /* This is a basic a priori assumption about the expected frequency of amino acid. It is possible to compute the probabilities for the sample and use that instead.*/ long double pr[20] = {4,6,2,2,2, 2,2,4,2,3, 6,2,1,2,4, 6,4,1,2,4}; for(long int i=0;i<20;i++) pr[i] = pr[i]/61; long double nm,nv,Tm,Tv,Dm,Dv; int p; string data; for (int i = 0;i < filelist.size();i++) { for (p = 0;p < 20;p++) { n[p] = 0; T[p] = 0; D[p] = 0; } getdata(filelist[i],data); std::cout< 1) { D[p] = D[p]/(n[p]-1); D[p] = (D[p] * (N-1) )/N; } /* normalization of the parameters for the theoretical population. Most of the formulas are introductory statistics.*/ nm = pr[p]; nv = (pr[p]*(1-pr[p]))/N; Tm = (N+1)/2; Tv = ((N+1)*(N-n[p]))/(12*n[p]); Dm = ((N*N)-1)/12; /* based on the "The variance of the variance of samples from a finite population." Cho, Cho and Eltinge */ Dv = ((N-n[p])*(N-1)*(N-1)*(N+1)*( (2*n[p]*N) + (3*N) + (3*n[p]) + 3))/(360*n[p]*(n[p]-1)*N); nv = sqrt(nv); Tv = sqrt(Tv); Dv = sqrt(Dv); T[p] = (T[p] - Tm)/Tv; D[p] = (D[p] - Dm)/Dv; /* The edge cases for small numbers of amino acids when some of the defined values fail to exist and when the whole string corresponding to a particular type of amino acid is a single amino acid. */ if (n[p] <= 1) { D[p] = 0; } if (n[p] == 0) T[p] = 0; if (n[p] == N) D[p] = 0; if (n[p] == N) T[p] = 0; /* n[p] is needed above in its unchanged form so converting it to a probability is done here */ n[p] = n[p]/N; n[p] = (n[p] - nm)/nv; /* Output feature vectors information to a file */ outfile<<","<