"""Count reference and alternate allele calls at known SNP positions.

For every mpileup file listed in ``infiles`` a companion ``*_snps.txt`` file
is written with one row per SNP from the positions file.
"""
import os
import re

# Suffix expected on input mpileup names and the suffix given to the output.
_MPILEUP_SUFFIX = '_mpileup.txt'
_SNPS_SUFFIX = '_snps.txt'


def main():
    """Tabulate ref/alt allele counts for each mpileup in ``infiles``.

    Reads the module-level settings ``pos_f`` (SNP positions file), ``inf_f``
    (directory prefix that is concatenated with each file name) and
    ``infiles`` (list of mpileup file names).

    Each non-empty line of the positions file must hold at least four
    tab-separated fields: the first three form the SNP key and the fourth is
    the alternate allele.  For every input file an output file is written
    next to it, with one row per SNP (in positions-file order) of the form
    ``key<TAB>ref_count<TAB>alt<TAB>alt_count``.  SNPs that never appear in
    the mpileup get counts of ``0``.
    """
    snp_keys = []   # keys in file order, used to order the output rows
    positions = {}  # key -> alternate allele
    with open(pos_f, 'r') as inf:
        for row in inf.read().split('\n'):
            if not row:
                continue
            fields = row.split('\t')
            key = '\t'.join(fields[0:3])
            snp_keys.append(key)
            positions[key] = fields[3]

    for infile in infiles:
        # Defaults are lists shaped like BaseCaller's return value so that
        # the tab-join below treats covered and uncovered SNPs the same way.
        # (A plain string here would be joined character by character.)
        calls_by_pos = {key: ['0', alt, '0'] for key, alt in positions.items()}

        with open(inf_f + infile, 'r') as inf:
            for line in inf:
                line = line.rstrip('\n').split('\t')
                if line == ['']:
                    continue  # blank line: nothing to parse
                if line[0] == 'mitochondria':
                    continue
                # int() normalises the first column (e.g. '01' -> '1') so it
                # matches the key format used in the positions file.
                pos = str(int(line[0])) + '\t' + line[1] + '\t' + line[2]
                if pos not in positions:
                    continue
                calls_by_pos[pos] = BaseCaller(line, positions[pos])

        # str.rstrip() strips a *set of characters*, not a suffix, and would
        # eat into the sample name (e.g. 'sample' -> 'sa'); slice instead.
        if infile.endswith(_MPILEUP_SUFFIX):
            stem = infile[:-len(_MPILEUP_SUFFIX)]
        else:
            stem = infile

        with open(inf_f + stem + _SNPS_SUFFIX, 'w') as outf:
            outf.write('\n'.join(
                [key + '\t' + '\t'.join(calls_by_pos[key]) for key in snp_keys]
            ))


def BaseCaller(line, alt):
    """Convert one mpileup line into reference/alternate allele counts.

    Args:
        line: an mpileup record, either as a tab-separated string or as the
            already-split list of fields.  Field 2 is the reference base and
            field 4 is the read-base string.
        alt: the alternate allele to count.

    Returns:
        ``[ref_count, alt, alt_count]`` with both counts as strings.
    """
    if isinstance(line, str):
        line = line.split('\t')
    ref, bases = line[2], line[4]

    # '^' marks the start of a read and is followed by one character that
    # encodes mapping quality; that character may look like a base, so drop
    # both.  Consuming them as a pair avoids the index wrap-around that a
    # "previous character" test has at position 0.
    bases = re.sub(r'\^.', '', bases)

    # '.' and ',' both mean "matches the reference"; rewrite them as the
    # reference base so that they can be counted directly.
    # NOTE(review): bases belonging to indel descriptions (e.g. '+2AG') are
    # still counted as calls, as before - confirm whether that is intended.
    calls = list(bases.upper().replace(',', '.').replace('.', ref))

    ref_c = calls.count(ref)
    alt_c = calls.count(alt)
    return [str(ref_c), alt, str(alt_c)]


if __name__ == '__main__':
    # Working directory; must end with a path separator because file names
    # are appended to it by plain string concatenation.
    inf_f = '/Users/lab/Documents/sequencingbulkdata/test/'
    # File containing SNP coordinates.
    pos_f = '/Users/lab/Documents/sequencingbulkdata/050815yeast/all_mpileups/3SxBYsnps.txt'
    # List of mpileup file names found under inf_f.
    infiles = ["R21x69con_mpileup.txt"]
    main()