# Supplemental material - script
# (a) Script to generate mutation context spectrum data
'''
Version 1.22
by Brendan Kohrn
Last updated on July 29, 2014

Find the mutation context for a list of mutants in mutpos format, as follows:
Chromosome ReferenceNT Position Depth number_of_mutants mutants_to_T mutants_to_C mutants_to_G mutants_to_A insertions deletions
'''

import sys
import re
import collections
from collections import defaultdict
import argparse
from argparse import ArgumentParser

# numpy and matplotlib.pyplot are imported at the top of main(); they are only
# needed for plotting, so the parsing classes below stay importable without
# them.


class Files:
    """Iterate over a mutpos file while keeping the FASTA window in step.

    Each iteration yields the next mutPosLine that reports at least one
    substitution, after the reference window has been advanced to that
    line's chromosome and position.
    """

    def __init__(self, inFasta, inMutPos, minC, maxC, bufferSize = 3):
        """Open both input files.

        inFasta    -- path of the reference genome (FASTA)
        inMutPos   -- path of the mutpos file
        minC, maxC -- clonality bounds; stored but not applied here
        bufferSize -- bases kept between the current position and the right
                      edge of the FASTA window
        """
        self.lineNum = 0
        tmpMutPos = open(inMutPos, 'r')
        self.MutPos = mutPosFile(tmpMutPos)
        tmpFasta = open(inFasta, 'r')
        self.Fasta = fastaWin(tmpFasta, bufferSize)
        self.minClonality = minC
        self.maxClonality = maxC

    def __iter__(self):
        return(self)

    def next(self):
        """Return the next mutated mutPosLine; raise StopIteration when done."""
        endTest = False
        while endTest == False:
            if self.MutPos.line.fileEnd == False:
                endTest = self.MutPos.next()
            else:
                break
        if self.MutPos.line.fileEnd:
            raise StopIteration
        fastaTest = self.Fasta.advance(endChr = self.MutPos.chrom, endPos = self.MutPos.pos)
        if fastaTest == False:
            # The chromosome or position does not exist in the reference.
            sys.stderr.write("End of fasta file!\n")
            raise StopIteration
        self.lineNum = self.MutPos.lineNum
        return(self.MutPos.line)

    # Python 3 spells the iterator method __next__; keep both names.
    __next__ = next

    def close(self):
        """Close both input files."""
        self.MutPos.close()
        self.Fasta.close()


class mutPosFile:
    """Sequential reader that skips mutpos lines without substitutions."""

    def __init__(self, inFile):
        self.file = inFile
        self.chrom = ""
        self.pos = 1
        # "-" builds a placeholder record so fileEnd can be tested before
        # the first real read.
        self.line = mutPosLine("-",0)
        self.lineNum = 0

    def next(self):
        """Advance to the next line with a T, C, G or A mutant count.

        Returns True when such a line was loaded into self.line and False
        at end of file.
        """
        self.lineNum += 1
        self.line = mutPosLine(self.file.readline(), self.lineNum)
        if self.line.fileEnd:
            sys.stderr.write('>>MutPos EOF reached\n')
            return(False)
        while self.line.fileEnd == False and self.line.Ts == 0 and self.line.Cs == 0 and self.line.Gs == 0 and self.line.As == 0:
            self.lineNum += 1
            self.line = mutPosLine(self.file.readline(), self.lineNum)
        if self.line.fileEnd:
            return(False)
        self.pos = self.line.pos
        self.chrom = self.line.chrom
        return(True)

    def __str__(self):
        return(str(self.line))

    def close(self):
        self.file.close()
        return(True)


class mutPosLine:
    """One parsed mutpos record.

    inLine == "-" builds a placeholder (fileEnd False), inLine == "" builds
    the end-of-file marker (fileEnd True); anything else is split on
    whitespace and parsed as a mutpos line.
    """

    def __init__(self, inLine, inLineNum):
        # Defaults shared by the placeholder and the end-of-file marker.
        self.fileEnd = (inLine == "")
        self.lineNum = -1
        self.chrom = ''
        self.refBase = ''
        self.pos = -1
        self.depth = -1
        self.muts = -1
        self.Ts = -1
        self.Cs = -1
        self.Gs = -1
        self.As = -1
        self.ins = -1
        self.dels = -1
        self.clonality = 0
        if inLine == "-" or inLine == "":
            return

        linebins = inLine.split()
        self.lineNum = inLineNum
        self.chrom = linebins[0]
        self.refBase = linebins[1].upper()
        self.pos = int(linebins[2])
        self.depth = int(linebins[3])
        self.muts = int(linebins[4])
        self.Ts = int(linebins[5])
        self.Cs = int(linebins[6])
        self.Gs = int(linebins[7])
        self.As = int(linebins[8])
        self.ins = int(linebins[9])
        self.dels = int(linebins[10])
        # Clonality is the largest single-base mutant fraction. A depth of
        # zero would divide by zero, so such a line gets clonality 0.
        if self.depth:
            self.clonality = max(float(count) / self.depth
                                 for count in (self.Ts, self.Cs, self.Gs, self.As))

    def makeMuts(self):
        """Return one Mutation per base (T, C, G, A) with a non-zero count."""
        outMuts = []
        for count, base in ((self.Ts, "T"), (self.Cs, "C"), (self.Gs, "G"), (self.As, "A")):
            if count:
                outMuts.append(Mutation(self.chrom, self.pos, self.refBase, base))
        return(outMuts)

    def __str__(self):
        return('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (self.lineNum, self.chrom, self.refBase, self.pos, self.depth, self.Ts, self.Cs, self.Gs, self.As, self.ins, self.dels))


class fastaWin:
    """
    This class enables most of the operations carried out by the program;
    it handles the rolling window and sequence retrieval sections.

    The window holds two FASTA lines' worth of bases in self.data and is
    slid forward one line at a time; minPos/maxPos are the 1-indexed
    reference positions of its left and right edges.
    """

    def __init__(self, dataSource, bufferSize = 3):
        self.sourceFile = dataSource
        # initialize the window in the first chromosome
        line = self.sourceFile.readline().strip().split(">")[1].split(" ")[0]
        self.chrom = line
        self.usedChrs = []  # chromosomes already passed by advance()
        line = self.sourceFile.readline().strip().upper()
        self.sizeMax=2*len(line)  # size of the window
        self.bsize=int(bufferSize)  # size of the buffer desired
        self.eof=False  # has the end of the file been reached?
        self.really=False  # has the trailing line of N's been added?
        # how long is one line in this reference genome anyway?
        self.lLength=len(line)
        # do one line of N's to make sure that any mutations near
        # the beginning can be processed, then load in the first line
        self.data = list('N'*self.lLength) + list(line)
        # the leftmost position on the reference genome. 1-indexed
        self.minPos=1-self.lLength
        # the rightmost position on the reference genome. 1-indexed
        self.maxPos=self.lLength
        self.pos = 1

    def advance(self, endChr = None, endPos = None):
        """Move the window to a chromosome and/or position.

        With no arguments the position advances by one base. Returns False
        when endChr is not found before the end of the file or when endPos
        lies beyond the end of the reference, True otherwise.
        """
        reachable = True
        if endChr is None and endPos is None:
            self.pos += 1
            if self.pos > self.maxPos - self.bsize:
                self.moveWin()
        else:
            if endChr is not None:
                while self.chrom != endChr and self.chrom != '':
                    self.usedChrs.append(self.chrom)
                    self.chrom = self.NewChrom()
            if endPos is not None:
                self.pos = endPos
                if self.chrom != '':
                    reachable = self._slideToPos()
        return(self.chrom != '' and reachable)

    def _slideToPos(self):
        """Slide until self.pos is bsize bases left of the window edge.

        Returns False once the reference is exhausted; without that check a
        position past the end of the sequence would loop forever.
        """
        while self.pos > self.maxPos - self.bsize:
            if self.moveWin() == False:
                return(False)
        return(True)

    def _shiftIn(self, inSeq):
        """Overwrite the older half of the window with one line of bases."""
        self.maxPos+=self.lLength
        self.minPos+=self.lLength
        wPos=self.minPos%self.sizeMax-1
        for offset, base in enumerate(inSeq):
            self.data[wPos + offset]=base

    def moveWin(self):
        """Slide the window forward by one FASTA line.

        Returns True after loading a line, after first noticing the end of
        the file, and after appending the single trailing line of N's;
        returns False once nothing more can be added.

        Raises ValueError for a line longer than the first sequence line.
        """
        # NOTE(review): a ">" header of a following record is not recognised
        # here and would be loaded as sequence; only NewChrom() interprets
        # headers. Confirm multi-record references are handled upstream.
        dataSource = self.sourceFile.readline().strip().upper()
        if self.eof == False:
            if dataSource=="":
                self.eof=True
            else:
                if len(dataSource) > self.lLength:
                    raise ValueError('FASTA line of %s characters exceeds the line length of %s' % (len(dataSource), self.lLength))
                # short (final) lines are padded with N
                self._shiftIn(dataSource.ljust(self.lLength, "N"))
            return(True)
        if self.really == False:
            # one extra line of N's lets positions at the very end of the
            # reference still get a full context
            self._shiftIn("N"*self.lLength)
            self.really=True
            return(True)
        return(False)

    def getSeq(self, pos):
        """Return the three bases centred on 1-indexed position pos.

        Returns "" when the window cannot be moved far enough to cover
        the requested position.
        """
        pos = int(pos)
        while pos - 1 + 3 >= self.maxPos:
            if self.moveWin() == False:
                return("")
        seq = []
        for i in range(pos - 1, pos - 1 + 3):
            b=(i+self.lLength)%self.sizeMax-1
            if b == -1:
                b = self.sizeMax - 1
            seq.append(str(self.data[b]))
        return("".join(seq))

    def NewChrom(self):
        """Skip to the next FASTA header and reset the window on it.

        Returns the new chromosome name, or '' at end of file.
        """
        while True:
            rawLine = self.sourceFile.readline()
            if rawLine == "":
                # true end of file; a blank line does not stop the search
                return('')
            if ">" in rawLine:
                break
        self.chrom=rawLine.strip().split(">")[1].split(" ")[0]
        newData = self.sourceFile.readline().strip().upper()
        self.lLength = len(newData)
        self.data = list('N'*self.lLength) + list(newData)
        self.minPos=1-self.lLength
        self.maxPos=self.lLength
        self.sizeMax = 2 * self.lLength
        return(self.chrom)

    def close(self):
        self.sourceFile.close()
        return(True)


class Mutation:
    """A single substitution (Type is "ref>mut") and its sequence context."""

    def __init__(self, inChrom, inPos, inRef, inMut):
        self.chrom = inChrom
        self.pos = inPos
        self.Type = "%s>%s" % (inRef, inMut)
        self.context = ''

    def setContext(self, fasta):
        """Look up the trinucleotide context from a fastaWin."""
        self.context = fasta.getSeq(self.pos)

    def __str__(self):
        # four fields, four placeholders
        return('%s\t%s\t%s\t%s' % (self.chrom, self.pos, self.Type, self.context))


def revComp(inSeq):
    """Return the reverse complement of inSeq (A, C, G, T, N; any case)."""
    revCompTable = {"A": "T", "G": "C", "C": "G", "T": "A", "N": "N"}
    return("".join(revCompTable[base] for base in inSeq.upper()[::-1]))


def main():
    # Plotting dependencies, imported before any work is done so a missing
    # package is reported immediately.
    import numpy as np
    import matplotlib.pyplot as plt

    parser=ArgumentParser()
    parser.add_argument("--ref", action="store", dest="ref", help="The reference genome in FASTA format", required = True)
    parser.add_argument("--mutpos", action="store", dest="mutpos", help="The mutpos file", required = True)
    parser.add_argument("--label", action="store", dest="label", required = True)
    parser.add_argument("--debug", action="store_true", dest="debug", help="Print out all mutations processed with +strand sequence context.")
    parser.add_argument('-c', '--minClonality', action = 'store', type = float, dest = 'minClonal', default = 0, help = 'The minimum clonality at which to examine a mutation. [0]' )
    parser.add_argument('-C', '--maxClonality', action = 'store', type = float, dest = 'maxClonal', default = 1, help = 'The maximum clonality at which to examine a mutation. [1]' )
    o=parser.parse_args()

    allFiles = Files(o.ref, o.mutpos, o.minClonal, o.maxClonal, 1)

    # Contexts are keyed by the pyrimidine (C or T) of the mutated pair.
    bases = "ACGT"
    mutSpectKeys = {"all":["C>A","C>G", "C>T", "T>A", "T>C", "T>G"]}
    for centre in "CT":
        mutSpectKeys[centre] = [five + centre + three for five in bases for three in bases]
    mutSpect = {}
    for mutType in mutSpectKeys["all"]:
        mutSpect[mutType] = dict.fromkeys(mutSpectKeys[mutType.split(">")[0]], 0.)
    # Purine-reference mutations are counted as their reverse complement.
    typeTrans = {"A>C":"T>G","A>G":"T>C","A>T":"T>A","G>A":"C>T","G>C":"C>G","G>T":"C>A"}

    lineNum=0
    print("Processing mutations...")
    try:
        for line in allFiles:
            lineNum += 1
            if lineNum % 1000 == 0:
                print('%s mutation sites processed' % lineNum)
            if not (o.minClonal <= line.clonality <= o.maxClonal):
                continue
            for mut in line.makeMuts():
                mut.setContext(allFiles.Fasta)
                if o.debug:
                    print(mut.chrom, mut.pos, mut.Type, mut.context)
                try:
                    if mut.Type in typeTrans:
                        mutSpect[typeTrans[mut.Type]][revComp(mut.context)] += 1
                    else:
                        mutSpect[mut.Type][mut.context] += 1
                except KeyError:
                    # Unknown type or context: dump the window state, then fail.
                    print(mut.Type, mut.context)
                    print(mut.chrom, mut.pos)
                    print("fasta dump")
                    print("Position: ", allFiles.Fasta.chrom, allFiles.Fasta.pos)
                    print(allFiles.Fasta.minPos, allFiles.Fasta.maxPos)
                    print("Line Data: ", allFiles.Fasta.lLength)
                    print(allFiles.Fasta.data)
                    raise
    finally:
        allFiles.close()

    # Empty entries act as blank spacer bars between mutation types.
    allCounts = [0]
    allLabels = ['']
    metaLabels = ['']
    print("Preparing data table...")
    for mutType in mutSpectKeys["all"]:
        for mutKey in mutSpectKeys[mutType.split(">")[0]]:
            allCounts.append(mutSpect[mutType][mutKey])
            allLabels.append(mutKey)
            metaLabels.append(mutType)
        allCounts.append(0)
        allLabels.append('')
        metaLabels.append('')

    totalCounts = float(sum(allCounts))
    with open("%s.mcs.dat.txt" % o.label, "w") as dataFile:
        dataFile.write("Type\tContext\tCount\tProportion")
        for ind, count in enumerate(allCounts):
            dataFile.write("\n%s\t%s\t%s\t" % (metaLabels[ind],allLabels[ind],count))
            # With no mutations at all every proportion is reported as 0.
            allCounts[ind] = count / totalCounts if totalCounts else 0.
            dataFile.write("%s" % allCounts[ind])

    colTrans = {"C>A":'c', "C>G":'0.2', "C>T":'r', "T>A":'0.75', "T>C":'g', "T>G":'m', "":'w'}
    print("Building Figure...")
    plt.figure(figsize=(11,3))
    ind=np.arange(len(allCounts))
    width = 1
    # The ticks (ind + width/2) and x-limits assume bars that start at ind,
    # so the alignment is requested explicitly.
    rects = plt.bar(ind, allCounts, width, color=[colTrans[x] for x in metaLabels], align='edge')
    plt.ylabel("Proportion of Mutations")
    plt.title("Mutation Spectrum: %s" % o.label)
    plt.xticks(ind+width/2., allLabels, rotation='vertical', fontsize=7)
    plt.xlim([0, ind.size])
    plt.yticks(fontsize=7)
    legendCreator = [plt.Rectangle((0,0), 1, 1, fc=colTrans[mutType]) for mutType in mutSpectKeys["all"]]
    plt.figlegend(legendCreator, mutSpectKeys["all"], loc=5, fontsize=7)
    print("Saving...")
    plt.savefig("%s.mcs.png" % o.label)


if __name__ == "__main__":
    main()

# (b) Script to generate amino acid change data
#
# Note:
# 1. A BED file of the human mitochondrial protein-coding genes is required as
#    an input for this script. The file should be in standard BED format with
#    the following fields: chrom, chromStart, chromEnd, name, score, and
#    strand. More detailed instructions can be found at
#    https://genome.ucsc.edu/FAQ/FAQformat.html#format1
# 2. The sequence of each human mitochondrial protein-coding gene can be found
#    at http://www.mitomap.org/bin/view.pl/MITOMAP/HumanMitoCode (the same
#    information can be found at
#    http://www.mitomap.org/bin/view.pl/MITOMAP/HumanMitoCode).
# 3. A table of amino acids and their corresponding DNA codons is required as
#    an input, in transtable format. The table for human mitochondrial DNA is
#    provided at the end of this document (the file name of the example table
#    is VertebrateMito.transtable).
# 4. A mutpos file is required as an input for this script. More information
#    about the mutpos format and the script that generates it
#    (mut-position.py) can be found at
#    https://github.com/loeblab/Duplex-Sequencing/blob/master/mut-position.py
#
# MutationConsequences.py
# Version 1.12
# Brendan Kohrn
# Last updated on 07/21/2014
#
# Output line:
# chrom, pos, direction, refNt, Compact NT, gene name, AA#, refAA, TmutAA, CmutAA, GmutAA, AmutAA, aaCompact
#
# An AA of Ref means that this is the reference AA, an AA of NP means there is
# no mutation of that type, and an AA of NC means the region is non-coding (as
# determined from the bed file).
import sys
import re
import collections
from collections import defaultdict
import argparse
from argparse import ArgumentParser


class Files:
    """Bundle every input/output file and iterate over mutated mutpos lines.

    Each iteration yields the next mutPosLine with at least one
    substitution, after the FASTA window and the list of open BED regions
    have been moved to that line's chromosome and position.
    """

    def __init__(self, inBed, inFasta, inCode, inMutPos, outFileName, minC, maxC):
        self.lineNum = 0
        tmpMutPos = open(inMutPos, 'r')
        self.MutPos = mutPosFile(tmpMutPos)
        tmpFasta = open(inFasta, 'r')
        self.Fasta = fastaWin(tmpFasta)
        # BedFile and TransTable read their file completely and close it.
        tmpBed = open(inBed, 'r')
        self.Bed = BedFile(tmpBed)
        tmpTable = open(inCode, 'r')
        self.Table = TransTable(tmpTable)
        self.OutFile = outputFile(outFileName)
        self.minClonality = minC
        self.maxClonality = maxC

    def __iter__(self):
        return(self)

    def next(self):
        """Return the next mutated mutPosLine; raise StopIteration when done."""
        endTest = False
        while endTest == False:
            if self.MutPos.line.fileEnd == False:
                endTest = self.MutPos.next()
            else:
                break
        if self.MutPos.line.fileEnd:
            raise StopIteration
        fastaTest = self.Fasta.advance(endChr = self.MutPos.chrom, endPos = self.MutPos.pos)
        if fastaTest == False:
            sys.stderr.write("End of fasta file!\n")
            raise StopIteration
        self.Bed.move(endChr = self.MutPos.chrom, endPos = self.MutPos.pos)
        self.lineNum += 1
        if self.lineNum % 10000 == 0:
            print('%s processed' % self.lineNum)
        return(self.MutPos.line)

    # Python 3 spells the iterator method __next__; keep both names.
    __next__ = next

    def close(self):
        self.MutPos.close()
        self.Fasta.close()
        self.OutFile.close()


class mutPosFile:
    """Sequential reader that skips mutpos lines without substitutions."""

    def __init__(self, inFile):
        self.file = inFile
        self.chrom = ""
        self.pos = 1
        # "-" builds a placeholder record so fileEnd can be tested before
        # the first real read.
        self.line = mutPosLine("-",0)
        self.lineNum = 0

    def next(self):
        """Advance to the next line with a T, C, G or A mutant count.

        Returns True when such a line was loaded into self.line and False
        at end of file.
        """
        self.lineNum += 1
        self.line = mutPosLine(self.file.readline(), self.lineNum)
        while self.line.fileEnd == False and self.line.Ts == 0 and self.line.Cs == 0 and self.line.Gs == 0 and self.line.As == 0:
            self.lineNum += 1
            self.line = mutPosLine(self.file.readline(), self.lineNum)
        if self.line.fileEnd:
            return(False)
        self.pos = self.line.pos
        self.chrom = self.line.chrom
        return(True)

    def __str__(self):
        return(str(self.line))

    def close(self):
        self.file.close()
        return(True)


class mutPosLine:
    """One parsed mutpos record.

    inLine == "-" builds a placeholder (fileEnd False), inLine == "" builds
    the end-of-file marker (fileEnd True); anything else is split on
    whitespace and parsed as a mutpos line.
    """

    def __init__(self, inLine, inLineNum):
        # Defaults shared by the placeholder and the end-of-file marker.
        self.fileEnd = (inLine == "")
        self.lineNum = -1
        self.chrom = ''
        self.refBase = ''
        self.pos = -1
        self.depth = -1
        self.muts = -1
        self.Ts = -1
        self.Cs = -1
        self.Gs = -1
        self.As = -1
        self.ins = -1
        self.dels = -1
        self.clonality = 0
        if inLine == "-" or inLine == "":
            return

        linebins = inLine.split()
        self.lineNum = inLineNum
        self.chrom = linebins[0]
        self.refBase = linebins[1].upper()
        self.pos = int(linebins[2])
        self.depth = int(linebins[3])
        self.muts = int(linebins[4])
        self.Ts = int(linebins[5])
        self.Cs = int(linebins[6])
        self.Gs = int(linebins[7])
        self.As = int(linebins[8])
        self.ins = int(linebins[9])
        self.dels = int(linebins[10])
        # Clonality is the largest single-base mutant fraction. A depth of
        # zero would divide by zero, so such a line gets clonality 0.
        if self.depth:
            self.clonality = max(float(count) / self.depth
                                 for count in (self.Ts, self.Cs, self.Gs, self.As))

    def __str__(self):
        return('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (self.lineNum, self.chrom, self.refBase, self.pos, self.depth, self.Ts, self.Cs, self.Gs, self.As, self.ins, self.dels))


class outputFile:
    """Lazily opened text output; records are separated by newlines."""

    def __init__(self, outFileLoc):
        self.fileLoc = outFileLoc
        self.outFile = None
        self.firstLine = True

    def open(self):
        self.outFile = open(self.fileLoc, 'w')
        return(True)

    def write(self, strToWrite):
        """Write one record; no newline precedes the first or follows the last."""
        if self.firstLine == True:
            self.outFile.write("%s" % (strToWrite))
            self.firstLine = False
        else:
            self.outFile.write("\n%s" % (strToWrite))
        return(True)

    def close(self):
        # close() may be reached before open(), e.g. during error cleanup.
        if self.outFile is not None:
            self.outFile.close()
        return(True)


class TransTable:
    """Codon -> three-letter amino acid lookup read from a transtable file."""

    def __init__(self, inTable):
        self.table = {}
        self.name = ""
        self.file = inTable
        for line in self.file:
            if ">>" in line:
                self.name = line.strip().strip(">>")
                continue
            linebins = line.split()
            if len(linebins) < 2:
                # blank or incomplete row
                continue
            for codon in linebins[1].split(","):
                self.table[codon] = linebins[0]
        self.file.close()

    def translateCodon(self, codon):
        """Return the amino acid for codon.

        "NAC" is returned for codons containing N and for incomplete codons
        (fewer than three bases, e.g. past the end of the reference).
        """
        if "N" in codon or len(codon) != 3:
            return("NAC")
        return(self.table[codon])


class ROI:
    """One region of interest (gene) parsed from a BED line."""

    def __init__(self, inLine):
        linebins = inLine.split()
        self.chrom = linebins[0]
        self.start = int(linebins[1])
        self.stop = int(linebins[2])
        self.name = linebins[3]
        self.strand = linebins[5]

    def checkCoord(self, inCoord):
        """Locate inCoord within this region's reading frame.

        Returns (codonStart, codonPos, AAnum, strand, name). For strands
        other than "+" and "-" the first three values are -1.
        """
        # NOTE(review): start is used as the 1-based position of the first
        # coding base, whereas standard BED starts are 0-based - confirm
        # against the BED file actually used.
        codonStart = -1
        codonPos = -1
        AAnum = -1
        if self.strand == "+":
            offset = inCoord - self.start
            codonStart = self.start + 3 * (offset // 3)
            codonPos = offset % 3
            AAnum = (offset // 3) + 1
        elif self.strand == "-":
            offset = self.stop - inCoord
            codonStart = self.stop - 3 * (offset // 3)
            codonPos = offset % 3
            AAnum = (offset // 3) + 1
        return((codonStart, codonPos, AAnum, self.strand, self.name))


class BedFile:
    """All regions of a BED file plus the subset open at the current position."""

    def __init__(self, inFile):
        self.file = inFile
        self.ROIs = []
        self.openROIs = []
        self.chrom = ''
        self.pos = 1
        for line in self.file:
            if "#" in line or not line.strip():
                continue
            tmpROI = ROI(line)
            if self.chrom == '':
                self.chrom = tmpROI.chrom
            self.ROIs.append(tmpROI)
        self.file.close()

    def checkCodon(self, mut):
        """Return one coordinate tuple per open region containing mut.pos.

        When no region is open a single (-1, -1, -1, '.') entry is returned.
        """
        Coords = [region.checkCoord(mut.pos) for region in self.openROIs]
        if Coords == []:
            Coords = [(-1, -1, -1, '.')]
        return(Coords)

    def move(self, endChr = None, endPos = None):
        """Move to a chromosome and/or position and refresh openROIs.

        With no arguments the position advances by one; a chromosome alone
        restarts at position 1.
        """
        if endChr is None and endPos is None:
            self.pos += 1
        elif endPos is None:
            self.chrom = endChr
            self.pos = 1
        elif endChr is None:
            self.pos = endPos
        else:
            self.chrom = endChr
            self.pos = endPos
        # Rebuilding the list avoids removing entries by index from a list
        # that shrinks while it is being processed.
        self.openROIs = [region for region in self.ROIs
                         if region.chrom == self.chrom
                         and region.start <= self.pos <= region.stop]


class fastaWin:
    """
    This class enables most of the operations carried out by the program;
    it handles the rolling window and sequence retrieval sections.

    The window holds two FASTA lines' worth of bases in self.data and is
    slid forward one line at a time; minPos/maxPos are the 1-indexed
    reference positions of its left and right edges.
    """

    def __init__(self, dataSource, bufferSize = 3):
        # bufferSize is accepted for compatibility; a fixed margin of three
        # bases (one codon) is used when advancing.
        self.sourceFile = dataSource
        # initialize the window in the first chromosome
        line = self.sourceFile.readline().strip().split(">")[1].split(" ")[0]
        self.chrom = line
        self.usedChrs = []  # chromosomes already passed by advance()
        line = self.sourceFile.readline().strip().upper()
        self.sizeMax=2*len(line)  # size of the window
        self.eof=False  # has the end of the file been reached?
        self.really=False  # has the trailing line of N's been added?
        # how long is one line in this reference genome anyway?
        self.lLength=len(line)
        # do one line of N's to make sure that any mutations near
        # the beginning can be processed, then load in the first line
        self.data = list('N'*self.lLength) + list(line)
        # the leftmost position on the reference genome. 1-indexed
        self.minPos=1-self.lLength
        # the rightmost position on the reference genome. 1-indexed
        self.maxPos=self.lLength
        self.pos = 1

    def advance(self, endChr = None, endPos = None):
        """Move the window to a chromosome and/or position.

        With no arguments the position advances by one base. Returns False
        when endChr is not found before the end of the file or when endPos
        lies beyond the end of the reference, True otherwise.
        """
        reachable = True
        if endChr is None and endPos is None:
            self.pos += 1
            if self.pos > self.maxPos - 3:
                self.moveWin()
        else:
            if endChr is not None:
                while self.chrom != endChr and self.chrom != '':
                    self.usedChrs.append(self.chrom)
                    self.chrom = self.NewChrom()
            if endPos is not None:
                self.pos = endPos
                if self.chrom != '':
                    reachable = self._slideToPos()
        return(self.chrom != '' and reachable)

    def _slideToPos(self):
        """Slide until self.pos is three bases left of the window edge.

        Returns False once the reference is exhausted; without that check a
        position past the end of the sequence would loop forever.
        """
        while self.pos > self.maxPos - 3:
            if self.moveWin() == False:
                return(False)
        return(True)

    def _shiftIn(self, inSeq):
        """Overwrite the older half of the window with one line of bases."""
        self.maxPos+=self.lLength
        self.minPos+=self.lLength
        wPos=self.minPos%self.sizeMax-1
        for offset, base in enumerate(inSeq):
            self.data[wPos + offset]=base

    def _baseAt(self, refPos):
        """Return the base stored for 1-indexed reference position refPos."""
        b=(refPos+self.lLength)%self.sizeMax-1
        if b == -1:
            b = self.sizeMax - 1
        return(str(self.data[b]))

    def moveWin(self):
        """Slide the window forward by one FASTA line.

        Returns True after loading a line, after first noticing the end of
        the file, and after appending the single trailing line of N's;
        returns False once nothing more can be added.

        Raises ValueError for a line longer than the first sequence line.
        """
        # NOTE(review): a ">" header of a following record is not recognised
        # here and would be loaded as sequence; only NewChrom() interprets
        # headers. Confirm multi-record references are handled upstream.
        dataSource = self.sourceFile.readline().strip().upper()
        if self.eof == False:
            if dataSource=="":
                self.eof=True
            else:
                if len(dataSource) > self.lLength:
                    raise ValueError('FASTA line of %s characters exceeds the line length of %s' % (len(dataSource), self.lLength))
                # short (final) lines are padded with N
                self._shiftIn(dataSource.ljust(self.lLength, "N"))
            return(True)
        if self.really == False:
            # one extra line of N's lets codons at the very end of the
            # reference still be read
            self._shiftIn("N"*self.lLength)
            self.really=True
            return(True)
        return(False)

    def getSeq(self, pos, direction):
        """Return the codon anchored at 1-indexed position pos.

        direction "+" reads pos..pos+2; direction "-" reads pos-2..pos and
        reverse-complements it. Returns "" for any other direction, or when
        the window cannot be moved far enough on the "+" strand.
        """
        pos = int(pos)
        if direction == "-":
            return(revComp("".join(self._baseAt(i) for i in range(pos + 1 - 3, pos + 1))))
        if direction == "+":
            while pos + 3 >= self.maxPos:
                if self.moveWin() == False:
                    return("")
            return("".join(self._baseAt(i) for i in range(pos, pos + 3)))
        return("")

    def NewChrom(self):
        """Skip to the next FASTA header and reset the window on it.

        Returns the new chromosome name, or '' at end of file.
        """
        while True:
            rawLine = self.sourceFile.readline()
            if rawLine == "":
                # true end of file; a blank line does not stop the search
                return('')
            if ">" in rawLine:
                break
        self.chrom=rawLine.strip().split(">")[1].split(" ")[0]
        newData = self.sourceFile.readline().strip().upper()
        self.lLength = len(newData)
        self.data = list('N'*self.lLength) + list(newData)
        self.minPos=1-self.lLength
        self.maxPos=self.lLength
        self.sizeMax = 2 * self.lLength
        return(self.chrom)

    def close(self):
        self.sourceFile.close()
        return(True)


class Mutation:
    """The substitutions seen at one position and their coding consequences."""

    def __init__(self,inMutPosLine):
        self.chrom = inMutPosLine.chrom
        self.pos = inMutPosLine.pos
        self.refNt = inMutPosLine.refBase
        # One flag per possible mutant base.
        self.T = inMutPosLine.Ts != 0
        self.C = inMutPosLine.Cs != 0
        self.G = inMutPosLine.Gs != 0
        self.A = inMutPosLine.As != 0
        self.name = '.'
        self.codonCoords = (0, 0, 0)
        self.direction = "*"
        self.refCodon = ""
        self.mutCodonT = ""
        self.mutCodonC = ""
        self.mutCodonG = ""
        self.mutCodonA = ""
        self.refAA = ""
        # "Ref" marks the reference base's own column, "NP" = not present.
        self.TmutAA = "NP" if self.refNt != 'T' else "Ref"
        self.CmutAA = "NP" if self.refNt != 'C' else "Ref"
        self.GmutAA = "NP" if self.refNt != 'G' else "Ref"
        self.AmutAA = "NP" if self.refNt != 'A' else "Ref"
        self.syn = 0
        self.nonSyn = 0
        # Compact nucleotide notation, e.g. "G4A,G4T"; '.' when empty.
        compactCore = self.refNt + str(self.pos)
        seenBases = [base for base, seen in (('T', self.T), ('C', self.C), ('G', self.G), ('A', self.A))
                     if seen and self.refNt != base]
        if seenBases:
            self.ntCompact = ','.join(compactCore + base for base in seenBases)
        else:
            self.ntCompact = '.'
        self.aaCompact = '.'

    def setCoords(self, inCoords):
        """Attach codon coordinates from ROI.checkCoord().

        For a region on the "-" strand the reference base, the mutant-base
        flags and the Ref/NP column labels are all complemented so that they
        describe the coding strand. ntCompact is left as built in __init__.
        """
        self.codonCoords = inCoords
        self.direction = inCoords[3]
        self.name = inCoords[4]
        if self.direction == "-":
            self.T, self.A = self.A, self.T
            self.C, self.G = self.G, self.C
            self.TmutAA, self.AmutAA = self.AmutAA, self.TmutAA
            self.CmutAA, self.GmutAA = self.GmutAA, self.CmutAA
            self.refNt = revComp(self.refNt)
        return(True)

    def defineCodons(self, Fasta, Table):
        """Build the reference and mutant codons and translate them.

        Fasta -- fastaWin positioned on this mutation
        Table -- TransTable used for translation
        Updates the codon/AA attributes, the syn/nonSyn counts and aaCompact.
        """
        aaAbrev = {'Ala':'A', 'Arg':'R', 'Asn':'N', 'Asp':'D', 'Cys':'C', 'Gln':'Q', 'Glu':'E', 'Gly':'G', 'His':'H', 'Ile':'I', 'Leu':'L', 'Lys':'K', 'Met':'M', 'Phe':'F', 'Pro':'P', 'Ser':'S', 'Thr':'T', 'Trp':'W', 'Tyr':'Y', 'Val':'V', 'Asx':'B', 'Glx':'Z', 'Ter':'Ter', 'NAC':'NAC'}
        self.refCodon = Fasta.getSeq(self.codonCoords[0], self.direction)
        self.refAA = Table.translateCodon(self.refCodon)
        hitIndex = self.codonCoords[1]
        for base in "TCGA":
            if not getattr(self, base):
                continue
            # Replace the mutated position of the reference codon.
            mutCodon = "".join(base if index == hitIndex else refBase
                               for index, refBase in enumerate(self.refCodon))
            mutAA = Table.translateCodon(mutCodon)
            setattr(self, "mutCodon" + base, mutCodon)
            setattr(self, base + "mutAA", mutAA)
            if mutAA == self.refAA:
                self.syn += 1
            else:
                self.nonSyn += 1
        # Compact amino acid notation, e.g. "A2T"; only actual changes.
        compactStarter = aaAbrev[self.refAA] + str(self.codonCoords[2])
        for elmt in (self.TmutAA, self.CmutAA, self.GmutAA, self.AmutAA):
            if elmt in ('NP', 'Ref', self.refAA):
                continue
            if self.aaCompact == '.':
                self.aaCompact = compactStarter + aaAbrev[elmt]
            else:
                self.aaCompact += ',' + compactStarter + aaAbrev[elmt]

    def setNonCoding(self):
        """Mark the mutation as lying outside every region ("NC")."""
        self.codonCoords = (0, 0, "NC", '*')
        self.TmutAA = "NC"
        self.CmutAA = "NC"
        self.GmutAA = "NC"
        self.AmutAA = "NC"
        self.refAA = "NC"

    def __str__(self):
        strToReturn = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (self.chrom, self.pos, self.direction, self.refNt, self.ntCompact, self.name, self.codonCoords[2], self.refAA, self.TmutAA, self.CmutAA, self.GmutAA, self.AmutAA, self.aaCompact)
        return(strToReturn)


def revComp(inSeq):
    """Return the reverse complement of inSeq (A, C, G, T, N; any case)."""
    revCompTable = {"A": "T", "G": "C", "C": "G", "T": "A", "N": "N"}
    return("".join(revCompTable[base] for base in inSeq.upper()[::-1]))


def main():
    # Read in the list of arguments from the command line
    parser = ArgumentParser()
    parser.add_argument('-f', '--refrence', action = 'store', dest = 'ref', help = 'The reference genome in FASTA format.', required=True )
    parser.add_argument('-t', '--table', action = 'store', dest = 'table', help = 'The translation table for the organism. The first line should be a name for the table preceded by ">>", the second row should contain Met codons, and the third row should contain Ter codons. All rows other than the first should be: "[Three letter AA abreviation] [DNA codons leading to this AA]".', required=True )
    parser.add_argument('-p', '--positions', action = 'store', dest = 'mutPos', help = 'The list of where mutations are as a mutpos file.', required=True )
    parser.add_argument('-b', '--bedFile', action = 'store', dest = 'bedFile', help = 'The locations of various exons (ROIs) as a bed file.', required=True )
    parser.add_argument('-o', '--outFile', action = 'store', dest = 'outFile', help = 'A name for the output file.', required=True )
    parser.add_argument('-c', '--minClonality', action = 'store', type = float, dest = 'minClonal', default = 0, help = 'The minimum clonality at which to examine a mutation. [0]' )
    parser.add_argument('-C', '--maxClonality', action = 'store', type = float, dest = 'maxClonal', default = 1, help = 'The maximum clonality at which to examine a mutation. [1]' )
    o = parser.parse_args()

    # Initialize all files. The regions from the bed file and the
    # translation table are read in completely; the fasta and mutpos files
    # are advanced together while iterating.
    allFiles = Files(o.bedFile, o.ref, o.table, o.mutPos, o.outFile, o.minClonal, o.maxClonal)
    try:
        # Create a list of mutations: one record per (mutpos line, region).
        allMuts = []
        print('Starting checking mutations...')
        for line in allFiles:
            if not (o.minClonal <= line.clonality <= o.maxClonal):
                continue
            for Codon in allFiles.Bed.checkCodon(Mutation(line)):
                mut = Mutation(line)
                if Codon[0:3] != (-1, -1, -1):
                    mut.setCoords(Codon)
                    mut.defineCodons(allFiles.Fasta, allFiles.Table)
                else:
                    mut.setNonCoding()
                allMuts.append(mut)

        # Write all mutation records to the output file
        print('Writing output files...')
        allFiles.OutFile.open()
        totSyn = 0
        totNonSyn = 0
        for mut in allMuts:
            totSyn += mut.syn
            totNonSyn += mut.nonSyn
            allFiles.OutFile.write(str(mut))
    finally:
        allFiles.close()

    totalMuts = totSyn + totNonSyn
    print('Summary Statistics:')
    print('%s\tpotential amino acid changes:' % (totalMuts))
    print('%s\tsynonymous mutations' % totSyn)
    print('%s\tnon-synonymous mutations' % totNonSyn)


if __name__ == "__main__":
    main()

# (c) Amino acids and their corresponding codons table for human mitochondrial
# DNA (VertebrateMito.transtable). Remove the leading "# " from each row when
# saving the table as a file.
#
# >>VertebrateMito
# Met ATG,ATA Start
# Ter TAA,TAG,AGA,AGG Stop
# Ala GCT,GCC,GCA,GCG .
# Arg CGT,CGC,CGA,CGG .
# Asn AAT,AAC .
# Asp GAT,GAC .
# Cys TGT,TGC .
# Gln CAA,CAG .
# Glu GAA,GAG .
# Gly GGT,GGC,GGA,GGG .
# His CAT,CAC .
# Ile ATT,ATC .
# Leu TTA,TTG,CTT,CTC,CTA,CTG .
# Lys AAA,AAG .
# Phe TTT,TTC .
# Pro CCT,CCC,CCA,CCG .
# Ser TCT,TCC,TCA,TCG,AGT,AGC .
# Thr ACT,ACC,ACA,ACG .
# Trp TGA,TGG .
# Tyr TAT,TAC .
# Val GTT,GTC,GTA,GTG .