program CbyT; {= Character by Taxa = program for filtering for GBS and other genetic data } {$APPTYPE CONSOLE} {$R *.res} { Version is given in the 'Version' constant below. Copyright is maintained by the author (Nick Tinker) in right of Canada. Permission to use or modify is granted under the GNU general public license: http://www.gnu.org/licenses/gpl.txt A simple usage manual will be written to standard output by invoking with a single -h on command line (curemly it is also invoked when the program is run with no parameters at all) the manual can also be inspected within the routine below called "HelpMeMan" This program is formatted for compiling under Windows using the Delphi 10 development environment It contains minimal Windows- and Delphi-specific code so it could be modified to run in another environment. } uses System.SysUtils, DateUtils; const Version='2013-11-30'; MaxT=3000; {maximum taxa} MaxC=1500000; {maximum characters} TAB=char(9); type CA=array of byte; var Dat:array[1..MaxT] of CA; {store character states as characters!} TNameR:array[1..MaxT] of string; {Taxa Name expexted to be found in input file } TNameW:array[1..MaxT] of string; {taxa name to write (alias) optional, set to TNameR if not found in taxa file } TaxPos:array[1..MaxT] of integer; {which column the taxa owns, which Taxa the column owns } CName,CSeq,SNPVar:array[1..MaxC] of string; GNA,GNB,GNH,SNPPos:array[1..MaxC] of integer; {global vars for allele counts} MMAF,MHet,MPres:array[1..MaxC] of real; Redund:array[1..MaxC] of string; CAllele:array[1..2,1..MaxC] of char; NT,NC,NRedund:integer; MaxHet,MinMAF,MinPresP,MinPresN:integer; RawFN,TaxFN,RedundFN,OutFN:string; FType,MapDat:byte; HaploAnchors:array[1..MaxC] of string; NHaploAnchors:integer; LNames,LSeqs:array of string; UseNames:boolean; {search and use existing GBS locus names, invent new names for unique loci} LocNameFN:string; StartID,NLNames:LongInt; DateVar:TDateTime; LNPrefix:string; procedure HelpMeMan; {outputs simple help manual to standard output, invoked by '-h'} begin writeln('-------------------------------------------------------------------------------'); writeln('Simple usage manual for program "CbyT" (=Character by Taxa). '); writeln('Written by: Nick Tinker (nick.tinker@agr.gc.ca)'); writeln('Purpose: filtering for GBS and other genetic data'); writeln('Copyright: N Tinker in right of Canada'); writeln(' Permission to use or modify under the GNU General Public License:'); writeln(' http://www.gnu.org/licenses/gpl.txt '); writeln('-------------------------------------------------------------------------------'); writeln('Command line parameters for CbyT must be set in the following order:'); writeln(''); writeln('1: Raw file name – omit extensions for hapmap e.g. “c:\uneak\hapmap\hapmap”'); writeln('2: Taxa file name – see details below'); writeln('3: File with known redundant loci (optional, type "null" if not used)'); writeln('4: Output file name'); writeln('5: File type (1= Jesse; 2=hapmap)'); writeln('6: “1” for map data, or “0” for diversity data (explained below)'); writeln('7: Threshold for maximum heterozygosity as % (integer)'); writeln('8: Threshold for minimum minor allele frequency = MAF as % (integer)'); writeln('9: Threshold for minimum completeness of scores as % (integer)'); writeln('10: Optional: locus nomenclature file (see below)'); writeln('11: Optional: prefix for naming new loci (string), default = “avgbs”'); writeln('-------------------------------------------------------------------------------'); writeln('Details:'); writeln(' -CbyT will read a raw data file produced from Jesse Poland’s GBS pipeline '); writeln(' or from the UNEAK GBS pipeline (hapmap format). '); writeln(' -It will filter the data based on input parameters, '); writeln(' output written to several files in various formats.'); writeln('Taxa File:'); writeln(' -simple text file, no header, lists taxa names from the source file. '); writeln(' -names can be listed in the order you want them to appear in the output'); writeln(' -if name is not found, missing data is written'); writeln(' -optional second colum lets you specify new name for output'); writeln('Map vs. diversity data:'); writeln(' -Diversity data is written for all taxa '); writeln(' -if you specify mapping data, the first two taxa must be mapping parents'); writeln(' -mapping parents are not written, used to put the data in parental phase'); writeln('Locus nomenclature file (optional): '); writeln(' -used to name SNPs in the filtered output'); writeln(' -format is: name sequence SNP-bases length SNP-position'); writeln(' -if used, new loci will be named sequenctially and appended'); writeln('# sample CbyT nomenclature file (line starting with # is a comment)'); writeln('avgbs_11 TGCAGTTTTRTTTCTTCAGAAAATTAGTTT A/G 30 10'); writeln('avgbs_12 TGCAGTTTTTYTACTTCCAGAATGAACTTGAA C/T 32 11'); writeln(''); writeln(''); end; procedure HaltError(ES:string); begin writeln('Error: ',ES); writeln('Press enter to halt program'); readln; Halt; end; procedure ResetData; var T:longInt; begin writeln('Initializing Arrays ... '); for T:=1 to MaxT do begin setlength(Dat[T],1); TaxPos[T]:=0; end; writeln('Done initializing . . . . '); end; procedure CopyUntil(var S,T:string; F:char); {read S into T until reaching F} var C1,C2:char; begin T:=''; if (length(S)>0) and (S[1]=F) and (F<>' ') then delete(S,1,1) else begin C1:='$'; if length(S)>0 then repeat C1:=C2; C2:=S[1]; T:=T+C2; delete(S,1,1) until (length(S)=0) or (S[1]=F); if length(S)>1 then delete(S,1,1); end; end; procedure ReadLNames(FName:string); var F:text; S,TS:string; N,NID,MaxID:LongInt; begin assign(F,FName); reset(F); N:=0; MaxID:=0; repeat readln(F,S); if (length(S)>0) and (S[1]<>'#') then begin N:=N+1; SetLength(LNames,N+1); SetLength(LSeqs,N+1); CopyUntil(S,LNames[N],TAB); CopyUntil(S,LSeqs[N],TAB); {the remainder of S is not used} S:=LNames[N]; {parse the name to get the numeric ID} repeat CopyUntil(S,TS,'_'); until S=''; NID:=StrToInt(TS); if NID>MaxID then MaxID:=NID; end; until EOF(F); close(F); if MaxID<200000 then StartID:=200000 else StartID:=MaxID; {this finds a new starting point for unique numbers. we always make new numbers > 200K to preserve special numbers UNEAK IDs will therefore always start new numbers at least at 200000} NLNames:=N; end; procedure RenameLoci(FName:string); {optional - find and use exising locus manes} var C,L:LongInt; TS:string; Found:boolean; F:text; begin writeln('Now we are renaming loci based the locus nomenclature file: ',FName); writeln('This will take some time (we need to search each of ',NC,' new loci against ',NLNames,' sequences)'); writeln('We will then append new unique locus names to this file.'); writeln('You should update your database when complete.'); writeln('Each dot is 100 sequences searched.'); assign(F,FName); append(F); DateVar:=Now; writeln(F,'# New SNPs, ',FormatDateTime('yyyy/mmmm/dd',DateVar),' while reading file: ',RawFN); for C:=1 to NC do begin if C mod 100 = 0 then write('.'); L:=0; Found:=false; repeat L:=L+1; Found:=CSeq[C]=LSeqs[L]; until (L=NLNames) or Found; if Found then begin CName[C]:=LNames[L]; {write('f'); } end else begin StartID:=StartID+1; CName[C]:=LNPrefix+'_'+IntToStr(StartID); writeln(F,CName[C],TAB,CSeq[C],TAB,SNPVar[C],TAB,length(CSeq[C]),TAB,SNPPos[C]); end; end; close(F); end; Procedure ReadTaxa(FN:string); {read a list of target taxa to filter} var InF:text; S,TS:string; begin assign(InF,FN); reset(InF); NT:=0; repeat NT:=NT+1; readln(InF,S); CopyUntil(S,TNameR[NT],TAB); if length(S)>0 then TNameW[NT]:=S else TNameW[NT]:=TNameR[NT]; {older taxa lists are just one name. New lists may have a write alias} until eof(InF); MinPresN:=round(MinPresP*NT/100); close(InF); end; function GetAmbig(S:string; BP:integer; SNP:string):String; {return ambiguous base for FASTA file} var AB:char; begin if (SNP='G/T') or (SNP='T/G') then AB:='K' {Keto} else if (SNP='A/C') or (SNP='C/A') then AB:='M' {Amino} else if (SNP='A/G') or (SNP='G/A') then AB:='R' {Purine} else if (SNP='C/T') or (SNP='T/C') then AB:='Y' {Pyrimidine} else if (SNP='C/G') or (SNP='G/C') then AB:='S' {Strong} else if (SNP='A/T') or (SNP='T/A') then AB:='W'; {Weak} if length(S)>=(BP) then S[BP]:=AB; GetAmbig:=S; end; procedure GetPos(TS1,TS2:string; A1,A2:char; var Pos:integer); {this routine finds the position of the (first) SNP based on two sequenced alleles because the POS value is not reported in the UNEAK oputput} var C:integer; Found:boolean; begin if (length(TS1)>0) and ( length(TS1) = length(TS2) ) then begin C:=0; Pos:=0; Found:=false; repeat C:=C+1; Found:=( (TS1[C]<>TS2[C]) and ( ((TS1[C]=A1)and(TS2[C]=A2)) or ((TS1[C]=A2)and(TS2[C]=A1)) ) ); if Found then Pos:=C; until Found or (C=length(TS1)); if not Found then begin {HaltError('SNP in UNEAK file not found for alleles provided'); } writeln(TS1,TAB,'SNP not found - may have been length PM ?'); end; end else HaltError('conditions for GetAmbigSeqAndPos not met'); end; procedure ReadDat(FN:string; FType:Byte); { samples of four file types are provided below for help in debugging the code..... FType 1: Jesse Poland Format.................. rsOrig rs alleles snp_pos MAF alleleA alleleB het present Taxa1 Taxa2 Taxa3 Taxa4 Taxa5 Taxa6 TGCAGAAAAAAAAAAAACGAAATTGGCCGACTAGCTACGCTTACCGAGATCGGAAGAGCGGTTC avGBS1 A/C 16 0.015 1067 17 4 0.774 A C A A A A TGCAGAAAAAAAAAAGATATCGTGTCAGTGGAGCTGATCATTACTTGGGGGAGGAAGGAGATAA avGBS3 G/A 32 0.182 206 881 44 0.748 N A G A A A FType 1: New format (Jesse dropped the locus name from first field *** I put it back in !!!! (but before rsOrig) *************) rs rsOrig alleles alleleA alleleB het present MAF HET X001A1-24-2-4-1-3 X00Ab6112 ********** has extra header line where I substitured new names ********************** FType 2: hapmap.hmp.txt rs alleles chrom pos strand assembly center protLSID assayLSID panelLSID QCcode T1 T2 T3 T4 TP1 G/T 0 1 + NA UNEAK GBS NoRef Custom QC+ K T K T TP2 C/T 0 2 + NA UNEAK GBS NoRef Custom QC+ N Y N T hapmap.hmc.txt rs T1 T2 T3 T4 HetCount_allele1 HetCount_allele2 Count_allele1 Count_allele2 Frequency TP1 1|2 0|1 2|1 0|2 17 35 17 43 0.283 TP2 0|0 1|1 0|0 0|1 4 3 6 15 0.286 hapmap.fas.txt >TP1_query TGCAGAAAAAAAAAACGGCTGCGCCGATCAATCTGGATAGCAAGGGGTGCTACGCAGGGCCGAG >TP1_hit TGCAGAAAAAAAAAACGGCTGCGCCGATCAATCTGGATAGCAAGGTGTGCTACGCAGGGCCGAG FType 3: Arsh.Calls.txt ID Sun-II-1 Rigodon-AC TAM-O-301 Calibre TAM-O-397 Furlong MN841801-1 Prescott g5-17 2 3 3 3 3 3 Arsh.seqs.txt ID Seq-1 Seq-2 N_SNP P_SNP SNPs MAF HET MIS Count_Seq-1 Count_Seq-2 G5_17 TGCAGAAAAAAAAAAAACGAA... TGCAGAAAAAAAAAAAACGTA... 1 15 A\T 5 9 13 44 1824 FType 4: Simple calls from SNP data - just locus and genotypes } var InF,FasF:text; RDND,T,TP,NTP,MaxTP,NCC,C,P,SeqL1,SeqL2,SeqL:integer; S,FS,TS1,TS2,TS3,TCName,Seq1,Seq2,SNStr:string; TC:char; GT:Byte; NPar,NProg,NA,NH,NB,NG:integer; {genotype counts} Pos:integer; PresP,HET,MAF:real; TmpDat:array[1..MaxT] of byte; {hold data for one character until it is considered valid} IsRedund:boolean; begin if (FType=2) then begin {UNEAK} assign(InF,FN+'.hmp.txt'); assign (FasF,FN+'.fas.txt'); reset(FasF); end else if FType=3 then begin {Arsh} assign(InF,FN+'.calls.txt'); assign(FasF,FN+'.seqs.txt'); reset(FasF); readln(FasF); end else assign(InF,FN); reset(InF); repeat Readln(InF,S); until (length(S)>1) and (S[1]<>'#'); {if FType=1 then readln(InF); } {read past old taxa names .... special ............} case FType of 1: for P:=1 to 10 do CopyUntil(S,TS1,TAB); {new ftype} 2: for P:=1 to 11 do CopyUntil(S,TS1,TAB); 3,4: CopyUntil(S,TS1,TAB); {copy past locus header, the rest of the header string contains taxa names} end; NTP:=0; MaxTP:=0; repeat CopyUntil(S,TS1,TAB); NTP:=NTP+1; if length(TS1)>0 then begin for T:=1 to NT do if TS1=TNameR[T] then begin TaxPos[NTP]:=T; if NTP>MaxTP then MaxTP:=NTP; end; end; until length(S)=0; { for P:=1 to NT do writeln(TaxName[P]); } if (MapDat=1) then NPar:=2 else NPar:=0; writeln('Reading character Data for ',NT-NPar,' taxa'); NCC:=0; NC:=0; {we need to read parental data but not count it in the statistics!!! we use "start" for that} repeat readln(InF,S); NCC:=NCC+1; if NCC mod 1000=0 then write(' ',NCC,' ') else if NCC mod 100=1 then write('.'); if FType=1 then begin CopyUntil(S,TCName,TAB); CopyUntil(S,Seq1,TAB); { TCName:='avjp'+IntToStr(NCC); } end else CopyUntil(S,TCName,TAB); if (FType=2) then begin readln(FasF,FS); CopyUntil(FS,TS1,'>'); CopyUntil(FS,TS1,'_'); if TS1<>TCName then begin writeln('oops, seq #1 does not match - hit enter to halt'); readln; halt; end; CopyUntil(FS,TS2,'_'); {ts2='query'} if TS2<>'query' then begin writeln('oops, seq#1 fasta file seems to be out of order - hit enter to halt'); readln; halt; end; SeqL1:=StrToInt(FS); readln(FasF,Seq1); readln(FasF,FS); {read and parse fasta name of second allele seq} CopyUntil(FS,TS1,'>'); CopyUntil(FS,TS1,'_'); if TS1<>TCName then begin writeln('oops, seq #2 does not match - hit enter to halt'); readln; halt; end; CopyUntil(FS,TS2,'_'); {ts2='query'} if TS2<>'hit' then begin writeln('oops, seq #2 (hit) fasta file seems to be out of order - hit enter to halt'); readln; halt; end; SeqL2:=StrToInt(FS); if seqL2>SeqL1 then SeqL:=SeqL2 else SeqL:=SeqL1; {sometimes the allele seqs are different lengths!} readln(FasF,Seq2); {second allele - we will make ambig seq and POS later on} SetLength(Seq1,SeqL); SetLength(Seq2,SeqL); {we remove AAAAA padding here to match with Jesse's sequences and avoid errors in assay design} end else if FType=3 then begin { Arsh's files are not synchronized (yet) ....................... readln(FasF,FastaName); CopyUntil(FastaName,TS1,TAB); if TS1<>TCName then begin writeln('oops, seq does not match- hit enter to halt'); readln; halt; end; CopyUntil(FastaName,TSeq,TAB); } end; {use redundant flag to make a smaller subset for Benazir} if NHaploAnchors>0 then begin IsRedund:=true; for RDND:=1 to NHaploAnchors do if TCName=HaploAnchors[RDND] then IsRedund:=false; end else begin IsRedund:=false; for RDND:=1 to NRedund do if TCName=Redund[RDND] then IsRedund:=true; end; if IsRedund then NCC:=NCC-1 else begin if FType in [1,2] then begin CopyUntil(S,TS1,TAB); SNStr:=TS1; CopyUntil(TS1,TS2,'/'); if length(TS2)>0 then CAllele[1,NCC]:=TS2[1] else CAllele[1,NCC]:='X'; if length(TS1)>0 then CAllele[2,NCC]:=TS1[1] else CAllele[2,NCC]:='X'; case FType of 1: begin {snp_pos alleleA alleleB het MAF HET present } CopyUntil(S,TS1,TAB); POS:=StrToInt(TS1)+1; {****** careful!!!!!!!!!!!!!!!} {careful Jesse counts from 0, but might change this someday} for P:=1 to 6 do CopyUntil(S,TS1,TAB); {might want to store some of this too} end; 2: begin for P:=1 to 9 do CopyUntil(S,TS1,TAB); { ........ there is no SNPpos - POS means chromosome position (?) .............. we need to extract SNP position from the UNEAK FASTA sequences} GetPos(Seq1,Seq2,CAllele[1,NCC],CAllele[2,NCC],POS); end; end; end; for T:=1 to NT do TmpDat[T]:=0; for TP:=1 to MaxTP do begin CopyUntil(S,TS1,TAB); if TaxPos[TP]>0 then begin if length(TS1)=2 then begin {for AA BB AB genotypes} if TS1='AA' then TC:='A' else if TS1='BB' then TC:='B' else if TS1='AB' then TC:='H' else TC:='N' end else if length(TS1)=1 then TC:=TS1[1] else TC:='N'; if FType=3 then case TC of '1': GT:=1; '2': GT:=2; '3': GT:=3; '0': GT:=0; else GT:=0; end else if FType=4 then case TC of 'A','1': GT:=1; 'H','2': GT:=2; 'B','3': GT:=3; '.','0','N': GT:=0; else GT:=0; end else begin if TC='N' then GT:=0 else if TC=CAllele[1,NCC] then GT:=1 else if TS1[1]=CAllele[2,NCC] then GT:=3 else GT:=2; { 'H' and all other ambiguous codes will be called heterozygotes } end; end; TmpDat[TaxPos[TP]]:=GT; end; NA:=0; NB:=0; NH:=0; for T:=NPar+1 to NT do case TmpDat[T] of 1:NA:=NA+1; 2:NH:=NH+1; 3:NB:=NB+1; end; NG:=NA+NB+NH; NProg:=NT-NPar; PRESP:=100*NG/NProg; if NG>0 then begin MAF:=( 100*( (NA*2)+NH) / (2*NG) ); HET:=100*NH/NG; end else begin MAF:=0; Het:=0; end; if MAF>50 then MAF:=100-MAF; if ( (HET<=MaxHet) and (MAF>=MinMAF) and (PRESP>=MinPresP) ) then begin NC:=NC+1; CName[NC]:=TCName; if FType<3 then CSeq[NC]:=GetAmbig(Seq1,Pos,SNStr); {******************* new *******************************} GNA[NC]:=NA; GNB[NC]:=NB; GNH[NC]:=NH; MMAF[NC]:=MAF; MHet[NC]:=HET; MPres[NC]:=PresP; SNPPos[NC]:=Pos; SNPVar[NC]:=SNStr; for T:=1 to NT do begin SetLength(Dat[T],NC+1); Dat[T,NC]:=TmpDat[T]; end; end; end; until eof(InF) or (NCC=MaxC); if NCC=MaxC then writeln('Warning: stopped reading when maximum character was reached'); writeln('Completed reading raw data for ',NCC,' characters (loci)'); close(InF); if FType=2 then close(FasF); end; procedure WriteDat(FN:string; MapDat:byte); {if MAP is true, put markers in phase of taxa 1 x 2, omit these from output} var OutF,RPF,FasF,StatF,DarF1,DarF2:text; T,C,NID,NPar,PresTot,MinGeno:longint; P1,P2,PP:byte; S,TS:string; Flip,Ambig:boolean; begin if UseNames then RenameLoci(LocNameFN); DateVar:=Now; writeln('Writing filtered data to file: ',FN); assign(OutF,FN); rewrite(OutF); assign(RPF,FN+'.rphase.txt'); rewrite(RPF); assign(StatF,FN+'.stats.txt'); rewrite(StatF); if FType<4 then begin assign(FasF,FN+'.fas'); rewrite(FasF); end; if (MapDat=1) then NPar:=2 else NPar:=0; writeln(OutF,'Filtered data created by CbyT program (N Tinker)'); writeln(OutF,'Date of creation =',FormatDateTime('yyyy/mmmm/dd',DateVar)); writeln(OutF,NC,TAB,' = Number of filtered characters (please edit because reverse phase are in seperate file now)'); write(OutF,NT-Npar,TAB,' = Number of Taxa'); if NPar>0 then writeln(OutF,' (excluding ',NPAr,' parents)') else writeln(OutF); writeln(StatF,'Filtered data created by CbyT program (N Tinker)'); writeln(StatF,'Date of creation =',FormatDateTime('yyyy/mmmm/dd',DateVar)); writeln(StatF,'Input parameters .............. '); writeln(StatF,'Data file = ',RawFN); writeln(StatF,'Taxa file = ',TaxFN); writeln(StatF,'Redundant file = ',RedundFN); writeln(StatF,'FType = ',FType); writeln(StatF,'MapDat = ',MapDat); writeln(StatF,'MaxHet% = ',MaxHet); writeln(StatF,'MinMAF% = ',MinMAF); writeln(StatF,'MinPres% = ',MinPresP); writeln(StatF,'MinPresN = ',MinPresN); writeln(StatF,NC,TAB,' = Number of filtered characters'); write(StatF,NT-NPar,TAB,' = Number of Taxa'); if NPar>0 then writeln(StatF,'(excluding ',NPAr,' parents)') else writeln(StatF); writeln(StatF); writeln(StatF,'Attention: the SNP position here is reported in BP starting at 1 (not zero) '); writeln(StatF); writeln(StatF,'ID',TAB,'NID',TAB,'SNP',TAB,'Pos',TAB,'SeqL',TAB,'Seq',TAB,'NA',TAB,'NB',TAB,'NH',TAB,'PresN',TAB,'Pres%',TAB,'Het%',TAB,'MAF%'); writeln(StatF); for T:=NPar+1 to NT do write(OutF,TAB,TNameW[T]); writeln(OutF); for C:=1 to NC do begin { if FType<4 then AmbigSeq:=GetAmbig(CSeq[C],SNPPos[C],SNPVar[C]) else AmbigSeq:='???'; } { we will get ambig seq earlier now and replace cSeq with this } PresTot:=GNA[C]+GNB[C]+GNH[C]; S:=CName[C]; {shuld have tracked numeric ID, now we need to regenerate it} CopyUntil(S,TS,'_'); write(StatF,CName[C],TAB,SNPVar[C],TAB,SNPPos[C],TAB,length(CSeq[C]),TAB,CSeq[C],TAB,GNA[C],TAB,GNB[C],TAB,GNH[C],TAB,PresTot,TAB); writeln(StatF,MPres[C]:0:1,TAB,MHet[C]:0:1,TAB,MMAF[C]:0:1); if FType<4 then begin writeln(FasF,'>',CName[C]); writeln(FasF,CSeq[C]);; end; Flip:=false; Ambig:=false; if (MapDat=1) then begin P1:=Dat[1,C]; P2:=Dat[2,C]; PP:=P1*10+P2; Case PP of 00: begin Ambig:=true; Flip:=true; end; 01: begin Flip:=true; end; 02: begin Ambig:=true; Flip:=true; end; 03: begin end; 10: begin end; 11: begin Ambig:=true; Flip:=true; end; 12: begin end; 13: begin end; 20: begin Ambig:=true; Flip:=true; end; 21: begin Flip:=true; end; 22: begin Ambig:=true; Flip:=true; end; 23: begin end; 30: begin Flip:=true; end; 31: begin Flip:=true; end; 32: begin Flip:=true; end; 33: begin Ambig:=true; Flip:=true; end; End; { Ambig:=(P1=P2) or ((P1=2) and (P2=0)) or ( (P1=0) and (P2=2) ); {Ambig:=(P1=P2) or (P1=2) or (P2=2); } { revised this on Yung-Fens advice -- but causes probs in pops with het parent} { if not Ambig then Flip:=( (P1=3)and(P2<>3) or (P2=1)and(P1<>1) ); } end; if Ambig or (not Flip) then begin if Ambig then begin write(RPF,CName[C],'_p1'); for T:=Npar+1 to NT do write(RPF,TAB,Dat[T,C]); writeln(RPF); end else begin write(OutF,CName[C]); for T:=NPar+1 to NT do write(OutF,TAB,Dat[T,C]); writeln(OutF); end; end; {Ambig:=false; } {omit this line if you want ambigous markers written in both phases} if (Flip or Ambig) then begin if Ambig then begin write(RPF,CName[C],'_p2'); for T:=NPar+1 to NT do begin if Dat[T,C]=1 then write(RPF,TAB,3) else if Dat[T,C]=3 then write(RPF,TAB,1) else write(RPF,TAB,Dat[T,C]); end; writeln(RPF); end else begin write(OutF,CName[C]); for T:=NPar+1 to NT do begin if Dat[T,C]=1 then write(OutF,TAB,3) else if Dat[T,C]=3 then write(OutF,TAB,1) else write(OutF,TAB,Dat[T,C]); end; writeln(OutF); end; end; end; close(OutF); if FType<4 then close(FasF); close(StatF); Close(RPF); writeln; {write DARwin file........ should put a switch in case DARWin is not needed.................} {@DARwin 5.0 - ALLELIC -2 2 6 N° M1 M1 M11 M11 M15 M15 1 8 8 1 1 2 2 2 8 1 1 3 1 2 @DARwin 5.0 - DON 2 6 N° Code Name Type Loc. cl1 cl2 1 B35 Poyo acu. China 1 1.0 2 B22 Amer. acu. Cam. 1 0.5 } assign(DarF1,FN+'_darwin.var'); assign(DarF2,FN+'_darwin.don'); rewrite(DarF1); rewrite(DarF2); writeln(DarF1,'@DARwin 5.0 - ALLELIC -2'); writeln(DarF2,'@DARwin 5.0 - DON'); writeln(DarF1,' ',NT,TAB,NC*2); writeln(DarF2,' ',NT,TAB,1); write(DarF1,'TaxID'); for C:=1 to NC do write(DarF1,TAB,CName[C],TAB,CName[C]); writeln(DarF1); writeln(DarF2,'TaxID TaxName'); for T:=1 to NT do begin writeln(DarF2,T,TAB,TNameW[T]); write(DarF1,T); for C:=1 to NC do begin case Dat[T,C] of 1: write(DarF1,TAB,'1',TAB,'1'); 2: write(DarF1,TAB,'1',TAB,'3'); 3: write(DarF1,TAB,'3',TAB,'3'); else write(DarF1,TAB,'0',TAB,'0'); end end; writeln(DarF1); end; Close(DarF1); Close(DarF2); {.................................................................................} end; procedure ReadRedund(RFN:string); var RF:text; begin NRedund:=0; if FileExists(RFN) then begin assign(RF,RFN); reset(RF); repeat NRedund:=NRedund+1; readln(RF,Redund[NRedund]); until eof(RF); close(RF); end; end; Procedure ReadCustomMarkers; {one-off program to make a marker subset} var CMF:text; HS:string; begin assign(CMF,'S:\oat_seq\CORE\6Kinfinium\halpo\haplo_anchors_SNP_12D.txt'); reset(CMF); NHaploAnchors:=0; repeat Readln(CMF,HS); NHaploAnchors:=NHaploAnchors+1; HaploAnchors[NHaploAnchors]:=HS; until eof(CMF); close(CMF); end; var PF:text; S,TS,PFN,LocalPath:string; begin NHaploAnchors:=0; UseNames:=false; LNPrefix:='avgbs'; if ParamCount=0 then begin GetDir(0,LocalPath); if LocalPath='d:\data\delphi\exe' then begin {use local debug parameters} writeln('Hmmm... it looks like you are running me from a development environment'); writeln('OK, I will using try my hard-coded debugging defaults. This might get loud!'); RawFN:='D:\test\tt\hapmap\hapmap'; TaxFN:='D:\test\tt\tm.taxa.txt'; RedundFN:='null'; OutFN:='D:\test\tt\tm_out.txt'; FType:=2; {1=poland; 2=unueak; 4=generic} MapDat:=0; {1=mapping data - parents first } MaxHet:=8; MinMAF:=35; MinPresP:=95; UseNames:=true; LocNameFN:='D:\test\tt\Test_GBSLoci.txt'; ReadLNames(LocNameFN); ReadTaxa(TaxFN); ResetData; ReadDat(RawFN,FType); WriteDat(OutFN,MapDat); end else HelpMeMan; end else if ParamCount>8 then begin {read parameters from command line} RawFN:=ParamStr(1); TaxFN:=ParamStr(2); RedundFN:=ParamStr(3); OutFN:=ParamStr(4); FType:=StrToInt(ParamStr(5)); MapDat:=StrToInt(ParamStr(6)); MaxHet:=StrToInt(ParamStr(7)); MinMAF:=StrToInt(ParamStr(8)); MinPresP:=StrToInt(ParamStr(9)); if ParamCount>9 then begin UseNames:=true; LocNameFN:=ParamStr(10); ReadLNames(LocNameFN); end; if ParamCount>10 then LNPrefix:=ParamStr(11) else LNPrefix:='avgbs'; {use an alternate prefix for locus names} if MaxC>20000 then ReadRedund(RedundFN); {this alows me to debug with a small number of characters} ReadTaxa(TaxFN); ResetData; ReadDat(RawFN,FType); WriteDat(OutFN,MapDat); Write(' Done!'); end else HelpMeMan; Writeln(' ...... hit Enter to terminate program... '); readln; end.