############################################################### ############################################################### ############################################################### #!/usr/bin/perl use Math::Random; use strict; use warnings; ############################################################### # # # Coalescence simulations with /n/ loci # # # # Generates /n/ separate coalescent trees, throws mutations # # on these trees; disease status at leafs is calculated based # # on a multiplicative penetrance model # # # # GNU GPL copyleft, 2015 # # # # Andreas Wolf: Main routines/skeleton # # Sabine Siegert: Modifocations to fit n loci and # # Disease modelling # # # ############################################################### ### ### Timestamp ### my $pretime = time(); ### ### Arguments ### if (scalar(@ARGV) != 5) { die ("Note the arguments, see code for details:\n 1st: # trees\n 2nd: # iterations\n 3rd: # individuals\n 4th: Stop criterion per iteration: \n\t\tpopulation prevalence of affected individuals\n 5th: Risk factor\n"); } ### ### Parameters ### my $ntrees = shift(@ARGV); my $iterations = shift(@ARGV); my $no_of_individuals = shift(@ARGV); my $prop_of_affected = shift(@ARGV); my $gamma = shift(@ARGV); my $treeoutput = shift(@ARGV); my $filestring = ">t". $ntrees . "_n" . $iterations . "_N" . $no_of_individuals . "_p" . $prop_of_affected . "_gamma" . $gamma; ### ### Output files ### my $filename1 = $filestring . "_stats.csv"; open(STAT, $filename1); print STAT "iteration\tmutations\taffected\tunaffected\t"; print STAT "affected_freq\tmutation_nr\t"; print STAT "av_mutations_affected\tav_mutations_unaffected\t"; print STAT "aff_carrier\taff_nocarrier\tunaff_carrier\tunaff_nocarrier\t"; print STAT "freq_of_mut\tOR\n"; ### ### Number of affected individuals at leafs (expected) ### my $no_of_affected; my $product = $no_of_individuals * $prop_of_affected; if ($product == int $product) { $no_of_affected = $product; } else { $no_of_affected = int $product + 1; } ### ### Subroutine, "throwing" mutations recursively on an edge and its ### successors so that this mutation reaches a leaf where it will ### be checked to compute disease status ### sub throwmutations { my $mutation_no = shift; my $node = shift; my $mut_hash = shift; my $suc_hash = shift; $mut_hash->{$node}{count}++; $mut_hash->{$node}{which}{$mutation_no} = 1; if (exists $suc_hash->{$node}) { for (my $i = 0; $i < 2; $i++) { my $s = $suc_hash->{$node}[$i]; throwmutations($mutation_no,$s,$mut_hash,$suc_hash); } } } #################################### # MAIN LOOP, ITERATING AS REQUIRED # #################################### for (my $iter = 0; $iter < $iterations; $iter++) { ### ### n trees per iteration -- some variables needed ### my %Ntotaltime; my %Nsuccs; my %Ntime; my %nodes; ####################### # Loop over /n/ trees # ####################### for (my $treenum = 0; $treenum < $ntrees; $treenum++) { ### ### First generation ### my $gennum = 1; ### ### Individuals per generation ### my %ind_per_gen; ### ### Individuals are numbers ### for (my $i = 0; $i < $no_of_individuals; $i++) { $ind_per_gen{$gennum}[$i] = $i; } ### ### Coalescence events ### my %event; ### ### Reset all nodes in the tree ### %nodes = (); ### ### Nodes initially consisting of individuals (leafs of the tree) ### for (my $i = 0; $i < $no_of_individuals; $i++) { $nodes{$i} = $i; } my $noofnodes = $no_of_individuals; ### ### Successors of single nodes (Hash of arrays of length 2) ### my %succs; ####################### # Generating one tree # ####################### ### ### Until root is reached (generetion with one member): ### until (scalar(@{ $ind_per_gen{$gennum} }) == 1) { ### ### Pick two (different) individuals at random... ### my $l = scalar(@{ $ind_per_gen{$gennum} }); my $one = 1; my $two = $one; until ($one != $two) { $one = int(rand $l); $two = int(rand $l); } my $first = $ind_per_gen{$gennum}[$one]; my $second = $ind_per_gen{$gennum}[$two]; ### ### ...merge them to generate new node,... ### $noofnodes++; my $new = $noofnodes - 1; $nodes{$new} = $new; ### ### ...this is an event and successors are pushed on stack ### $event{$gennum} = $new; push @{ $succs{$new} }, $first; push @{ $succs{$new} }, $second; ### ### Next generation is the same, ... ### $gennum++; @{ $ind_per_gen{$gennum} } = @{ $ind_per_gen{$gennum - 1} }; my $diff = 0; if ($one < $two) { $diff = -1; } ### ### ...except that the two merged inidividuals are replaced by the newly generated one. ### splice(@{ $ind_per_gen{$gennum} }, $one, 1); splice(@{ $ind_per_gen{$gennum} }, $two + $diff, 1, $new); } ##################### # Tree is generated # ##################### ############################# # Time resp. branch lengths # ############################# ### ### The time of the coalescence events has to be re-ordered ### (the i of the t_i's count the number of existing nodes ### after coalescence event) ### my %eventtime; foreach my $gen (sort keys %event) { $eventtime{$event{$gen}} = $no_of_individuals - $gen; } ### ### For technical reasons the leafs have an event time as well ### for (my $i = 0; $i < $no_of_individuals; $i++) { $eventtime{$i} = $no_of_individuals; } ### ### Calculation of path lenghts (from node to node) ### measured in steps ### my %length; ### ### Each node (except leafs), X, has two successors, Y and Z. ### The lenght from X to Y (Z) is the difference of their ### event times (the time they occured for the first time) ### foreach my $key (keys %succs) { $length{$key}{$succs{$key}[0]} = $eventtime{$succs{$key}[0]} - $eventtime{$key}; $length{$key}{$succs{$key}[1]} = $eventtime{$succs{$key}[1]} - $eventtime{$key}; } ### ### Calculating the t_i's ### my %t; for (my $i = 2; $i <= $gennum; $i++) { ### ### Parameter of exponential distribution (i.e. mean) ### my $lambda = 2/($i*($i-1)); ### ### using Perl module, with mean lambda ### my $rand_exp_distr = Math::Random::random_exponential(1,$lambda); $t{$i} = $rand_exp_distr; } ### ### Exponentially distributed time from node to noded, and total time ### my %time; my $totaltime = 0; ### ### ...needed for each node which has a branch: ### foreach my $key (keys %succs) { ### ### ...its successors and length to each: ### my $s1 = $succs{$key}[0]; my $s2 = $succs{$key}[1]; my $l1 = $length{$key}{$s1}; my $l2 = $length{$key}{$s2}; ### ### ...starts with t_i where i is roots eventtime... ### my $sum = 0; ### ### ...and make steps, adding the according time... ### for (my $i = 1; $i <= $l1; $i++) { $sum = $sum + $t{$eventtime{$key} + $i}; } $time{$key}{$s1} = $sum; $totaltime = $totaltime + $sum; ### ### ...for both successors. ### $sum = 0; for (my $i = 1; $i <= $l2; $i++) { $sum = $sum + $t{$eventtime{$key} + $i}; } $time{$key}{$s2} = $sum; $totaltime = $totaltime + $sum; } #################################################### # Tree including branch lengths etc. finished here # #################################################### ### ### Assign this tree to hash of trees ### $Ntotaltime{$treenum} = $totaltime; %{$Nsuccs{$treenum}} = %succs; %{$Ntime{$treenum}} = %time; ###################### # And now mutations! # ###################### ### ### Mutations at each node. Mutation counter on inner nodes will ### be wrong but at leafs they sum, so OK for leafs where they ### are used to compute disease status! ! ### my %mutations_at_node; foreach my $node (keys %nodes) { $mutations_at_node{$node}{count} = 0; } ### ### Numbers, names and "locations" of mutations ### my $treename; my $mutation_no = 0; my %mutsonN; for (my $i = 0; $i < $ntrees; $i++) { $mutsonN{$i} = 0; } # my $mutsonA = 0; # my $mutsonB = 0; #################################################### # Loop until number of desired affected is reached # #################################################### ### ### Observed number of affected individuals at leafs (initially 0) ### my $obs_affected = 0; my %affection_status; until ($obs_affected >= $no_of_affected) { $mutation_no++; ### ### Pick one of n trees at random ### my $thistreenum = int(rand($ntrees)); ### ### Random point of time for that tree ### my $r = rand; my $rpot = $r * $Ntotaltime{$thistreenum}; ### ### Respective tree variables ### my %thissuccs = %{ $Nsuccs{$thistreenum} }; my %thistime = %{ $Ntime{$thistreenum} }; $mutsonN{$thistreenum}++; my $timesofar = 0; $treename = "T" . $thistreenum; ### ### Find corresponding branch on that tree ### my $thisone; my $ready = 0; foreach my $key (sort keys %thissuccs) { for (my $i = 0; $i < 2; $i++) { my $s = $thissuccs{$key}[$i]; my $t = $thistime{$key}{$s}; $timesofar = $timesofar + $t; if ($timesofar >= $rpot and !$ready) { $ready = 1; $thisone = $s; } } } ### ### Throw current mutation on found node of respective tree, and recursively on ### its successors until leaf is reached ### ### Different mutation name, i.e. number and tree my $thismutation = $mutation_no . "x" . $treename; throwmutations($thismutation, $thisone, \%mutations_at_node, \%thissuccs); ### ### Now check for affection status at leafs, depending on multiplicative penetrance model ### foreach my $node (keys %nodes) { if (!exists $thissuccs{$node}) { my $noofmuts = $mutations_at_node{$node}{count}; ### HERE IS THE MODEL! my $affprob = 1 - ( (1-$gamma) ** $noofmuts ); my $randnr = rand; if ($randnr < $affprob) { $affection_status{$node} = 1; } else { $affection_status{$node} = 0; } } } ### ### Count affected individuals at leafs (calculate stop criterion) ### my $aff_sum = 0; foreach my $node (keys %affection_status) { $aff_sum = $aff_sum + $affection_status{$node}; } $obs_affected = $aff_sum; my $pop_freq = $obs_affected / $no_of_individuals; ### ### Statistics / main output ### for (my $mut = 1; $mut <= $mutation_no; $mut++) { my $carrier = 0; my $affected = 0; my $unaffected = 0; my $affected_mutations = 0; my $unaffected_mutations = 0; my $aff_car = 0; my $noaff_car = 0; foreach my $node (keys %affection_status) { my $affstat = $affection_status{$node}; if ($affstat == 1) { $affected++; $affected_mutations = $affected_mutations + $mutations_at_node{$node}{count}; } else { $unaffected++; $unaffected_mutations = $unaffected_mutations + $mutations_at_node{$node}{count}; } foreach my $obsmut (keys %{ $mutations_at_node{$node}{which} }) { my @mutsplit = split /x/, $obsmut; my $mutnr = $mutsplit[0]; if ($mutnr == $mut) { $carrier++; if ($affstat == 1) { $aff_car++; } else { $noaff_car++; } } } } my $aff_nocar = $affected - $aff_car; my $noaff_nocar = $unaffected - $noaff_car; my $freq_of_mut = $carrier / $no_of_individuals; my $OR = "NA"; if ($aff_nocar != 0 and $noaff_car != 0) { $OR = ($aff_car * $noaff_nocar) / ($aff_nocar * $noaff_car); } my $av_mut_affected = "NA"; if ($affected != 0) { $av_mut_affected = $affected_mutations / $affected; } my $av_mut_unaffected = "NA"; if ($unaffected != 0) { $av_mut_unaffected = $unaffected_mutations / $unaffected; } print STAT "$iter\t$mutation_no\t$affected\t$unaffected\t"; print STAT "$pop_freq\t$mut\t"; print STAT "$av_mut_affected\t$av_mut_unaffected\t"; print STAT "$aff_car\t$aff_nocar\t$noaff_car\t$noaff_nocar\t"; print STAT "$freq_of_mut\t$OR\n"; } } ### ### Close main loop ### } my $posttime = time(); my $timeinsec = $posttime - $pretime; my $timeinmin = $timeinsec / 60; print "Time: $timeinmin minutes ($timeinsec seconds).\n"; ############################################################### ############################################################### ############################################################### #!/usr/bin/perl use Math::Random; use strict; use warnings; ############################################################### # # # Coalescence simulations with /n/ loci # # # # Generates /n/ separate coalescent trees, throws mutations # # on these trees; disease status at leafs is calculated based # # on a logistic penetrance model # # # # GNU GPL copyleft, 2015 # # # # Andreas Wolf: Main routines/skeleton # # Sabine Siegert: Modifocations to fit n loci and # # Disease modelling # # # ############################################################### ### ### Timestamp ### my $pretime = time(); ### ### Arguments ### if (scalar(@ARGV) != 6) { die ("Note the arguments, see code for details:\n 1st: # trees\n 2nd: # iterations\n 3rd: # individuals\n 4th: Stop criterion per iteration: \n\t\tpopulation prevalence of affected individuals\n 5th: Risk factor\n"); } ### ### Parameters ### my $ntrees = shift(@ARGV); my $iterations = shift(@ARGV); my $no_of_individuals = shift(@ARGV); my $prop_of_affected = shift(@ARGV); my $alpha = shift(@ARGV); my $beta = shift(@ARGV); my $treeoutput = shift(@ARGV); my $filestring = ">t". $ntrees . "_n" . $iterations . "_N" . $no_of_individuals . "_p" . $prop_of_affected . "_a" . $alpha . "_b" . $beta; ### ### Output files ### my $filename1 = $filestring . "_stats.csv"; open(STAT, $filename1); print STAT "iteration\tmutations\taffected\tunaffected\t"; print STAT "affected_freq\tmutation_nr\t"; print STAT "av_mutations_affected\tav_mutations_unaffected\t"; print STAT "aff_carrier\taff_nocarrier\tunaff_carrier\tunaff_nocarrier\t"; print STAT "freq_of_mut\tOR\n"; ### ### Number of affected individuals at leafs (expected) ### my $no_of_affected; my $product = $no_of_individuals * $prop_of_affected; if ($product == int $product) { $no_of_affected = $product; } else { $no_of_affected = int $product + 1; } ### ### Subroutine, "throwing" mutations recursively on an edge, from ### there to its successors so that this mutation reaches a leaf ### where it will be used to compute disease status ### sub throwmutations { my $mutation_no = shift; my $node = shift; my $mut_hash = shift; my $suc_hash = shift; $mut_hash->{$node}{count}++; $mut_hash->{$node}{which}{$mutation_no} = 1; if (exists $suc_hash->{$node}) { for (my $i = 0; $i < 2; $i++) { my $s = $suc_hash->{$node}[$i]; throwmutations($mutation_no,$s,$mut_hash,$suc_hash); } } } #################################### # MAIN LOOP, ITERATING AS REQUIRED # #################################### for (my $iter = 0; $iter < $iterations; $iter++) { ### ### n trees per iteration -- some hashes needed for ### time per tree, total time, successor representations,... ### my %Ntotaltime; my %Nsuccs; my %Ntime; my %nodes; ####################### # Loop over /n/ trees # ####################### for (my $treenum = 0; $treenum < $ntrees; $treenum++) { ### ### First generation ### my $gennum = 1; ### ### Individuals per generation ### my %ind_per_gen; ### ### Individuals are numbers ### for (my $i = 0; $i < $no_of_individuals; $i++) { $ind_per_gen{$gennum}[$i] = $i; } ### ### Coalescence events ### my %event; ### ### Reset all nodes in the tree ### %nodes = (); ### ### Nodes initially consisting of individuals (leafs of the tree) ### for (my $i = 0; $i < $no_of_individuals; $i++) { $nodes{$i} = $i; } my $noofnodes = $no_of_individuals; ### ### Successors of single nodes (Hash of arrays of length 2) ### my %succs; ####################### # Generating one tree # ####################### ### ### Until root is reached (generetion with one member): ### until (scalar(@{ $ind_per_gen{$gennum} }) == 1) { ### ### Pick two (different) individuals at random... ### my $l = scalar(@{ $ind_per_gen{$gennum} }); my $one = 1; my $two = $one; until ($one != $two) { $one = int(rand $l); $two = int(rand $l); } my $first = $ind_per_gen{$gennum}[$one]; my $second = $ind_per_gen{$gennum}[$two]; ### ### ...merge them to generate new node,... ### $noofnodes++; my $new = $noofnodes - 1; $nodes{$new} = $new; ### ### ...this is an event and successors are pushed on stack ### $event{$gennum} = $new; push @{ $succs{$new} }, $first; push @{ $succs{$new} }, $second; ### ### Next generation is the same, ... ### $gennum++; @{ $ind_per_gen{$gennum} } = @{ $ind_per_gen{$gennum - 1} }; my $diff = 0; if ($one < $two) { $diff = -1; } ### ### ...except that the two merged inidividuals are replaced by the newly generated one. ### splice(@{ $ind_per_gen{$gennum} }, $one, 1); splice(@{ $ind_per_gen{$gennum} }, $two + $diff, 1, $new); } ##################### # Tree is generated # ##################### ############################# # Time resp. branch lengths # ############################# ### ### The time of the coalescence events has to be re-ordered ### (the i of the t_i's count the number of existing nodes ### after coalescence event) ### my %eventtime; foreach my $gen (sort keys %event) { $eventtime{$event{$gen}} = $no_of_individuals - $gen; } ### ### For technical reasons the leafs have an event time as well ### for (my $i = 0; $i < $no_of_individuals; $i++) { $eventtime{$i} = $no_of_individuals; } ### ### Calculation of path lenghts (from node to node) ### measured in steps ### my %length; ### ### Each node (except leafs), X, has two successors, Y and Z. ### The lenght from X to Y (Z) is the difference of their ### event times (the time they occured for the first time) ### foreach my $key (keys %succs) { $length{$key}{$succs{$key}[0]} = $eventtime{$succs{$key}[0]} - $eventtime{$key}; $length{$key}{$succs{$key}[1]} = $eventtime{$succs{$key}[1]} - $eventtime{$key}; } ### ### Calculating the t_i's ### my %t; for (my $i = 2; $i <= $gennum; $i++) { ### ### Parameter of exponential distribution (i.e. mean) ### my $lambda = 2/($i*($i-1)); ### ### using Perl module, with mean lambda ### my $rand_exp_distr = Math::Random::random_exponential(1,$lambda); $t{$i} = $rand_exp_distr; } ### ### Exponentially distributed time from node to node, and total time ### my %time; my $totaltime = 0; ### ### ...needed for each node which has a branch: ### foreach my $key (keys %succs) { ### ### ...its successors and length to each: ### my $s1 = $succs{$key}[0]; my $s2 = $succs{$key}[1]; my $l1 = $length{$key}{$s1}; my $l2 = $length{$key}{$s2}; ### ### ...starts with t_i where i is roots eventtime... ### my $sum = 0; ### ### ...and make steps, adding the according time... ### for (my $i = 1; $i <= $l1; $i++) { $sum = $sum + $t{$eventtime{$key} + $i}; } $time{$key}{$s1} = $sum; $totaltime = $totaltime + $sum; ### ### ...for both successors. ### $sum = 0; for (my $i = 1; $i <= $l2; $i++) { $sum = $sum + $t{$eventtime{$key} + $i}; } $time{$key}{$s2} = $sum; $totaltime = $totaltime + $sum; } #################################################### # Tree including branch lengths etc. finished here # #################################################### ### ### Assign this tree to hash of trees ### $Ntotaltime{$treenum} = $totaltime; %{$Nsuccs{$treenum}} = %succs; %{$Ntime{$treenum}} = %time; ###################### # And now mutations! # ###################### ### ### Mutations at each node. Mutation counter on inner nodes will ### be wrong but at leafs they sum, so OK for leafs where they ### are used to compute disease status! ### my %mutations_at_node; foreach my $node (keys %nodes) { $mutations_at_node{$node}{count} = 0; } ### ### Numbers, names and "locations" of mutations ### my $treename; my $mutation_no = 0; my %mutsonN; for (my $i = 0; $i < $ntrees; $i++) { $mutsonN{$i} = 0; } #################################################### # Loop until number of desired affected is reached # #################################################### ### ### Observed number of affected individuals at leafs (initially 0) ### my $obs_affected = 0; my %affection_status; until ($obs_affected >= $no_of_affected) { $mutation_no++; ### ### Pick one of n trees at random ### my $thistreenum = int(rand($ntrees)); ### ### Random point of time for that tree ### my $r = rand; my $rpot = $r * $Ntotaltime{$thistreenum}; ### ### Respective tree variables ### my %thissuccs = %{ $Nsuccs{$thistreenum} }; my %thistime = %{ $Ntime{$thistreenum} }; $mutsonN{$thistreenum}++; my $timesofar = 0; $treename = "T" . $thistreenum; ### ### Find corresponding branch on that tree ### my $thisone; my $ready = 0; foreach my $key (sort keys %thissuccs) { for (my $i = 0; $i < 2; $i++) { my $s = $thissuccs{$key}[$i]; my $t = $thistime{$key}{$s}; $timesofar = $timesofar + $t; if ($timesofar >= $rpot and !$ready) { $ready = 1; $thisone = $s; } } } ### ### Throw current mutation on found node of respective tree, and recursively on ### its successors until leaf is reached ### ### Different mutation name, i.e. number and tree my $thismutation = $mutation_no . "x" . $treename; throwmutations($thismutation, $thisone, \%mutations_at_node, \%thissuccs); ### ### Now check for affection status at leafs, depending on logistic penetrance model ### foreach my $node (keys %nodes) { if (!exists $thissuccs{$node}) { my $noofmuts = $mutations_at_node{$node}{count}; ### HERE IS THE MODEL! my $term = $alpha + ($beta * $noofmuts); my $affprob = exp($term) / (1 + exp($term)); my $randnr = rand; if ($randnr < $affprob) { $affection_status{$node} = 1; } else { $affection_status{$node} = 0; } } } ### ### Count affected individuals at leafs (calculate stop criterion) ### my $aff_sum = 0; foreach my $node (keys %affection_status) { $aff_sum = $aff_sum + $affection_status{$node}; } $obs_affected = $aff_sum; my $pop_freq = $obs_affected / $no_of_individuals; ### ### Statistics / main output ### for (my $mut = 1; $mut <= $mutation_no; $mut++) { my $carrier = 0; my $affected = 0; my $unaffected = 0; my $affected_mutations = 0; my $unaffected_mutations = 0; my $aff_car = 0; my $noaff_car = 0; foreach my $node (keys %affection_status) { my $affstat = $affection_status{$node}; if ($affstat == 1) { $affected++; $affected_mutations = $affected_mutations + $mutations_at_node{$node}{count}; } else { $unaffected++; $unaffected_mutations = $unaffected_mutations + $mutations_at_node{$node}{count}; } foreach my $obsmut (keys %{ $mutations_at_node{$node}{which} }) { my @mutsplit = split /x/, $obsmut; my $mutnr = $mutsplit[0]; if ($mutnr == $mut) { $carrier++; if ($affstat == 1) { $aff_car++; } else { $noaff_car++; } } } } my $aff_nocar = $affected - $aff_car; my $noaff_nocar = $unaffected - $noaff_car; my $freq_of_mut = $carrier / $no_of_individuals; my $OR = "NA"; if ($aff_nocar != 0 and $noaff_car != 0) { $OR = ($aff_car * $noaff_nocar) / ($aff_nocar * $noaff_car); } my $av_mut_affected = "NA"; if ($affected != 0) { $av_mut_affected = $affected_mutations / $affected; } my $av_mut_unaffected = "NA"; if ($unaffected != 0) { $av_mut_unaffected = $unaffected_mutations / $unaffected; } print STAT "$iter\t$mutation_no\t$affected\t$unaffected\t"; print STAT "$pop_freq\t$mut\t"; print STAT "$av_mut_affected\t$av_mut_unaffected\t"; print STAT "$aff_car\t$aff_nocar\t$noaff_car\t$noaff_nocar\t"; print STAT "$freq_of_mut\t$OR\n"; } } ### ### Close main loop ### } my $posttime = time(); my $timeinsec = $posttime - $pretime; my $timeinmin = $timeinsec / 60; print "Time: $timeinmin minutes ($timeinsec seconds).\n";