File Coverage

Bio/PopGen/Simulation/Coalescent.pm
Criterion Covered Total %
statement 119 130 91.5
branch 20 34 58.8
condition 3 9 33.3
subroutine 13 13 100.0
pod 7 7 100.0
total 162 193 83.9


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::PopGen::Simulation::Coalescent
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::PopGen::Simulation::Coalescent - A Coalescent simulation factory
17              
18             =head1 SYNOPSIS
19              
20             use Bio::PopGen::Simulation::Coalescent;
21             my @taxonnames = qw(SpeciesA SpeciesB SpeciesC SpeciesD);
22             my $sim1 = Bio::PopGen::Simulation::Coalescent->new(-samples => \@taxonnames);
23              
24             my $tree = $sim1->next_tree;
25              
26             # add 20 mutations randomly to the tree
27             $sim1->add_Mutations($tree,20);
28              
29             # or for anonymous samples
30              
31             my $sim2 = Bio::PopGen::Simulation::Coalescent->new( -sample_size => 6,
32             -maxcount => 50);
33             my $tree2 = $sim2->next_tree;
34             # add 20 mutations randomly to the tree
35             $sim2->add_Mutations($tree2,20);
36              
37             =head1 DESCRIPTION
38              
39             Builds a random tree every time next_tree is called or up to -maxcount
40             times with branch lengths and provides the ability to randomly add
41             mutations onto the tree with a probabilty proportional to the branch
42             lengths.
43              
44             This algorithm is based on the make_tree algorithm from Richard Hudson 1990.
45              
46             Hudson, R. R. 1990. Gene genealogies and the coalescent
47             process. Pp. 1-44 in D. Futuyma and J. Antonovics, eds. Oxford
48             surveys in evolutionary biology. Vol. 7. Oxford University
49             Press, New York.
50              
51             This module was previously named Bio::Tree::RandomTree
52              
53             =head1 FEEDBACK
54              
55             =head2 Mailing Lists
56              
57             User feedback is an integral part of the evolution of this and other
58             Bioperl modules. Send your comments and suggestions preferably to
59             the Bioperl mailing list. Your participation is much appreciated.
60              
61             bioperl-l@bioperl.org - General discussion
62             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63              
64             =head2 Support
65              
66             Please direct usage questions or support issues to the mailing list:
67              
68             I
69              
70             rather than to the module maintainer directly. Many experienced and
71             reponsive experts will be able look at the problem and quickly
72             address it. Please include a thorough description of the problem
73             with code and data examples if at all possible.
74              
75             =head2 Reporting Bugs
76              
77             Report bugs to the Bioperl bug tracking system to help us keep track
78             of the bugs and their resolution. Bug reports can be submitted via
79             the web:
80              
81             https://github.com/bioperl/bioperl-live/issues
82              
83             =head1 AUTHOR - Jason Stajich, Matthew Hahn
84              
85             Email jason-at-bioperl-dot-org
86             Email matthew-dot-hahn-at-duke-dot-edu
87              
88             =head1 APPENDIX
89              
90             The rest of the documentation details each of the object methods.
91             Internal methods are usually preceded with a _
92              
93             =cut
94              
95              
96             # Let the code begin...
97              
98              
99             package Bio::PopGen::Simulation::Coalescent;
100 1     1   390 use vars qw($PRECISION_DIGITS);
  1         1  
  1         33  
101 1     1   3 use strict;
  1         1  
  1         19  
102              
103             $PRECISION_DIGITS = 3; # Precision for the branchlength
104              
105 1     1   281 use Bio::Tree::AlleleNode;
  1         1  
  1         20  
106 1     1   4 use Bio::PopGen::Genotype;
  1         1  
  1         14  
107 1     1   259 use Bio::Tree::Tree;
  1         2  
  1         23  
108              
109 1     1   5 use base qw(Bio::Root::Root Bio::Factory::TreeFactoryI);
  1         1  
  1         278  
110              
111              
112             =head2 new
113              
114             Title : new
115             Usage : my $obj = Bio::PopGen::Simulation::Coalescent->new();
116             Function: Builds a new Bio::PopGen::Simulation::Coalescent object
117             Returns : an instance of Bio::PopGen::Simulation::Coalescent
118             Args : -samples => arrayref of sample names
119             OR
120             -sample_size=> number of samples (samps will get a systematic name)
121             -maxcount => [optional] maximum number of trees to provide
122              
123             =cut
124              
125             sub new{
126 1     1 1 113 my ($class,@args) = @_;
127 1         9 my $self = $class->SUPER::new(@args);
128            
129 1         1 $self->{'_treecounter'} = 0;
130 1         2 $self->{'_maxcount'} = 0;
131 1         6 my ($maxcount, $samps,$samplesize ) = $self->_rearrange([qw(MAXCOUNT
132             SAMPLES
133             SAMPLE_SIZE)],
134             @args);
135 1         2 my @samples;
136            
137 1 50       3 if( ! defined $samps ) {
138 1 50 33     5 if( ! defined $samplesize || $samplesize <= 0 ) {
139 0         0 $self->throw("Must specify a valid samplesize if parameter -SAMPLE is not specified (sampsize is $samplesize)");
140             }
141 1         3 foreach ( 1..$samplesize ) { push @samples, "Samp$_"; }
  5         8  
142             } else {
143 0 0       0 if( ref($samps) !~ /ARRAY/i ) {
144 0         0 $self->throw("Must specify a valid ARRAY reference to the parameter -SAMPLES, did you forget a leading '\\'?");
145             }
146 0         0 @samples = @$samps;
147             }
148            
149 1         3 $self->samples(\@samples);
150 1         3 $self->sample_size(scalar @samples);
151 1 50       2 defined $maxcount && $self->maxcount($maxcount);
152 1         2 return $self;
153             }
154              
155             =head2 next_tree
156              
157             Title : next_tree
158             Usage : my $tree = $factory->next_tree
159             Function: Returns a random tree based on the initialized number of nodes
160             NOTE: if maxcount is not specified on initialization or
161             set to a valid integer, subsequent calls to next_tree will
162             continue to return random trees and never return undef
163             Returns : Bio::Tree::TreeI object
164             Args : none
165              
166             =cut
167              
168             sub next_tree{
169 1     1 1 3 my ($self) = @_;
170             # If maxcount is set to something non-zero then next tree will
171             # continue to return valid trees until maxcount is reached
172             # otherwise will always return trees
173             return if( $self->maxcount &&
174 1 50 33     3 $self->{'_treecounter'}++ >= $self->maxcount );
175 1         3 my $size = $self->sample_size;
176            
177 1         1 my $in;
178 1         1 my @tree = ();
179 1         2 my @list = ();
180            
181 1         6 for($in=0;$in < 2*$size -1; $in++ ) {
182 9         20 push @tree, { 'nodenum' => "Node$in" };
183             }
184             # in C we would have 2 arrays
185             # an array of nodes (tree)
186             # and array of pointers to these nodes (list)
187             # and we just shuffle the list items to do the
188             # tree topology generation
189             # instead in perl, we will have a list of hashes (nodes) called @tree
190             # and a list of integers representing the indexes in tree called @list
191              
192 1         3 for($in=0;$in < $size;$in++) {
193 5         4 $tree[$in]->{'time'} = 0;
194 5         3 $tree[$in]->{'desc1'} = undef;
195 5         5 $tree[$in]->{'desc2'} = undef;
196 5         6 push @list, $in;
197             }
198              
199 1         2 my $t=0;
200             # generate times for the nodes
201 1         2 for($in = $size; $in > 1; $in-- ) {
202 4         5 $t+= -2.0 * log(1 - $self->random(1)) / ( $in * ($in-1) );
203 4         9 $tree[2 * $size - $in]->{'time'} =$t;
204             }
205             # topology generation
206 1         2 for ($in = $size; $in > 1; $in-- ) {
207 4         5 my $pick = int $self->random($in);
208 4         2 my $nodeindex = $list[$pick];
209 4         4 my $swap = 2 * $size - $in;
210 4         4 $tree[$swap]->{'desc1'} = $nodeindex;
211 4         3 $list[$pick] = $list[$in-1];
212 4         4 $pick = int rand($in - 1);
213 4         3 $nodeindex = $list[$pick];
214 4         2 $tree[$swap]->{'desc2'} = $nodeindex;
215 4         6 $list[$pick] = $swap;
216             }
217             # Let's convert the hashes into nodes
218              
219 1         1 my @nodes = ();
220 1         2 foreach my $n ( @tree ) {
221             push @nodes,
222             Bio::Tree::AlleleNode->new(-id => $n->{'nodenum'},
223 9         27 -branch_length => $n->{'time'});
224             }
225 1         1 my $ct = 0;
226 1         2 foreach my $node ( @nodes ) {
227 9         7 my $n = $tree[$ct++];
228 9 100       13 if( defined $n->{'desc1'} ) {
229 4         9 $node->add_Descendent($nodes[$n->{'desc1'}]);
230             }
231 9 100       11 if( defined $n->{'desc2'} ) {
232 4         6 $node->add_Descendent($nodes[$n->{'desc2'}]);
233             }
234             }
235 1         7 my $T = Bio::Tree::Tree->new(-root => pop @nodes );
236 1         7 return $T;
237             }
238              
239             =head2 add_Mutations
240              
241             Title : add_Mutations
242             Usage : $factory->add_Mutations($tree, $mutcount);
243             Function: Adds mutations to a tree via a random process weighted by
244             branch length (it is a poisson distribution
245             as part of a coalescent process)
246             Returns : none
247             Args : $tree - Bio::Tree::TreeI
248             $nummut - number of mutations
249             $precision - optional # of digits for precision
250              
251              
252             =cut
253              
254             sub add_Mutations{
255 1     1 1 322 my ($self,$tree, $nummut,$precision) = @_;
256 1   33     5 $precision ||= $PRECISION_DIGITS;
257 1         2 $precision = 10**$precision;
258              
259 1         1 my @branches;
260             my @lens;
261 1         1 my $branchlen = 0;
262 1         1 my $last = 0;
263 1         2 my @nodes = $tree->get_nodes();
264 1         1 my $i = 0;
265              
266             # Jason's somewhat simplistics way of doing a poission
267             # distribution for a fixed number of mutations
268             # build an array and put the node number in a slot
269             # representing the branch to put a mutation on
270             # but weight the number of slots per branch by the
271             # length of the branch ( ancestor's time - node time)
272            
273 1         2 foreach my $node ( @nodes ) {
274 9 100       13 if( $node->ancestor ) {
275 8         11 my $len = int ( ($node->ancestor->branch_length -
276             $node->branch_length) * $precision);
277 8 50       13 if ( $len > 0 ) {
278 8         12 for( my $j =0;$j < $len;$j++) {
279 8817         9544 push @branches, $i;
280             }
281 8         6 $last += $len;
282             }
283 8         8 $branchlen += $len;
284             }
285 9 50       26 if( ! $node->isa('Bio::Tree::AlleleNode') ) {
286 0         0 bless $node, 'Bio::Tree::AlleleNode'; # rebless it to the right node
287             }
288             # This let's us reset the stored genotypes so we can keep reusing the
289             # same tree topology, but throw down mutations multiple times
290 9         13 $node->reset_Genotypes;
291 9         11 $i++;
292             }
293             # sanity check
294 1 50       4 $self->throw("branch len is $branchlen arraylen is $last")
295             unless ( $branchlen == $last );
296 1         2 my @mutations;
297 1         3 for( my $j = 0; $j < $nummut; $j++) {
298 100         172 my $index = int(rand($branchlen));
299 100         84 my $branch = $branches[$index];
300              
301             # We're using an infinite sites model so every new
302             # mutation is a new site
303 100         335 my $g = Bio::PopGen::Genotype->new(-marker_name => "Mutation$j",
304             -alleles => [1]);
305 100         168 $nodes[$branch]->add_Genotype($g);
306 100         149 push @mutations, "Mutation$j";
307             # Let's add this mutation to all the children (push it down
308             # the branches to the tips)
309 100         157 foreach my $child ( $nodes[$branch]->get_all_Descendents ) {
310 262         323 $child->add_Genotype($g);
311             }
312             }
313             # Insure that everyone who doesn't have the mutation
314             # has the ancestral state, which is '0'
315 1         2 foreach my $node ( @nodes ) {
316 9         8 foreach my $m ( @mutations ) {
317 900 100       1049 if( ! $node->has_Marker($m) ) {
318 538         1359 my $emptyg = Bio::PopGen::Genotype->new(-marker_name => $m,
319             -alleles => [0]);
320 538         851 $node->add_Genotype($emptyg);
321             }
322             }
323             }
324             }
325              
326             =head2 maxcount
327              
328             Title : maxcount
329             Usage : $obj->maxcount($newval)
330             Function:
331             Returns : Maxcount value
332             Args : newvalue (optional)
333              
334              
335             =cut
336              
337             sub maxcount{
338 1     1 1 1 my ($self,$value) = @_;
339 1 50       2 if( defined $value) {
340 0 0       0 if( $value =~ /^(\d+)/ ) {
341 0         0 $self->{'maxcount'} = $1;
342             } else {
343 0         0 $self->warn("Must specify a valid Positive integer to maxcount");
344 0         0 $self->{'maxcount'} = 0;
345             }
346             }
347 1         3 return $self->{'_maxcount'};
348             }
349              
350             =head2 samples
351              
352             Title : samples
353             Usage : $obj->samples($newval)
354             Function:
355             Example :
356             Returns : value of samples
357             Args : newvalue (optional)
358              
359              
360             =cut
361              
362             sub samples{
363 1     1 1 1 my ($self,$value) = @_;
364 1 50       3 if( defined $value) {
365 1 50       38 if( ref($value) !~ /ARRAY/i ) {
366 0         0 $self->warn("Must specify a valid array ref to the method 'samples'");
367 0         0 $value = [];
368             }
369 1         3 $self->{'samples'} = $value;
370             }
371 1         1 return $self->{'samples'};
372              
373             }
374              
375             =head2 sample_size
376              
377             Title : sample_size
378             Usage : $obj->sample_size($newval)
379             Function:
380             Example :
381             Returns : value of sample_size
382             Args : newvalue (optional)
383              
384              
385             =cut
386              
387             sub sample_size{
388 2     2 1 1 my ($self,$value) = @_;
389 2 100       5 if( defined $value) {
390 1         1 $self->{'sample_size'} = $value;
391             }
392 2         2 return $self->{'sample_size'};
393              
394             }
395              
396             =head2 random
397              
398             Title : random
399             Usage : my $rfloat = $node->random($size)
400             Function: Generates a random number between 0 and $size
401             This is abstracted so that someone can override and provide their
402             own special RNG. This is expected to be a uniform RNG.
403             Returns : Floating point random
404             Args : $maximum size for random number (defaults to 1)
405              
406              
407             =cut
408              
409             sub random{
410 8     8 1 8 my ($self,$max) = @_;
411 8         40 return rand($max);
412             }
413              
414             1;