File Coverage

Bio/Tools/Phylo/PAML.pm
Criterion Covered Total %
statement 732 841 87.0
branch 360 456 78.9
condition 109 161 67.7
subroutine 32 34 94.1
pod 2 2 100.0
total 1235 1494 82.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Phylo::PAML
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich, Aaron J Mackey
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Phylo::PAML - Parses output from the PAML programs codeml,
17             baseml, basemlg, codemlsites and yn00
18              
19             =head1 SYNOPSIS
20              
21             #!/usr/bin/perl -Tw
22             use strict;
23              
24             use Bio::Tools::Phylo::PAML;
25              
26             # need to specify the output file name (or a fh) (defaults to
27             # -file => "codeml.mlc"); also, optionally, the directory in which
28             # the other result files (rst, 2ML.dS, etc) may be found (defaults
29             # to "./")
30             my $parser = Bio::Tools::Phylo::PAML->new
31             (-file => "./results/mlc", -dir => "./results/");
32              
33             # get the first/next result; a Bio::Tools::Phylo::PAML::Result object,
34             # which isa Bio::SeqAnalysisResultI object.
35             my $result = $parser->next_result();
36              
37             # get the sequences used in the analysis; returns Bio::PrimarySeq
38             # objects (OTU = Operational Taxonomic Unit).
39             my @otus = $result->get_seqs();
40              
41             # codon summary: codon usage of each sequence [ arrayref of {
42             # hashref of counts for each codon } for each sequence and the
43             # overall sum ], and positional nucleotide distribution [ arrayref
44             # of { hashref of frequencies for each nucleotide } for each
45             # sequence and overall frequencies ]:
46             my ($codonusage, $ntdist) = $result->get_codon_summary();
47              
48             # example manipulations of $codonusage and $ntdist:
49             printf "There were %d %s codons in the first seq (%s)\n",
50             $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id();
51             printf "There were %d %s codons used in all the sequences\n",
52             $codonusage->[$#{$codonusage}]->{AAA}, 'AAA';
53             printf "Nucleotide %c was present %g of the time in seq %s\n",
54             'A', $ntdist->[1]->{A}, $otus[1]->id();
55              
56             # get Nei & Gojobori dN/dS matrix:
57             my $NGmatrix = $result->get_NGmatrix();
58              
59             # get ML-estimated dN/dS matrix, if calculated; this corresponds to
60             # the runmode = -2, pairwise comparison usage of codeml
61             my $MLmatrix = $result->get_MLmatrix();
62              
63             # These matrices are length(@otu) x length(@otu) "strict lower
64             # triangle" 2D-matrices, which means that the diagonal and
65             # everything above it is undefined. Each of the defined cells is a
66             # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t",
67             # "S" and "N". If a ML matrix, "lnL" and "kappa" will also be defined.
68             printf "The omega ratio for sequences %s vs %s was: %g\n",
69             $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega};
70              
71             # with a little work, these matrices could also be passed to
72             # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building
73             # method that accepts a matrix of "distances" (using the LOWTRI
74             # option):
75             my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ];
76              
77             # for runmode's other than -2, get tree topology with estimated
78             # branch lengths; returns a Bio::Tree::TreeI-based tree object with
79             # added PAML parameters at each node
80             my ($tree) = $result->get_trees();
81             for my $node ($tree->get_nodes()) {
82             # inspect the tree: the "t" (time) parameter is available via
83             # $node->branch_length(); all other branch-specific parameters
84             # ("omega", "dN", etc.) are available via
85             # ($omega) = $node->get_tag_values('omega');
86             }
87              
88             # if you are using model based Codeml then trees are stored in each
89             # modelresult object
90             for my $modelresult ( $result->get_NSSite_results ) {
91             # model M0, M1, etc
92             print "model is ", $modelresult->model_num, "\n";
93             my ($tree) = $modelresult->get_trees();
94             for my $node ($tree->get_nodes()) {
95             # inspect the tree: the "t" (time) parameter is available via
96             # $node->branch_length(); all other branch-specific parameters
97             # ("omega", "dN", etc.) are available via
98             # ($omega) = $node->get_tag_values('omega');
99             }
100             }
101              
102             # get any general model parameters: kappa (the
103             # transition/transversion ratio), NSsites model parameters ("p0",
104             # "p1", "w0", "w1", etc.), etc.
105             my $params = $result->get_model_params();
106             printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1};
107              
108             # parse AAML result files
109             my $aamat = $result->get_AADistMatrix();
110             my $aaMLmat = $result->get_AAMLDistMatrix();
111              
112             =head1 DESCRIPTION
113              
114             This module is used to parse the output from the PAML programs codeml,
115             baseml, basemlg, codemlsites and yn00. You can use the
116             Bio::Tools::Run::Phylo::PAML::* modules to actually run some of the
117             PAML programs, but this module is only useful to parse the output.
118              
119             This module has fledgling support for PAML version 4.3a (October 2009).
120             Please report any problems to the mailing list (see FEEDBACK below).
121              
122             =head1 TO DO
123              
124             Implement get_posteriors(). For NSsites models, obtain arrayrefs of
125             posterior probabilities for membership in each class for every
126             position; probabilities correspond to classes w0, w1, ... etc.
127              
128             my @probs = $result->get_posteriors();
129              
130             # find, say, positively selected sites!
131             if ($params->{w2} > 1) {
132             for (my $i = 0; $i < @probs ; $i++) {
133             if ($probs[$i]->[2] > 0.5) {
134             # assumes model M1: three w's, w0, w1 and w2 (positive selection)
135             printf "position %d: (%g prob, %g omega, %g mean w)\n",
136             $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3];
137             }
138             }
139             } else { print "No positive selection found!\n"; }
140              
141             =head1 FEEDBACK
142              
143             =head2 Mailing Lists
144              
145             User feedback is an integral part of the evolution of this and other
146             Bioperl modules. Send your comments and suggestions preferably to
147             the Bioperl mailing list. Your participation is much appreciated.
148              
149             bioperl-l@bioperl.org - General discussion
150             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
151              
152             =head2 Support
153              
154             Please direct usage questions or support issues to the mailing list:
155              
156             I
157              
158             rather than to the module maintainer directly. Many experienced and
159             reponsive experts will be able look at the problem and quickly
160             address it. Please include a thorough description of the problem
161             with code and data examples if at all possible.
162              
163             =head2 Reporting Bugs
164              
165             Report bugs to the Bioperl bug tracking system to help us keep track
166             of the bugs and their resolution. Bug reports can be submitted via the
167             web:
168              
169             https://github.com/bioperl/bioperl-live/issues
170              
171             =head1 AUTHOR - Jason Stajich, Aaron Mackey
172              
173             Email jason-at-bioperl.org
174             Email amackey-at-virginia.edu
175              
176             =head1 CONTRIBUTORS
177              
178             Albert Vilella avilella-AT-gmail-DOT-com
179             Sendu Bala bix@sendu.me.uk
180             Dave Messina dmessina@cpan.org
181              
182             =head1 TODO
183              
184             RST parsing -- done, Avilella contributions bug#1506, added by jason 1.29
185             -- still need to parse in joint probability and non-syn changes
186             at site table
187              
188             =head1 APPENDIX
189              
190             The rest of the documentation details each of the object methods.
191             Internal methods are usually preceded with a _
192              
193             =cut
194              
195             # Let the code begin...
196              
197             package Bio::Tools::Phylo::PAML;
198 1     1   636 use vars qw($RSTFILENAME);
  1         1  
  1         34  
199 1     1   3 use strict;
  1         1  
  1         19  
200              
201             # Object preamble - inherits from Bio::Root::Root
202              
203 1     1   3 use base qw(Bio::Root::Root Bio::Root::IO Bio::AnalysisParserI);
  1         8  
  1         296  
204              
205             BEGIN {
206 1     1   13 $RSTFILENAME = 'rst'; # where to get the RST data from
207             }
208              
209             # other objects used:
210 1     1   4 use IO::String;
  1         1  
  1         17  
211 1     1   2 use File::Spec;
  1         1  
  1         14  
212 1     1   246 use Bio::TreeIO;
  1         2  
  1         23  
213 1     1   389 use Bio::Tools::Phylo::PAML::Result;
  1         2  
  1         26  
214 1     1   291 use Bio::LocatableSeq;
  1         3  
  1         24  
215 1     1   5 use Bio::PrimarySeq;
  1         1  
  1         16  
216 1     1   277 use Bio::Matrix::PhylipDist;
  1         1  
  1         21  
217 1     1   324 use Bio::Tools::Phylo::PAML::ModelResult;
  1         2  
  1         7970  
218              
219             =head2 new
220              
221             Title : new
222             Usage : my $obj = Bio::Tools::Phylo::PAML->new(%args);
223             Function: Builds a new Bio::Tools::Phylo::PAML object
224             Returns : Bio::Tools::Phylo::PAML
225             Args : Hash of options: -file, -fh, -dir
226             -file (or -fh) should contain the contents of the PAML
227             outfile;
228             -dir is the (optional) name of the directory in
229             which the PAML program was run (and includes other
230             PAML-generated files from which we can try to gather data)
231              
232             =cut
233              
234             sub new {
235              
236 20     20 1 48 my ( $class, @args ) = @_;
237              
238 20         77 my $self = $class->SUPER::new(@args);
239 20         68 $self->_initialize_io(@args);
240 20         69 my ($dir) = $self->_rearrange( [qw(DIR)], @args );
241 20 100       59 $self->{_dir} = $dir if defined $dir;
242 20         54 return $self;
243             }
244              
245             =head2 Implement Bio::AnalysisParserI interface
246              
247             =cut
248              
249             =head2 next_result
250              
251             Title : next_result
252             Usage : $result = $obj->next_result();
253             Function: Returns the next result available from the input, or
254             undef if there are no more results.
255             Example :
256             Returns : a Bio::Tools::Phylo::PAML::Result object
257             Args : none
258              
259             =cut
260              
261             sub next_result {
262              
263 20     20 1 2579 my ($self) = @_;
264 20         27 my %data;
265              
266             # parse the RST file, if it doesn't exist or if dir is not set
267             # this will just skip the parsing
268 20         61 $self->_parse_rst();
269 20         26 my $idlookup; # a hashreference to SEQID (number) ==> 'SEQUENCENAME'
270             # get the various codon and other sequence summary data, if necessary:
271             $self->_parse_summary
272 20 50 33     91 unless ( $self->{'_summary'} && !$self->{'_summary'}->{'multidata'} );
273              
274             # OK, depending on seqtype and runmode now, one of a few things can happen:
275 20         47 my $seqtype = $self->{'_summary'}->{'seqtype'};
276 20 100 100     92 if ( $seqtype eq 'CODONML' || $seqtype eq 'AAML' ) {
    100          
    50          
277 16         16 my $has_model_line = 0;
278 16         38 while ( defined( $_ = $self->_readline ) ) {
279 73 100 100     789 if ( $seqtype eq 'CODONML'
    100 100        
    100 100        
    100 100        
    100 100        
    50 66        
    50          
    50          
280             && m/^pairwise comparison, codon frequencies:/ )
281             {
282              
283             # runmode = -2, CODONML
284 8         36 $self->debug("pairwise Ka/Ks\n");
285 8         16 $self->_pushback($_);
286 8         19 %data = $self->_parse_PairwiseCodon;
287 8         14 last;
288             }
289             elsif ( $seqtype eq 'AAML' && m/^ML distances of aa seqs\.$/ ) {
290 1         5 $self->_pushback($_);
291              
292             # get AA distances
293 1         3 %data = ( '-AAMLdistmat' => $self->_parse_aa_dists() );
294              
295             # $self->_pushback($_);
296             # %data = $self->_parse_PairwiseAA;
297             # last;
298             }
299             elsif (
300             m/^Model\s+(\d+)/
301             || ( ( !$has_model_line && m/^TREE/ )
302             && $seqtype eq 'CODONML'
303             && ($self->{'_summary'}->{'version'} !~ /4/))
304             # last bit to keep PAML >= 4 from being caught here
305             # bug 2482. Not sure this is the right fix, but tests
306             # pass and the bug's test case passes.
307             )
308             {
309 9         24 $self->_pushback($_);
310 9         26 my $model = $self->_parse_NSsitesBatch;
311 9         16 push @{ $data{'-NSsitesresults'} }, $model;
  9         26  
312 9         33 $has_model_line = 1;
313             }
314             elsif (m/for each branch/) {
315 2         6 my %branch_dnds = $self->_parse_branch_dnds;
316 2 50       6 if ( !defined $data{'-trees'} ) {
317 0         0 $self->warn(
318             "No trees have been loaded, can't do anything\n");
319 0         0 next;
320             }
321 2         2 my ($tree) = @{ $data{'-trees'} };
  2         3  
322 2 50 33     18 if ( !$tree
      33        
323             || !ref($tree)
324             || !$tree->isa('Bio::Tree::Tree') )
325             {
326 0         0 $self->warn("no tree object already stored!\n");
327 0         0 next;
328             }
329              
330             # These need to be added to the Node/branches
331 2         9 while ( my ( $k, $v ) = each %branch_dnds ) {
332              
333             # we can probably do better by caching at some point
334 22         23 my @nodes;
335 22         51 for my $id ( split( /\.\./, $k ) ) {
336             my @nodes_L =
337 98         207 map { $tree->find_node( -id => $_ ) }
338 44         36 @{ $idlookup->{$id} };
  44         60  
339 44 100       108 my $n =
340             @nodes_L < 2
341             ? shift(@nodes_L)
342             : $tree->get_lca(@nodes_L);
343 44 50       60 if ( !$n ) {
344 0         0 $self->warn("no node for $n\n");
345             }
346 44 100 66     75 unless ( $n->is_Leaf && $n->id ) {
347 30         42 $n->id($id);
348             }
349 44         65 push @nodes, $n;
350             }
351 22         24 my ( $parent, $child ) = @nodes;
352 22         85 while ( my ( $kk, $vv ) = each %$v ) {
353 198         220 $child->add_tag_value( $kk, $vv );
354             }
355             }
356             }
357             elsif (m/^TREE/) {
358              
359             # runmode = 0
360 3         7 $self->_pushback($_);
361 3         12 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
362              
363             #last;
364             }
365             elsif (m/Heuristic tree search by stepwise addition$/) {
366              
367             # runmode = 3
368 0         0 $self->throw(
369             -class => 'Bio::Root::NotImplemented',
370             -text => "StepwiseAddition not yet implemented!"
371             );
372              
373             # $self->_pushback($_);
374             # %data = $self->_parse_StepwiseAddition;
375             # last;
376              
377             }
378             elsif (m/Heuristic tree search by NNI perturbation$/) {
379              
380             # runmode = 4
381 0         0 $self->throw(
382             -class => 'Bio::Root::NotImplemented',
383             -text => "NNI Perturbation not yet implemented!"
384             );
385              
386             # $self->_pushback($_);
387             # %data = $self->_parse_Perturbation;
388             # last;
389              
390             }
391             elsif (m/^stage 0:/) {
392              
393             # runmode = (1 or 2)
394 0         0 $self->throw(
395             -class => 'Bio::Root::NotImplemented',
396             -text => "StarDecomposition not yet implemented!"
397             );
398              
399 0         0 $self->_pushback($_);
400 0         0 %data = $self->_parse_StarDecomposition;
401 0         0 last;
402             }
403             }
404             }
405             elsif ( $seqtype eq 'BASEML' ) {
406 2         5 while ( defined( $_ = $self->_readline ) ) {
407 3 100       13 if (/^Distances:/) {
    50          
408 2         6 $self->_pushback($_);
409 2         7 my ( $kappa, $alpha ) = $self->_parse_nt_dists();
410 2         11 %data = (
411             '-kappa_distmat' => $kappa,
412             '-alpha_distmat' => $alpha
413             );
414             }
415             elsif (/^TREE/) {
416 1         4 $self->_pushback($_);
417 1         4 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
418             }
419             }
420             }
421             elsif ( $seqtype eq 'YN00' ) {
422 2         5 while ( $_ = $self->_readline ) {
423 3 100       9 if (
424             m/^Estimation by the method|\(B\) Yang & Nielsen \(2000\) method/
425             )
426             {
427 2         7 $self->_pushback($_);
428 2         7 %data = $self->_parse_YN_Pairwise;
429 2         2 last;
430             }
431             }
432             }
433 20 50       39 if (%data) {
434 20         45 $data{'-version'} = $self->{'_summary'}->{'version'};
435 20         34 $data{'-seqs'} = $self->{'_summary'}->{'seqs'};
436 20         30 $data{'-patterns'} = $self->{'_summary'}->{'patterns'};
437 20         30 $data{'-ngmatrix'} = $self->{'_summary'}->{'ngmatrix'};
438 20         35 $data{'-codonpos'} = $self->{'_summary'}->{'codonposition'};
439 20         27 $data{'-codonfreq'} = $self->{'_summary'}->{'codonfreqs'};
440 20         33 $data{'-model'} = $self->{'_summary'}->{'model'};
441 20         35 $data{'-seqfile'} = $self->{'_summary'}->{'seqfile'};
442 20         31 $data{'-aadistmat'} = $self->{'_summary'}->{'aadistmat'};
443 20         34 $data{'-stats'} = $self->{'_summary'}->{'stats'};
444 20         21 $data{'-aafreq'} = $self->{'_summary'}->{'aafreqs'};
445 20         22 $data{'-ntfreq'} = $self->{'_summary'}->{'ntfreqs'};
446 20         31 $data{'-input_params'} = $self->{'_summary'}->{'inputparams'};
447 20         36 $data{'-rst'} = $self->{'_rst'}->{'rctrted_seqs'};
448 20         23 $data{'-rst_persite'} = $self->{'_rst'}->{'persite'};
449 20         23 $data{'-rst_trees'} = $self->{'_rst'}->{'trees'};
450 20         217 return Bio::Tools::Phylo::PAML::Result->new(%data);
451             }
452             else {
453 0         0 return;
454             }
455             }
456              
457             sub _parse_summary {
458 20     20   22 my ($self) = @_;
459              
460             # Depending on whether verbose > 0 or not, and whether the result
461             # set comes from a multi-data run, the first few lines could be
462             # various things; we're going to throw away any sequence data
463             # here, since we'll get it later anyways
464              
465             # multidata ? : \n\nData set 1\n
466             # verbose ? : cleandata ? : \nBefore deleting alignment gaps. \d sites\n
467             # [ sequence printout ]
468             # \nAfter deleting gaps. \d sites\n"
469             # : [ sequence printout ]
470             # CODONML (in paml 3.12 February 2002) <<-- what we want to see!
471              
472 20         95 my $SEQTYPES = qr( (?: (?: CODON | AA | BASE | CODON2AA ) ML ) | YN00 )x;
473 20         25 my $line;
474 20 50       59 $self->{'_already_parsed_seqs'} = $self->{'_already_parsed_seqs'} ? 1 : 0;
475 20         21 my @lines;
476 20         65 while ( $_ = $self->_readline ) {
477 185         189 push @lines, $_;
478 185 100 100     2728 if (m/^($SEQTYPES) \s+ # seqtype: CODONML, AAML, BASEML, CODON2AAML, YN00, etc
    100 100        
    100 66        
    50          
    100          
    50          
479             (?: \(in \s+ ([^\)]+?) \s* \) \s* )? # version: "paml 3.12 February 2002"; not present < 3.1 or YN00
480             (\S+) \s* # tree filename
481             (?: (.+?) )? # model description (not there in YN00)
482             \s* $ # trim any trailing space
483             /ox
484             )
485             {
486 20         44 @{ $self->{'_summary'} }{qw(seqtype version seqfile model)} =
  20         138  
487             ( $1, $2, $3, $4 );
488              
489             # in 4.3, the model is on its own line
490 20 100       75 if ( !defined $self->{'_summary'}->{'model'} ) {
491 8         18 my $model_line = $self->_readline;
492 8         13 chomp $model_line;
493 8 100       24 if ($model_line =~ /^Model:/) {
494 6         11 $self->{'_summary'}->{'model'} = $model_line;
495             }
496             }
497              
498             defined $self->{'_summary'}->{'model'}
499 20 100       108 && $self->{'_summary'}->{'model'} =~ s/Model:\s+//;
500             $self->_pushback($_)
501 20 50       59 if $self->{'_summary'}->{'seqtype'} eq 'AAMODEL';
502 20         32 last;
503             }
504             elsif ((m/\s+?\d+?\s+?\d+?/) && ( $self->{'_already_parsed_seqs'} != 1 )) {
505 6         18 $self->_parse_seqs;
506             }
507             elsif (m/^Data set \d$/) {
508 1         3 $self->{'_summary'} = {};
509 1         4 $self->{'_summary'}->{'multidata'}++;
510             }
511             elsif (m/^Before\s+deleting\s+alignment\s+gaps/) { #Gap
512 0         0 my ($phylip_header) = $self->_readline;
513 0         0 $self->_parse_seqs;
514             }
515             elsif ( ( @lines >= 3 ) && ( $self->{'_already_parsed_seqs'} != 1 ) )
516             { #No gap
517             # don't start parsing seqs yet if we're on a blank line
518             # (gives another opportunity to match one of the other regexes)
519 5 100       21 unless (/^\n$/) {
520 2         8 $self->_parse_seqs;
521             }
522             }
523             elsif ( (/Printing out site pattern counts/)
524             && ( $self->{'_already_parsed_seqs'} != 1 ) ) {
525 0         0 $self->_parse_patterns;
526             }
527             }
528              
529 20 50       50 unless ( defined $self->{'_summary'}->{'seqtype'} ) {
530 0         0 $self->throw(
531             -class => 'Bio::Root::NotImplemented',
532             -text => 'Unknown format of PAML output did not see seqtype'
533             );
534             }
535 20         37 my $seqtype = $self->{'_summary'}->{'seqtype'};
536 20 100       68 if ( $seqtype eq "CODONML" ) {
    100          
    50          
    100          
    50          
537 14         34 $self->_parse_inputparams(); # settings from the .ctl file
538             # that get printed
539 14         38 $self->_parse_patterns(); # codon patterns - not very interesting
540 14         34 $self->_parse_seqs(); # the sequences data used for analysis
541 14         33 $self->_parse_codoncts(); # counts and distributions of codon/nt
542             # usage
543 14         34 $self->_parse_codon_freqs(); # codon frequencies
544 14         38 $self->_parse_distmat(); # NG distance matrices
545             }
546             elsif ( $seqtype eq "AAML" ) {
547 2         8 $self->_parse_inputparams;
548 2         6 $self->_parse_patterns();
549 2         21 $self->_parse_seqs(); # the sequences data used for analysis
550 2         7 $self->_parse_aa_freqs(); # AA frequencies
551             # get AA distances
552 2         6 $self->{'_summary'}->{'aadistmat'} = $self->_parse_aa_dists();
553              
554             }
555             elsif ( $seqtype eq "CODON2AAML" ) {
556 0         0 $self->throw(
557             -class => 'Bio::Root::NotImplemented',
558             -text => 'CODON2AAML parsing not yet implemented!'
559             );
560             }
561             elsif ( $seqtype eq "BASEML" ) {
562 2         8 $self->_parse_patterns();
563 2         7 $self->_parse_seqs();
564 2         7 $self->_parse_nt_freqs();
565              
566             }
567             elsif ( $seqtype eq "YN00" ) {
568 2         5 $self->_parse_codon_freqs();
569 2         6 $self->_parse_codoncts();
570 2         4 $self->_parse_distmat(); # NG distance matrices
571             }
572             else {
573 0         0 $self->throw(
574             -class => 'Bio::Root::NotImplemented',
575             -text => 'Unknown seqtype, not yet implemented!',
576             -value => $seqtype
577             );
578             }
579              
580             }
581              
582             sub _parse_inputparams {
583 16     16   20 my ($self) = @_;
584 16         39 while ( defined( $_ = $self->_readline ) ) {
585 39 100 66     273 if (/^((?:Codon frequencies)|(?:Site-class models))\s*:\s+(.+)/) {
    100          
    100          
586 10         27 my ( $param, $val ) = ( $1, $2 );
587 10         40 $self->{'_summary'}->{'inputparams'}->{$param} = $val;
588             }
589             elsif (/^\s+$/) {
590 4         7 next;
591             }
592             elsif ( /^ns\s+=\s+/ || /^Frequencies/ ) {
593 16         50 $self->_pushback($_);
594 16         23 last;
595             }
596             }
597             }
598              
599             sub _parse_codon_freqs {
600 16     16   17 my ($self) = @_;
601 16         21 my ( $okay, $done ) = ( 0, 0 );
602              
603 16         32 while ( defined( $_ = $self->_readline ) ) {
604 1495 100       9145 if (/^Nei|\(A\) Nei/) { $self->_pushback($_); last }
  2         7  
  2         3  
605 1493 100       1635 last if ($done);
606 1479 100       2490 next if (/^\s+/);
607             next
608 922 100 100     2928 unless ( $okay || /^Codon position x base \(3x4\) table\, overall/ );
609 56         44 $okay = 1;
610 56 100       182 if (s/^position\s+(\d+):\s+//) {
611 42         73 my $pos = $1;
612 42         114 s/\s+$//;
613 42         91 my @bases = split;
614 42         55 foreach my $str (@bases) {
615 168         229 my ( $base, $freq ) = split( /:/, $str, 2 );
616 168         333 $self->{'_summary'}->{'codonposition'}->[ $pos - 1 ]->{$base} =
617             $freq;
618             }
619 42 100       131 $done = 1 if $pos == 3;
620             }
621             }
622 16         18 $done = 0;
623 16         33 while ( defined( $_ = $self->_readline ) ) {
624 37 100       113 if (/^Nei\s\&\sGojobori|\(A\)\sNei-Gojobori/) {
625 8         21 $self->_pushback($_);
626 8         17 last;
627             }
628 29 100       52 last if ($done);
629 21 100       60 if (/^Codon frequencies under model, for use in evolver/) {
630 8         21 while ( defined( $_ = $self->_readline ) ) {
631 136 100       260 last if (/^\s+$/);
632 128         194 s/^\s+//;
633 128         241 s/\s+$//;
634 128         84 push @{ $self->{'_summary'}->{'codonfreqs'} }, [split];
  128         427  
635             }
636 8         15 $done = 1;
637             }
638             }
639             }
640              
641             sub _parse_aa_freqs {
642 2     2   3 my ($self) = @_;
643 2         5 my ( $okay, $done, $header ) = ( 0, 0, 0 );
644 2         1 my (@bases);
645 2 50       2 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  2         21  
646 2         5 while ( defined( $_ = $self->_readline ) ) {
647 34 100 66     99 if ( /^TREE/ || /^AA distances/ ) {
648 1         2 $self->_pushback($_);
649 1         3 last;
650             }
651 33 100       37 last if ($done);
652 32 100 100     108 next if ( /^\s+$/ || /^\(Ambiguity/ );
653 20 100       70 if (/^Frequencies\./) {
    50          
    100          
    100          
    100          
654 2         4 $okay = 1;
655             }
656             elsif ( !$okay ) { # skip till we see 'Frequencies.
657 0         0 next;
658             }
659             elsif ( !$header ) {
660 2         8 s/^\s+//; # remove leading whitespace
661 2         10 @bases = split; # get an array of the all the aa names
662 2         3 $header = 1;
663 2         5 $self->{'_summary'}->{'aafreqs'} = {}; # reset/clear values
664 2         4 next;
665             }
666             elsif (
667             /^\#\s+constant\s+sites\:\s+
668             (\d+)\s+ # constant sites
669             \(\s*([\d\.]+)\s*\%\s*\)/x
670             )
671             {
672 2         11 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
673 2         7 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
674             }
675             elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/x) {
676 1         4 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
677 1         4 $done = 1; # done for sure
678             }
679             else {
680 13         66 my ( $seqname, @freqs ) = split;
681 13         16 my $basect = 0;
682 13         16 foreach my $f (@freqs) {
683              
684             # this will also store 'Average'
685             $self->{'_summary'}->{'aafreqs'}->{$seqname}
686 260         321 ->{ $bases[ $basect++ ] } = $f;
687             }
688             }
689             }
690             }
691              
692             # This is for parsing the automatic tree output
693              
694             sub _parse_StarDecomposition {
695 0     0   0 my ($self) = @_;
696 0         0 my %data;
697              
698 0         0 return %data;
699             }
700              
701             sub _parse_aa_dists {
702 3     3   5 my ($self) = @_;
703 3         4 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
704 3         3 my ( %matrix, @names, @values );
705 3 50       4 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  3         8  
706 3         4 my $type = '';
707 3         6 while ( defined( $_ = $self->_readline ) ) {
708 23 100       30 last if $done;
709 21 50       25 if (/^TREE/) { $self->_pushback($_); last; }
  0         0  
  0         0  
710 21 100       42 if (/^\s+$/) {
711 2 50       5 last if ($seen);
712 2         2 next;
713             }
714 19 100       32 if (/^(AA|ML) distances/) {
715 3         4 $okay = 1;
716 3         6 $type = $1;
717 3         6 next;
718             }
719 16         51 s/\s+$//g; # remove trailing space
720 16 50       23 if ($okay) {
721 16         29 my ( $seqname, @vl ) = split;
722 16         13 $seen = 1;
723 16         11 my $i = 0;
724              
725             # hacky workaround to problem with 3.14 aaml
726 16 50 100     54 if (
      66        
727             $type eq 'ML'
728             && !@names
729             && # first entry
730             @vl
731             )
732             { # not empty
733 0         0 push @names, $self->{'_summary'}->{'seqs'}->[0]->display_id;
734             }
735 16         18 for my $s (@names) {
736 35 50       42 last unless @vl;
737 35         54 $matrix{$seqname}->{$s} = $matrix{$s}->{$seqname} = shift @vl;
738             }
739 16         15 push @names, $seqname;
740              
741 16         22 $matrix{$seqname}->{$seqname} = 0;
742             }
743 16 100       35 $done = 1 if ( scalar @names == $numseqs );
744             }
745 3         3 my %dist;
746 3         3 my $i = 0;
747 3         5 @values = ();
748 3         4 foreach my $lname (@names) {
749 16         10 my @row;
750 16         11 my $j = 0;
751 16         12 foreach my $rname (@names) {
752 86         63 my $v = $matrix{$lname}->{$rname};
753 86 50       90 $v = $matrix{$rname}->{$lname} unless defined $v;
754 86         65 push @row, $v;
755 86         116 $dist{$lname}{$rname} = [ $i, $j++ ];
756             }
757 16         13 $i++;
758 16         19 push @values, \@row;
759             }
760             return new Bio::Matrix::PhylipDist(
761 3         29 -program => $self->{'_summary'}->{'seqtype'},
762             -matrix => \%dist,
763             -names => \@names,
764             -values => \@values
765             );
766             }
767              
768             sub _parse_patterns {
769 18     18   26 my ($self) = @_;
770 18         25 my ( $patternct, @patterns, $ns, $ls );
771 18 50       44 return if exists $self->{'_summary'}->{'patterns'};
772              
773 18         38 while ( defined( $_ = $self->_readline ) ) {
774 99 100 66     507 if ( /^Codon\s+(usage|position)/ || /Model/ ) {
    100          
    100          
    100          
775 11         22 $self->_pushback($_);
776 11         13 last;
777             }
778             elsif ($patternct) {
779              
780             # last unless ( @patterns == $patternct );
781 52 100       104 last if (/^\s+$/);
782 45         83 s/^\s+//;
783 45         228 push @patterns, split;
784             }
785             elsif (/^ns\s+\=\s*(\d+)\s+ls\s+\=\s*(\d+)/) {
786 18         53 ( $ns, $ls ) = ( $1, $2 );
787             }
788             elsif (/^\# site patterns \=\s*(\d+)/) {
789 7         20 $patternct = $1;
790             }
791             else {
792              
793             # $self->debug("Unknown line: $_");
794             }
795             }
796 18         115 $self->{'_summary'}->{'patterns'} = {
797             -patterns => \@patterns,
798             -ns => $ns,
799             -ls => $ls
800             };
801             }
802              
803             sub _parse_seqs {
804              
805             # this should in fact be packed into a Bio::SimpleAlign object instead of
806             # an array but we'll stay with this for now
807 26     26   39 my ($self) = @_;
808              
809             # Use this flag to deal with paml 4 vs 3 differences
810             # In PAML 4 the sequences precede the CODONML|BASEML|AAML
811             # while in PAML3 the files start off with this
812 26 100       67 return 1 if $self->{'_already_parsed_seqs'};
813 18         19 my ( @firstseq, @seqs );
814 18         41 while ( defined( $_ = $self->_readline ) ) {
815 91 100       214 if (/^(Printing|After|TREE|Codon)/) {
816 5         13 $self->_pushback($_);
817 5         6 last;
818             }
819 86 100 100     302 last if ( /^\s+$/ && @seqs > 0 );
820 73 100       142 next if (/^\s+$/);
821 67 100       384 next if (/^\d+\s+$/);
822              
823             # we are reading PHYLIP format
824 66         218 my ( $name, $seqstr ) = split( /\s+/, $_, 2 );
825 66         3714 $seqstr =~ s/\s+//g; # remove whitespace
826 66 100       114 unless (@firstseq) {
827 13         716 @firstseq = split( //, $seqstr );
828 13         116 push @seqs,
829             Bio::LocatableSeq->new(
830             -display_id => $name,
831             -seq => $seqstr
832             );
833             }
834             else {
835              
836 53         51 my $i = 0;
837 53         47 my $v;
838 53         148 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
839              
840             # replace the '.' with the correct seq from the
841 2741         1685 substr( $seqstr, $v, 1, $firstseq[$v] );
842 2741         2789 $i = $v;
843             }
844 53         169 push @seqs,
845             Bio::LocatableSeq->new(
846             -display_id => $name,
847             -seq => $seqstr
848             );
849             }
850             }
851 18 100       40 if ( @seqs > 0 ) {
852 13         27 $self->{'_summary'}->{'seqs'} = \@seqs;
853 13         16 $self->{'_already_parsed_seqs'} = 1;
854             }
855 18         277 1;
856             }
857              
858       16     sub _parse_codoncts { }
859              
860             sub _parse_distmat {
861 16     16   15 my ($self) = @_;
862 16         13 my @results;
863 16         24 my $ver = 3.14;
864 16         15 my $firstseq, my $secondseq;
865              
866 16         28 while ( defined( $_ = $self->_readline ) ) {
867 24 100       71 next if /^\s+$/;
868              
869             # We need to get the names of the sequences if this is from YN00:
870 16 100       34 if (/^\(A\)\sNei-Gojobori\s\(1986\)\smethod/) {
871 1         2 $ver = 3.15;
872 1         2 while ( defined( $_ = $self->_readline ) ) {
873 9 100       27 if ($_ =~ m/.*\d+?\.\d+?\s*\(.*/) {
874 1         1 $secondseq = $_;
875 1         2 last;
876             }
877 8         13 $firstseq = $_;
878             }
879             }
880 16         17 last;
881             }
882              
883             #return unless (/^Nei\s*\&\s*Gojobori/);
884              
885             # skip the next 3 lines
886 16 100       43 if ( $self->{'_summary'}->{'seqtype'} eq 'CODONML' ) {
887 14         26 $self->_readline;
888 14         31 $self->_readline;
889 14         36 $self->_readline;
890             }
891 16         20 my $seqct = 0;
892 16         15 my @seqs;
893 16 100       33 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
894 2 100       6 if ($firstseq) {
895 1         5 $firstseq =~ s/(.+?)\s+.*/$1/;
896 1         4 $secondseq =~ s/(.+?)\s+.*/$1/;
897 1         3 chomp $firstseq;
898 1         1 chomp $secondseq;
899 1         8 push @seqs, Bio::PrimarySeq->new( -display_id => $firstseq );
900 1         7 push @seqs, Bio::PrimarySeq->new( -display_id => $secondseq );
901             }
902             }
903 16         34 while ( defined( $_ = $self->_readline ) ) {
904 98 50 66     271 last if ( /^\s+$/ && exists $self->{'_summary'}->{'ngmatrix'} );
905 82 50 33     311 next if ( /^\s+$/ || /^NOTE:/i );
906 82         81 chomp;
907              
908 82         62 my ( $seq, $rest );
909 82 100       113 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
910 12         34 ( $seq, $rest ) = split( /\s+/, $_, 2 );
911             }
912             else {
913 70         268 $_ =~ m/(.+?)\s*(-*\d+?\.\d+?.*)/;
914 70         99 $seq = $1;
915 70         77 $rest = $2;
916             }
917 82 100       155 $rest = '' unless defined $rest; # get rid of empty messages
918 82         60 my $j = 0;
919 82 100       123 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
920 12         67 push @seqs, Bio::PrimarySeq->new( -display_id => $seq );
921             }
922 82   100     406 while ($rest
923             && $rest =~
924             /(\-?\d+(\.\d+)?)\s*\(\-?(\d+(\.\d+)?)\s+(\-?\d+(\.\d+)?)\)/g )
925             {
926 218         1437 $self->{'_summary'}->{'ngmatrix'}->[ $j++ ]->[$seqct] = {
927             'omega' => $1,
928             'dN' => $3,
929             'dS' => $5
930             };
931             }
932 82         160 $seqct++;
933             }
934 16 100 66     51 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' && @seqs ) {
935 2         5 $self->{'_summary'}->{'seqs'} = \@seqs;
936             }
937              
938 16         200 1;
939             }
940              
941             sub _parse_PairwiseCodon {
942 8     8   7 my ($self) = @_;
943 8         9 my @result;
944 8         8 my ( $a, $b, $log, $model, $t, $kappa, $omega, $fixedkappa );
945             # check to see if we have a fixed kappa:
946 8 100       27 if ( $self->{'_summary'}->{'model'} =~ /kappa = (\d+?\.\d+?) fixed/) {
947 1         3 $fixedkappa = $1;
948             }
949 8         13 while ( defined( $_ = $self->_readline ) ) {
950 680 100       2158 if (/^pairwise comparison, codon frequencies\:\s*(\S+)\./) {
    100          
    100          
    100          
    50          
    0          
951 8         25 $model = $1;
952             }
953             # 1st line of a pair block, e.g.
954             # 2 (all_c7259) ... 1 (all_s57600)
955             elsif (/^(\d+)\s+\((\S+)\)\s+\.\.\.\s+(\d+)\s+\((\S+)\)/) {
956 112         252 ( $a, $b ) = ( $1, $3 );
957             }
958             # 2nd line of a pair block, e.g.
959             # lnL = -126.880601
960             elsif (/^lnL\s+\=\s*(\-?\d+(\.\d+)?)/) {
961 112         136 $log = $1;
962 112 50       146 if ( defined( $_ = $self->_readline ) ) {
963             # 3rd line of a pair block, e.g.
964             # 0.19045 2.92330 0.10941
965 112         232 s/^\s+//;
966 112         277 ( $t, $kappa, $omega ) = split;
967             # if there was a fixed kappa, there will only be two values here ($t, $omega) and $kappa = $fixedkappa.
968 112 100       289 if ($omega eq "") {
969 1         2 $omega = $kappa;
970 1         3 $kappa = $fixedkappa;
971             }
972             }
973             }
974             # 5th line of a pair block, e.g.
975             # t= 0.1904 S= 5.8 N= 135.2 dN/dS= 0.1094 dN= 0.0476 dS= 0.4353
976             # OR lines like (note last field; this includes a fix for bug #3040)
977             # t= 0.0439 S= 0.0 N= 141.0 dN/dS= 0.1626 dN= 0.0146 dS= nan
978             elsif (m/^t\=\s*(\d+(\.\d+)?)\s+/)
979             {
980             # Breaking out each piece individually so that you can see
981             # what each regexp actually looks for
982 112         89 my $parse_string = $_;
983 112         263 $parse_string =~ m/.*t\s*\=\s*(\d+?\.\d+?)\s/;
984 112         122 my $temp_t = $1;
985 112         194 $parse_string =~ m/\sS\s*\=\s*(\d+?\.\d+?)\s/;
986 112         97 my $temp_S = $1;
987 112         173 $parse_string =~ m/\sN\s*\=\s*(\d+?\.\d+?)\s/;
988 112         102 my $temp_N = $1;
989 112         186 $parse_string =~ m/\sdN\/dS\s*\=\s*(\d+?\.\d+?)\s/;
990 112         92 my $temp_omega = $1;
991 112         185 $parse_string =~ m/\sdN\s*\=\s*(\d+?\.\d+?)\s/;
992 112         101 my $temp_dN = $1;
993 112         168 $parse_string =~ m/\sdS\s*\=\s*(.+)\s/;
994 112         103 my $temp_dS = $1;
995 112 50 33     1109 $result[ $b - 1 ]->[ $a - 1 ] = {
    50 33        
996             'lnL' => $log,
997             't' => defined $t && length($t) ? $t : $temp_t,
998             'S' => $temp_S,
999             'N' => $temp_N,
1000             'kappa' => $kappa,
1001             'omega' => defined $omega && length($omega) ? $omega : $temp_omega,
1002             'dN' => $temp_dN,
1003             'dS' => $temp_dS
1004             };
1005             }
1006             # 4th line of a pair block (which is blank)
1007             elsif (/^\s+$/) {
1008 336         466 next;
1009             }
1010             elsif (/^\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)/) {
1011             }
1012             else {
1013 0         0 $self->debug("unknown line: $_");
1014             }
1015             }
1016 8         32 return ( -mlmatrix => \@result );
1017             }
1018              
1019             sub _parse_YN_Pairwise {
1020 2     2   3 my ($self) = @_;
1021 2         3 my @result;
1022 2         5 while ( defined( $_ = $self->_readline ) ) {
1023 11 100       24 last if (/^seq\.\s+seq\./);
1024             }
1025 2         3 while ( defined( $_ = $self->_readline ) ) {
1026 51 100       313 if (
    100          
    100          
1027             m/^\s+(\d+)\s+ # seq #
1028             (\d+)\s+ # seq #
1029             (\d+(\.\d+))\s+ # S
1030             (\d+(\.\d+))\s+ # N
1031             (\d+(\.\d+))\s+ # t
1032             (\d+(\.\d+))\s+ # kappa
1033             (\d+(\.\d+))\s+ # omega
1034             \-??(\d+(\.\d+))\s+ # dN
1035             \+\-\s+
1036             \-??(\d+(\.\d+))\s+ # dN SE
1037             \-??(\d+(\.\d+))\s+ # dS
1038             \+\-\s+
1039             \-??(\d+(\.\d+))\s+ # dS SE
1040             /ox
1041             )
1042             {
1043              
1044 43         302 $result[ $2 - 1 ]->[ $1 - 1 ] = {
1045             'S' => $3,
1046             'N' => $5,
1047             't' => $7,
1048             'kappa' => $9,
1049             'omega' => $11,
1050             'dN' => $13,
1051             'dN_SE' => $15,
1052             'dS' => $17,
1053             'dS_SE' => $19,
1054             };
1055             }
1056             elsif (/^\s+$/) {
1057 4         6 next;
1058             }
1059             elsif (/^\(C\) LWL85, LPB93 & LWLm methods/) {
1060 1         3 $self->_pushback($_);
1061 1         1 last;
1062             }
1063             }
1064 2         9 return ( -mlmatrix => \@result );
1065             }
1066              
1067             sub _parse_Forestry {
1068 13     13   20 my ($self) = @_;
1069 13         31 my ( $instancecount, $num_param, $loglikelihood, $score, $done,
1070             $treelength ) = ( 0, 0, 0, 0, 0, 0 );
1071 13         11 my $okay = 0;
1072 13         14 my ( @ids, %match, @branches, @trees );
1073 13         24 while ( defined( $_ = $self->_readline ) ) {
1074 175 50       211 last if $done;
1075 175 100 33     1868 if (s/^TREE\s+\#\s*\d+:\s+//) {
    100 33        
    100 33        
    100 66        
    100          
    100          
    100          
1076 13         66 ($score) = (s/MP\s+score\:\s+(\S+)\s+$//);
1077 13         105 @ids = /(\d+)[\,\)]/g;
1078             }
1079             elsif (/^Node\s+\&/
1080             || /^\s+N37/
1081             || /^(CODONML|AAML|YN00|BASEML)/
1082             || /^\*\*/
1083             || /^Detailed output identifying parameters/ )
1084             {
1085 11         30 $self->_pushback($_);
1086 11         11 $done = 1;
1087 11         25 last;
1088             }
1089             elsif (/^tree\s+length\s+\=\s+(\S+)/) {
1090 13         36 $treelength = $1; # not going to store this for now
1091             # as it is directly calculated from
1092             # $tree->total_branch_length;
1093             }
1094             elsif (/^\s*lnL\(.+np\:\s*(\d+)\)\:\s+(\S+)/) {
1095              
1096             # elsif( /^\s*lnL\(.+\)\:\s+(\S+)/ ) {
1097 13         50 ( $num_param, $loglikelihood ) = ( $1, $2 );
1098             }
1099             elsif (/^\(/) {
1100 26         249 s/([\,:])\s+/$1/g;
1101 26         150 my $treestr = IO::String->new($_);
1102 26         1115 my $treeio = Bio::TreeIO->new(
1103             -fh => $treestr,
1104             -format => 'newick'
1105             );
1106 26         53 my $tree = $treeio->next_tree;
1107 26 50       45 if ($tree) {
1108 26         39 $tree->score($loglikelihood);
1109 26         68 $tree->id("num_param:$num_param");
1110 26 100       57 if ( $okay > 0 ) {
1111              
1112             # we don't save the trees with the number labels
1113 13 50 33     62 if ( !%match && @ids ) {
1114 13         18 my $i = 0;
1115 13         137 for my $m (/([^():,]+):/g) {
1116 66         124 $match{ shift @ids } = [$m];
1117             }
1118 13         20 my %grp;
1119 13         38 while ( my $br = shift @branches ) {
1120 120         104 my ( $parent, $child ) = @$br;
1121 120 100       131 if ( $match{$child} ) {
1122 93         60 push @{ $match{$parent} }, @{ $match{$child} };
  93         102  
  93         196  
1123             }
1124             else {
1125 27         51 push @branches, $br;
1126             }
1127             }
1128 13 50       43 if ( $self->verbose > 1 ) {
1129 0         0 for my $k ( sort { $a <=> $b } keys %match ) {
  0         0  
1130             $self->debug( "$k -> ",
1131 0         0 join( ",", @{ $match{$k} } ), "\n" );
  0         0  
1132             }
1133             }
1134             }
1135              
1136             # Associate SEs to nodes using tags
1137 13 100       31 if ( defined( $self->{_SEs} ) ) {
1138 3         13 my @SEs = split( " ", $self->{_SEs} );
1139 3         6 my $i = 0;
1140 3         11 foreach my $parent_id ( map { /\d+\.\.(\d+)/ }
  33         52  
1141             split( " ", $self->{_branch_ids} ) )
1142             {
1143 33         27 my @nodes;
1144 33         18 my @node_ids = @{ $match{$parent_id} };
  33         51  
1145             my @nodes_L =
1146 33         32 map { $tree->find_node( -id => $_ ) } @node_ids;
  48         110  
1147 33 100       66 my $n =
1148             @nodes_L < 2
1149             ? shift(@nodes_L)
1150             : $tree->get_lca(@nodes_L);
1151 33 50       48 if ( !$n ) {
1152 0         0 $self->warn(
1153             "no node could be found for node in SE assignation (no lca?)"
1154             );
1155             }
1156 33         58 $n->add_tag_value( 'SE', $SEs[$i] );
1157 33         50 $i++;
1158             }
1159             }
1160 13         27 push @trees, $tree;
1161             }
1162             }
1163 26         77 $okay++;
1164             }
1165             elsif (/^SEs for parameters/) {
1166 3         7 my $se_line = $self->_readline;
1167 3         8 $se_line =~ s/\n//;
1168 3         8 $self->{_SEs} = $se_line;
1169             }
1170             elsif (/^\s*\d+\.\.\d+/) {
1171 13         49 push @branches, map { [ split( /\.\./, $_ ) ] } split;
  93         170  
1172 13         26 my $ids = $_;
1173 13         30 $ids =~ s/\n//;
1174 13         41 $self->{_branch_ids} = $ids;
1175             }
1176             }
1177 13         90 return \@trees, \%match;
1178             }
1179              
1180             sub _parse_NSsitesBatch {
1181 9     9   21 my $self = shift;
1182 9         11 my ( %data, $idlookup );
1183 9         13 my ( $okay, $done ) = ( 0, 0 );
1184 9         19 while ( defined( $_ = $self->_readline ) ) {
1185 120 100       153 last if $done;
1186 115 100       319 next if /^\s+$/;
1187 68 50 100     163 next unless ( $okay || /^Model\s+\d+/ || /^TREE/ );
      66        
1188              
1189 68 100       497 if (/^Model\s+(\d+)/) {
    100          
    100          
    100          
    100          
    100          
    100          
    50          
    50          
    100          
1190 7 50       14 if ($okay) {
1191              
1192             # this only happens if $okay was already 1 and
1193             # we hit a Model line
1194 0         0 $self->_pushback($_);
1195 0         0 $done = 1;
1196             }
1197             else {
1198 7         13 chomp;
1199 7         21 $data{'-model_num'} = $1;
1200 7         28 ( $data{'-model_description'} ) = (/\:\s+(.+)/);
1201 7         16 $okay = 1;
1202             }
1203             }
1204             elsif (/^Time used\:\s+(\S+)/) {
1205 7         30 $data{'-time_used'} = $1;
1206 7         18 $done = 1;
1207             }
1208             elsif (/^kappa\s+\(ts\/tv\)\s+\=\s+(\S+)/) {
1209 9         30 $data{'-kappa'} = $1;
1210             }
1211             elsif (/^TREE/) {
1212 9         19 $self->_pushback($_);
1213 9         26 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
1214 9 50 50     38 if ( defined $data{'-trees'}
1215 9         40 && scalar @{ $data{'-trees'} } )
1216             {
1217 9         29 $data{'-likelihood'} = $data{'-trees'}->[0]->score;
1218             }
1219 9         30 $okay = 1;
1220             }
1221             elsif (/^omega\s+\(dn\/ds\)\s+\=\s+(\S+)/i) {
1222              
1223             # for M0 (single ratio for the entire tree)
1224             # explicitly put '1.00000' rather than '1', because \d+\.\d{5}
1225             # is reported in all other cases.
1226 1         4 my @p = (q/1.00000/); # since there is only one class,
1227 1         3 my @w = $1;
1228 1         5 $data{'-dnds_site_classes'} = {
1229             'p' => \@p,
1230             'w' => \@w
1231             };
1232              
1233             # since no K=X is provided, put 1 here
1234 1         7 $data{q/-num_site_classes/} = 1;
1235             }
1236             elsif (
1237             /^(Naive Empirical Bayes)|(Bayes Empirical Bayes)|(Positively\sselected\ssites)/i
1238             )
1239             {
1240 6         23 $self->_pushback($_);
1241 6         20 my ( $sites, $neb, $beb ) = $self->_parse_Pos_selected_sites;
1242 6         12 $data{'-pos_sites'} = $sites;
1243 6         11 $data{'-neb_sites'} = $neb;
1244 6         16 $data{'-beb_sites'} = $beb;
1245             }
1246             elsif (/^dN/i) {
1247 15 100       67 if (/K\=(\d+)/) {
    50          
1248 7         24 $data{'-num_site_classes'} = $1;
1249 7         22 while ( $_ = $self->_readline ) {
1250 12 100       47 unless ( $_ =~ /^\s+$/ ) {
1251 7         17 $self->_pushback($_);
1252 7         9 last;
1253             }
1254             }
1255 7 100       18 if (/^site class/) {
1256 1         5 $self->_readline;
1257 1         3 my $tmp = $self->_readline;
1258 1         6 my @p = $tmp =~ /(\d+\.\d{5})/g;
1259 1         4 $tmp = $self->_readline;
1260 1         6 my @b_w = $tmp =~ /(\d+\.\d{5})/g;
1261 1         3 $tmp = $self->_readline;
1262 1         6 my @f_w = $tmp =~ /(\d+\.\d{5})/g;
1263 1         2 my @w;
1264              
1265 1         3 foreach my $i ( 0 .. $#b_w ) {
1266 4         8 push @w,
1267             {
1268             q/background/ => $b_w[$i],
1269             q/foreground/ => $f_w[$i]
1270             };
1271             }
1272 1         6 $data{'-dnds_site_classes'} = {
1273             q/p/ => \@p,
1274             q/w/ => \@w
1275             };
1276             }
1277             else {
1278 6         13 my $tmp = $self->_readline;
1279 6         43 my @p = $tmp =~ /(\d+\.\d{5})/g;
1280 6         16 $tmp = $self->_readline;
1281 6         31 my @w = $tmp =~ /(\d+\.\d{5})/g;
1282 6         34 $data{'-dnds_site_classes'} = {
1283             'p' => \@p,
1284             'w' => \@w
1285             };
1286             }
1287             }
1288             elsif (/for each branch/) {
1289 8         27 my %branch_dnds = $self->_parse_branch_dnds;
1290 8 50       27 if ( !defined $data{'-trees'} ) {
1291 0         0 $self->warn(
1292             "No trees have been loaded, can't do anything\n");
1293 0         0 next;
1294             }
1295 8         8 my ($tree) = @{ $data{'-trees'} };
  8         20  
1296 8 50 33     54 if ( !$tree
      33        
1297             || !ref($tree)
1298             || !$tree->isa('Bio::Tree::Tree') )
1299             {
1300 0         0 $self->warn("no tree object already stored!\n");
1301 0         0 next;
1302             }
1303              
1304             # These need to be added to the Node/branches
1305 8         28 while ( my ( $k, $v ) = each %branch_dnds ) {
1306              
1307             # we can probably do better by caching at some point
1308 56         45 my @nodes;
1309 56         122 for my $id ( split( /\.\./, $k ) ) {
1310             my @nodes_L =
1311 241         752 map { $tree->find_node( -id => $_ ) }
1312 112         95 @{ $idlookup->{$id} };
  112         164  
1313 112 100       246 my $n =
1314             @nodes_L < 2
1315             ? shift(@nodes_L)
1316             : $tree->get_lca(@nodes_L);
1317 112 50       170 if ( !$n ) {
1318 0         0 $self->warn(
1319             "no node could be found for $id (no lca?)");
1320             }
1321 112 100 66     151 unless ( $n->is_Leaf && $n->id ) {
1322 72         116 $n->id($id);
1323             }
1324 112         154 push @nodes, $n;
1325             }
1326 56         80 my ( $parent, $child ) = @nodes;
1327 56         179 while ( my ( $kk, $vv ) = each %$v ) {
1328 504         560 $child->add_tag_value( $kk, $vv );
1329             }
1330             }
1331             }
1332             }
1333             elsif (/^Parameters in beta:/) {
1334 0         0 $_ = $self->_readline; # need the next line
1335 0 0       0 if (/p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1336 0         0 $data{'-shape_params'} = {
1337             'shape' => 'beta',
1338             'p' => $1,
1339             'q' => $2
1340             };
1341             }
1342             else {
1343 0         0 $self->warn("unparseable beta parameters: $_");
1344             }
1345             }
1346             elsif (/^Parameters in beta\&w\>1:/) {
1347              
1348             # Parameters in beta&w>1:
1349             # p0= 1.00000 p= 0.07642 q= 0.85550
1350             # (p1= 0.00000) w= 1.00000
1351 0         0 $_ = $self->_readline; # need the next line
1352 0         0 my ( $p0, $p, $q, $p1, $w );
1353 0 0       0 if (/p0\=\s+(\S+)\s+p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1354 0         0 $p0 = $1;
1355 0         0 $p = $2;
1356 0         0 $q = $3;
1357             }
1358             else {
1359 0         0 $self->warn("unparseable beta parameters: $_");
1360             }
1361 0         0 $_ = $self->_readline; # need the next line
1362 0 0       0 if (/\(p1\=\s+(\S+)\)\s+w\=\s*(\S+)/) {
1363 0         0 $p1 = $1;
1364 0         0 $w = $2;
1365 0         0 $data{'-shape_params'} = {
1366             'shape' => 'beta',
1367             'p0' => $p0,
1368             'p' => $p,
1369             'q' => $q,
1370             'p1' => $p1,
1371             'w' => $w
1372             };
1373             }
1374             else {
1375 0         0 $self->warn("unparseable beta parameters: $_");
1376             }
1377             }
1378             elsif (/^alpha\s+\(gamma\)\s+\=\s+(\S+)/) {
1379 3         7 my $gamma = $1;
1380 3         6 $_ = $self->_readline;
1381 3         4 my ( @r, @f );
1382 3 50       16 if (s/^r\s+\(\s*\d+\)\:\s+//) {
1383 3         8 @r = split;
1384             }
1385 3         9 $_ = $self->_readline;
1386 3 50       14 if (s/^f\s*\:\s+//) {
1387 3         6 @f = split;
1388             }
1389 3         31 $data{'-shape_params'} = {
1390             'shape' => 'alpha',
1391             'gamma' => $gamma,
1392             'r' => \@r,
1393             'f' => \@f
1394             };
1395             }
1396             }
1397 9         122 return new Bio::Tools::Phylo::PAML::ModelResult(%data);
1398             }
1399              
1400             sub _parse_Pos_selected_sites {
1401 6     6   11 my $self = shift;
1402 6         11 my $okay = 0;
1403 6         28 my (%sites) = (
1404             'default' => [],
1405             'neb' => [],
1406             'beb' => []
1407             );
1408 6         10 my $type = 'default';
1409 6         16 while ( defined( $_ = $self->_readline ) ) {
1410 116 100 100     422 next if ( /^\s+$/ || /^\s+Pr\(w\>1\)/ );
1411 80 100 66     207 if ( /^Time used/ || /^TREE/ ) {
1412 4         13 $self->_pushback($_);
1413 4         5 last;
1414             }
1415 76 100 100     600 if (/^Naive Empirical Bayes/i) {
    100 100        
    100 100        
    100          
    100          
    100          
1416 5         13 $type = 'neb';
1417             }
1418             elsif (/^Bayes Empirical Bayes/i) {
1419 2         5 $type = 'beb';
1420             }
1421             elsif (/^Positively selected sites/) {
1422 4         7 $okay = 1;
1423             }
1424             elsif ( $okay
1425             && /^\s+(\d+)\s+(\S+)\s+(\-?\d+(?:\.\d+)?)(\**)\s+(\-?\d+(?:\.\d+)?)\s+\+\-\s+(\-?\d+(?:\.\d+)?)/
1426             )
1427             {
1428 9         12 my $signif = $4;
1429 9 50       13 $signif = '' unless defined $signif;
1430 9         5 push @{ $sites{$type} }, [ $1, $2, $3, $signif, $5, $6 ];
  9         41  
1431             }
1432             elsif ( $okay
1433             && /^\s+(\d+)\s+(\S+)\s+(\-?\d*(?:.\d+))(\**)\s+(\-?\d+(?:\.\d+)?)/
1434             )
1435             {
1436 8         12 my $signif = $4;
1437 8 50       10 $signif = '' unless defined $signif;
1438 8         8 push @{ $sites{$type} }, [ $1, $2, $3, $signif, $5 ];
  8         35  
1439             }
1440             elsif ( $okay && /^\s+(\d+)\s+(\S)\s+([\d\.\-\+]+)(\**)/ ) {
1441 8         14 my $signif = $4;
1442 8 50       12 $signif = '' unless defined $signif;
1443 8         5 push @{ $sites{$type} }, [ $1, $2, $3, $signif ];
  8         52  
1444             }
1445             }
1446 6         20 return ( $sites{'default'}, $sites{'neb'}, $sites{'beb'} );
1447             }
1448              
1449             sub _parse_branch_dnds {
1450 10     10   13 my $self = shift;
1451 10         19 my ($okay) = (0);
1452 10         11 my %branch_dnds;
1453             my @header;
1454 10         21 while ( defined( $_ = $self->_readline ) ) {
1455 136 100       299 next if (/^\s+$/);
1456 98 50 66     177 next unless ( $okay || /^\s+branch\s+t/ );
1457 98 100       261 if (/^\s+branch\s+(.+)/) {
    100          
1458 10         32 s/^\s+//;
1459 10         58 @header = split( /\s+/, $_ );
1460 10         21 $okay = 1;
1461             }
1462             elsif (/^\s*(\d+\.\.\d+)/) {
1463 78         89 my $branch = $1;
1464 78         125 s/^\s+//;
1465 78         64 my $i = 0;
1466              
1467             # fancyness just maps the header names like 't' or 'dN'
1468             # into the hash so we get at the end of the day
1469             # 't' => 0.067
1470             # 'dN'=> 0.001
1471 78         153 $branch_dnds{$branch} = { map { $header[ $i++ ] => $_ } split };
  702         1062  
1472             }
1473             else {
1474 10         21 $self->_pushback($_);
1475 10         15 last;
1476             }
1477             }
1478 10         59 return %branch_dnds;
1479             }
1480              
1481             #baseml stuff
1482             sub _parse_nt_freqs {
1483 2     2   4 my ($self) = @_;
1484 2         4 my ( $okay, $done, $header ) = ( 0, 0, 0 );
1485 2         3 my (@bases);
1486 2 50       4 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  2         8  
1487 2         6 while ( defined( $_ = $self->_readline ) ) {
1488 28 50 33     83 if ( /^TREE/ || /^Distances/ ) { $self->_pushback($_); last }
  0         0  
  0         0  
1489 28 100       39 last if ($done);
1490 26 100 66     86 next if ( /^\s+$/ || /^\(Ambiguity/ );
1491 16 100       63 if (/^Frequencies\./) {
    50          
    100          
    100          
    100          
1492 2         4 $okay = 1;
1493             }
1494             elsif ( !$okay ) { # skip till we see 'Frequencies.
1495 0         0 next;
1496             }
1497             elsif ( !$header ) {
1498 2         8 s/^\s+//; # remove leading whitespace
1499 2         6 @bases = split; # get an array of the all the aa names
1500 2         3 $header = 1;
1501 2         4 $self->{'_summary'}->{'ntfreqs'} = {}; # reset/clear values
1502 2         4 next;
1503             }
1504             elsif (
1505             /^\#\s+constant\s+sites\:\s+
1506             (\d+)\s+ # constant sites
1507             \(\s*([\d\.]+)\s*\%\s*\)/ox
1508             )
1509             {
1510 2         7 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
1511 2         7 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
1512             }
1513             elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/ox) {
1514 2         5 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
1515 2         2 $done = 1; # done for sure
1516             }
1517             else {
1518 8         19 my ( $seqname, @freqs ) = split;
1519 8         9 my $basect = 0;
1520 8         9 foreach my $f (@freqs) {
1521              
1522             # this will also store 'Average'
1523             $self->{'_summary'}->{'ntfreqs'}->{$seqname}
1524 32         54 ->{ $bases[ $basect++ ] } = $f;
1525             }
1526             }
1527             }
1528             }
1529              
1530             sub _parse_nt_dists {
1531 2     2   3 my ($self) = @_;
1532 2         2 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
1533 2         4 my ( %matrix, @names );
1534 2 50       3 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  2         7  
1535 2         2 my $type = '';
1536 2         6 while ( defined( $_ = $self->_readline ) ) {
1537 13 50       18 if (/^TREE/) { $self->_pushback($_); last; }
  0         0  
  0         0  
1538 13 100       17 last if $done;
1539 12 100       21 next if (/^This matrix is not used in later/);
1540 10 100       23 if (/^\s+$/) {
1541 2 50       7 last if ($seen);
1542 2         4 next;
1543             }
1544 8 100       18 if (/^Distances:(\S+)\s+\(([^\)]+)\)\s+\(alpha set at (\-?\d+\.\d+)\)/)
1545             {
1546 2         2 $okay = 1;
1547 2         3 $type = $1;
1548 2         4 next;
1549             }
1550 6         19 s/\s+$//g; # remove trailing space
1551 6 50       9 if ($okay) {
1552 6         14 my ( $seqname, $vl ) = split( /\s+/, $_, 2 );
1553 6         5 $seen = 1;
1554 6         4 my $i = 0;
1555 6 100       14 if ( defined $vl ) {
1556 4         18 while ( $vl =~ /(\-?\d+\.\d+)\s*\(\s*(\-?\d+\.\d+)\s*\)\s*/g ) {
1557 6         8 my ( $kappa, $alpha ) = ( $1, $2 );
1558             $matrix{$seqname}{ $names[$i] } =
1559 6         17 $matrix{ $names[$i] }{$seqname} = [ $kappa, $alpha ];
1560              
1561 6         14 $i++;
1562             }
1563 4 50       7 unless ($i) {
1564 0         0 $self->warn("no matches for $vl\n");
1565             }
1566             }
1567 6         4 push @names, $seqname;
1568 6         14 $matrix{$seqname}->{$seqname} = [ 0, 0 ];
1569             }
1570 6 100       16 $done = 1 if ( scalar @names == $numseqs );
1571             }
1572 2         5 my %dist;
1573 2         2 my $i = 0;
1574 2         6 my ( @kvalues, @avalues );
1575 2         2 foreach my $lname (@names) {
1576 6         7 my ( @arow, @krow );
1577 6         3 my $j = 0;
1578 6         6 foreach my $rname (@names) {
1579 18         24 my $v = $matrix{$lname}{$rname};
1580              
1581 18         16 push @krow, $v->[0]; # kappa values
1582 18         13 push @arow, $v->[1]; # alpha
1583 18         27 $dist{$lname}{$rname} = [ $i, $j++ ];
1584             }
1585 6         3 $i++;
1586 6         7 push @kvalues, \@krow;
1587 6         7 push @avalues, \@arow;
1588             }
1589             return (
1590             Bio::Matrix::PhylipDist->new(
1591             -program => $self->{'_summary'}->{'seqtype'},
1592             -matrix => \%dist,
1593             -names => \@names,
1594             -values => \@kvalues
1595             ),
1596             Bio::Matrix::PhylipDist->new(
1597 2         22 -program => $self->{'_summary'}->{'seqtype'},
1598             -matrix => \%dist,
1599             -names => \@names,
1600             -values => \@avalues
1601             )
1602             );
1603             }
1604              
1605             # BASEML
1606             sub _parse_rate_parametes {
1607 0     0   0 my $self = shift;
1608 0         0 my (%rate_parameters);
1609 0         0 while ( defined( $_ = $self->_readline ) ) {
1610 0 0       0 if (/^Rate\s+parameters:\s+/) {
    0          
    0          
    0          
    0          
1611 0         0 s/\s+$//;
1612 0         0 $rate_parameters{'rate_parameters'} = [ split( /\s+/, $_ ) ];
1613             }
1614             elsif (/^Base\s+frequencies:\s+/) {
1615 0         0 s/\s+$//;
1616 0         0 $rate_parameters{'base_frequencies'} = [ split( /\s+/, $_ ) ];
1617             }
1618             elsif (
1619             m/^Rate\s+matrix\s+Q,\s+Average\s+Ts\/Tv\s+(\([^\)+]+\))?\s*\=\s+
1620             (\-?\d+\.\d+)/x
1621             )
1622             {
1623 0         0 $rate_parameters{'average_TsTv'} = $1;
1624 0         0 while ( defined( $_ = $self->_readline ) ) {
1625              
1626             # short circuit
1627 0 0       0 last if (/^\s+$/);
1628 0 0       0 if (/^alpha/) {
1629 0         0 $self->_pushback($_);
1630 0         0 last;
1631             }
1632 0         0 s/^\s+//;
1633 0         0 s/\s+$//;
1634 0         0 push @{ $rate_parameters{'rate_matrix_Q'} }, [split];
  0         0  
1635             }
1636             }
1637             elsif (/^alpha\s+\(gamma,\s+K=\s*(\d+)\s*\)\s*\=\s*(\-?\d+\.\d+)/) {
1638 0         0 $rate_parameters{'K'} = $1;
1639 0         0 $rate_parameters{'alpha'} = $2;
1640             }
1641             elsif (s/^(r|f):\s+//) {
1642 0         0 my ($p) = $1;
1643 0         0 s/\s+$//;
1644 0         0 $rate_parameters{$p} = [split];
1645             }
1646             }
1647             }
1648              
1649             # RST parsing
1650             sub _parse_rst {
1651 20     20   22 my ($self) = @_;
1652 20 50 66     93 return unless $self->{'_dir'} && -d $self->{'_dir'} && -r $self->{'_dir'};
      66        
1653              
1654 1         11 my $rstfile = File::Spec->catfile( $self->{'_dir'}, $RSTFILENAME );
1655 1 50 33     29 return unless -e $rstfile && !-z $rstfile;
1656 1         7 my $rstio = Bio::Root::IO->new( -file => $rstfile );
1657              
1658             # define whatever data structures you need to store the data
1659             # key points are to reuse existing bioperl objs (like Bio::Seq)
1660             # where appropriate
1661 1         2 my ( @firstseq, @seqs, @trees, @per_site_prob );
1662 0         0 my $count;
1663 1         3 while ( defined( $_ = $rstio->_readline ) ) {
1664              
1665             # implement the parsing here
1666 616 100 66     2718 if (/^TREE\s+\#\s+(\d+)/) {
    100          
    100          
    50          
    100          
1667 1         3 while ( defined( $_ = $rstio->_readline ) ) {
1668 10 100       24 if (/tree\s+with\s+node\s+labels\s+for/) {
1669 1         4 my $tree = Bio::TreeIO->new(
1670             -noclose => 1,
1671             -fh => $rstio->_fh,
1672             -format => 'newick'
1673             )->next_tree;
1674              
1675             # cleanup leading/trailing whitespace
1676 1         4 for my $n ( $tree->get_nodes ) {
1677 12         16 my $id = $n->id;
1678 12         15 $id =~ s/^\s+//;
1679 12         21 $id =~ s/\s+$//;
1680 12         16 $n->id($id);
1681 12 100       14 if ( defined( my $blen = $n->branch_length ) ) {
1682 11         12 $blen =~ s/^\s+//;
1683 11         10 $blen =~ s/\s+$//;
1684 11         12 $n->branch_length($blen);
1685             }
1686             }
1687 1         2 push @trees, $tree;
1688 1         4 last;
1689             }
1690             }
1691             }
1692             elsif (/^Prob\sof\sbest\scharacter\sat\seach\snode,\slisted\sby\ssite/)
1693             {
1694 1         10 $self->{'_rst'}->{'persite'} = [];
1695 1         3 while ( defined( $_ = $rstio->_readline ) ) {
1696 135 100 100     517 next if ( /^Site/i || /^\s+$/ );
1697 131 100       484 if (s/^\s+(\d+)\s+(\d+)\s+([^:]+)\s*:\s*(.+)//) {
    50          
1698 130         241 my ( $sitenum, $freq, $extant, $ancestral ) =
1699             ( $1, $2, $3, $4 );
1700 130         82 my ( @anc_site, @extant_site );
1701 130         136 @extant_site = {};
1702 130         370 while ( $extant =~ s/^([A-Z\-]{3})\s+\(([A-Z*])\)\s+//g ) {
1703 910         3192 push @extant_site, { 'codon' => $1, 'aa' => $2 };
1704             }
1705 130         403 while (
1706             $ancestral =~ s/^([A-Z\-]{3})\s+([A-Z*])\s+ # codon AA
1707             (\S+)\s+ # Prob
1708             \(([A-Z*])\s+(\S+)\)\s*//xg # AA Prob
1709             )
1710             {
1711 650         3488 push @anc_site,
1712             {
1713             'codon' => $1,
1714             'aa' => $2,
1715             'prob' => $3,
1716             'Yang95_aa' => $4,
1717             'Yang95_aa_prob' => $5
1718             };
1719             }
1720              
1721             # saving persite
1722 130         531 $self->{'_rst'}->{'persite'}->[$sitenum] =
1723             [ @extant_site, @anc_site ];
1724             }
1725             elsif (/^Summary\sof\schanges\salong\sbranches\./) {
1726 1         3 last;
1727             }
1728             }
1729             }
1730             elsif (/^Check\sroot\sfor\sdirections\sof\schange\./
1731             || /^Summary\sof\schanges\salong\sbranches\./ )
1732             {
1733 1         1 my ( @branches, @branch2node, $branch, $node );
1734 1         2 my $tree = $trees[-1];
1735 1 50       3 if ( !$tree ) {
1736 0         0 $self->warn("No tree built before parsing Branch changes\n");
1737 0         0 last;
1738             }
1739             my @nodes = (
1740 12         10 map { $_->[0] }
1741 30         24 sort { $a->[1] <=> $b->[1] }
1742 1         3 map { [ $_, $_->id =~ /^(\d+)\_?/ ] } $tree->get_nodes
  12         18  
1743             );
1744 1         4 unshift @nodes,
1745             undef; # fake first node so that index will match nodeid
1746 1         4 while ( defined( $_ = $rstio->_readline ) ) {
1747 93 100       215 next if /^\s+$/;
1748 57 100       207 if (m/^List\sof\sextant\sand\sreconstructed\ssequences/) {
    100          
    50          
1749 1         8 $rstio->_pushback($_);
1750 1         3 last;
1751             }
1752             elsif (/^Branch\s+(\d+):\s+(\d+)\.\.(\d+)\s+/) {
1753 11         7 my ( $left, $right );
1754 11         20 ( $branch, $left, $right ) = ( $1, $2, $3 );
1755 11         16 ($node) = $nodes[$right];
1756 11 50       13 if ( !$node ) {
1757 0         0 $self->warn(
1758             "cannot find $right in $tree ($branch $left..$right)\n"
1759             );
1760 0         0 last;
1761             }
1762 11 50       35 if (/\(n=\s*(\S+)\s+s=\s*(\S+)\)/) {
1763 11         18 $node->add_tag_value( 'n', $1 );
1764 11         14 $node->add_tag_value( 's', $2 );
1765             }
1766 11         26 $branch2node[$branch] = $right;
1767             }
1768             elsif (
1769             /^\s+(\d+)\s+([A-Z*])\s+(\S+)\s+\-\>\s+([A-Z*])\s+(\S+)?/)
1770             {
1771 45         82 my ( $site, $anc, $aprob, $derived, $dprob ) =
1772             ( $1, $2, $3, $4, $5 );
1773 45 50       58 if ( !$node ) {
1774 0         0 $self->warn("no branch line was previously parsed!");
1775 0         0 next;
1776             }
1777 45         110 my %c = (
1778             'site' => $site,
1779             'anc_aa' => $anc,
1780             'anc_prob' => $aprob,
1781             'derived_aa' => $derived,
1782             );
1783 45 100       67 $c{'derived_prob'} = $dprob if defined $dprob;
1784 45         71 $node->add_tag_value( 'changes', \%c );
1785             }
1786             }
1787             }
1788             elsif (
1789             /^Overall\s+accuracy\s+of\s+the\s+(\d+)\s+ancestral\s+sequences:/)
1790             {
1791 0         0 my $line = $rstio->_readline;
1792 0         0 $line =~ s/^\s+//;
1793 0         0 $line =~ s/\s+$//;
1794 0         0 my @overall_site = split( /\s+/, $line );
1795              
1796             # skip next 2 lines, want the third
1797 0         0 for ( 1 .. 3 ) {
1798 0         0 $line = $rstio->_readline;
1799             }
1800 0         0 $line =~ s/^\s+//;
1801 0         0 $line =~ s/\s+$//;
1802 0         0 my @overall_seq = split( /\s+/, $line );
1803 0 0 0     0 if ( @overall_seq != @overall_site
1804             || @overall_seq != @seqs )
1805             {
1806 0         0 $self->warn(
1807             "out of sync somehow seqs, site scores don't match\n");
1808 0         0 $self->warn("@seqs @overall_seq @overall_site\n");
1809             }
1810 0         0 for (@seqs) {
1811 0         0 $_->description(
1812             sprintf(
1813             "overall_accuracy_site=%s overall_accuracy_seq=%s",
1814             shift @overall_site,
1815             shift @overall_seq
1816             )
1817             );
1818             }
1819             }
1820             elsif (m/^List of extant and reconstructed sequences/o) {
1821 3         5 my $seqcount = 0;
1822 3         9 while ( defined( $_ = $rstio->_readline ) ) {
1823 311 100       374 last if (/^Overall accuracy of the/);
1824 309 100       504 if (/^\s+$/) {
1825 61 50 33     80 last if $seqcount && $seqcount == @seqs;
1826 61         86 next;
1827             }
1828 248 50       474 if (/^\s*(\d+)\s+(\d+)\s+$/) { $seqcount = $1; next }
  0         0  
  0         0  
1829              
1830             # runmode = (0)
1831             # this should in fact be packed into a Bio::SimpleAlign object
1832             # instead of an array but we'll stay with this for now
1833 248 100       442 if (/^node/) {
1834 20         65 my ( $name, $num, $seqstr ) = split( /\s+/, $_, 3 );
1835 20         21 $name .= $num;
1836 20         969 $seqstr =~ s/\s+//g; # remove whitespace
1837 20 100       31 unless (@firstseq) {
1838 1         49 @firstseq = split( //, $seqstr );
1839 1         13 push @seqs,
1840             Bio::LocatableSeq->new(
1841             -display_id => $name,
1842             -seq => $seqstr
1843             );
1844             }
1845             else {
1846 19         19 my $i = 0;
1847 19         8 my $v;
1848 19         50 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
1849              
1850             # replace the '.' with the correct seq from the
1851 0         0 substr( $seqstr, $v, 1, $firstseq[$v] );
1852 0         0 $i = $v;
1853             }
1854 19         63 $self->debug("adding seq $seqstr\n");
1855 19         65 push @seqs,
1856             Bio::LocatableSeq->new(
1857             -display_id => $name,
1858             -seq => $seqstr
1859             );
1860             }
1861             }
1862             }
1863 3         11 $self->{'_rst'}->{'rctrted_seqs'} = \@seqs;
1864             }
1865             else {
1866             }
1867             }
1868 1         4 $self->{'_rst'}->{'trees'} = \@trees;
1869 1         23 return;
1870             }
1871              
1872             1;