File Coverage

Bio/Tools/Phylo/PAML.pm
Criterion Covered Total %
statement 732 841 87.0
branch 360 456 78.9
condition 109 161 67.7
subroutine 32 34 94.1
pod 2 2 100.0
total 1235 1494 82.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Phylo::PAML
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich, Aaron J Mackey
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Phylo::PAML - Parses output from the PAML programs codeml,
17             baseml, basemlg, codemlsites and yn00
18              
19             =head1 SYNOPSIS
20              
21             #!/usr/bin/perl -Tw
22             use strict;
23              
24             use Bio::Tools::Phylo::PAML;
25              
26             # need to specify the output file name (or a fh) (defaults to
27             # -file => "codeml.mlc"); also, optionally, the directory in which
28             # the other result files (rst, 2ML.dS, etc) may be found (defaults
29             # to "./")
30             my $parser = Bio::Tools::Phylo::PAML->new
31             (-file => "./results/mlc", -dir => "./results/");
32              
33             # get the first/next result; a Bio::Tools::Phylo::PAML::Result object,
34             # which isa Bio::SeqAnalysisResultI object.
35             my $result = $parser->next_result();
36              
37             # get the sequences used in the analysis; returns Bio::PrimarySeq
38             # objects (OTU = Operational Taxonomic Unit).
39             my @otus = $result->get_seqs();
40              
41             # codon summary: codon usage of each sequence [ arrayref of {
42             # hashref of counts for each codon } for each sequence and the
43             # overall sum ], and positional nucleotide distribution [ arrayref
44             # of { hashref of frequencies for each nucleotide } for each
45             # sequence and overall frequencies ]:
46             my ($codonusage, $ntdist) = $result->get_codon_summary();
47              
48             # example manipulations of $codonusage and $ntdist:
49             printf "There were %d %s codons in the first seq (%s)\n",
50             $codonusage->[0]->{AAA}, 'AAA', $otus[0]->id();
51             printf "There were %d %s codons used in all the sequences\n",
52             $codonusage->[$#{$codonusage}]->{AAA}, 'AAA';
53             printf "Nucleotide %c was present %g of the time in seq %s\n",
54             'A', $ntdist->[1]->{A}, $otus[1]->id();
55              
56             # get Nei & Gojobori dN/dS matrix:
57             my $NGmatrix = $result->get_NGmatrix();
58              
59             # get ML-estimated dN/dS matrix, if calculated; this corresponds to
60             # the runmode = -2, pairwise comparison usage of codeml
61             my $MLmatrix = $result->get_MLmatrix();
62              
63             # These matrices are length(@otu) x length(@otu) "strict lower
64             # triangle" 2D-matrices, which means that the diagonal and
65             # everything above it is undefined. Each of the defined cells is a
66             # hashref of estimates for "dN", "dS", "omega" (dN/dS ratio), "t",
67             # "S" and "N". If a ML matrix, "lnL" and "kappa" will also be defined.
68             printf "The omega ratio for sequences %s vs %s was: %g\n",
69             $otus[0]->id, $otus[1]->id, $MLmatrix->[0]->[1]->{omega};
70              
71             # with a little work, these matrices could also be passed to
72             # Bio::Tools::Run::Phylip::Neighbor, or other similar tree-building
73             # method that accepts a matrix of "distances" (using the LOWTRI
74             # option):
75             my $distmat = [ map { [ map { $$_{omega} } @$_ ] } @$MLmatrix ];
76              
77             # for runmode's other than -2, get tree topology with estimated
78             # branch lengths; returns a Bio::Tree::TreeI-based tree object with
79             # added PAML parameters at each node
80             my ($tree) = $result->get_trees();
81             for my $node ($tree->get_nodes()) {
82             # inspect the tree: the "t" (time) parameter is available via
83             # $node->branch_length(); all other branch-specific parameters
84             # ("omega", "dN", etc.) are available via
85             # ($omega) = $node->get_tag_values('omega');
86             }
87              
88             # if you are using model based Codeml then trees are stored in each
89             # modelresult object
90             for my $modelresult ( $result->get_NSSite_results ) {
91             # model M0, M1, etc
92             print "model is ", $modelresult->model_num, "\n";
93             my ($tree) = $modelresult->get_trees();
94             for my $node ($tree->get_nodes()) {
95             # inspect the tree: the "t" (time) parameter is available via
96             # $node->branch_length(); all other branch-specific parameters
97             # ("omega", "dN", etc.) are available via
98             # ($omega) = $node->get_tag_values('omega');
99             }
100             }
101              
102             # get any general model parameters: kappa (the
103             # transition/transversion ratio), NSsites model parameters ("p0",
104             # "p1", "w0", "w1", etc.), etc.
105             my $params = $result->get_model_params();
106             printf "M1 params: p0 = %g\tp1 = %g\n", $params->{p0}, $params->{p1};
107              
108             # parse AAML result files
109             my $aamat = $result->get_AADistMatrix();
110             my $aaMLmat = $result->get_AAMLDistMatrix();
111              
112             =head1 DESCRIPTION
113              
114             This module is used to parse the output from the PAML programs codeml,
115             baseml, basemlg, codemlsites and yn00. You can use the
116             Bio::Tools::Run::Phylo::PAML::* modules to actually run some of the
117             PAML programs, but this module is only useful to parse the output.
118              
119             This module has fledgling support for PAML version 4.3a (October 2009).
120             Please report any problems to the mailing list (see FEEDBACK below).
121              
122             =head1 TO DO
123              
124             Implement get_posteriors(). For NSsites models, obtain arrayrefs of
125             posterior probabilities for membership in each class for every
126             position; probabilities correspond to classes w0, w1, ... etc.
127              
128             my @probs = $result->get_posteriors();
129              
130             # find, say, positively selected sites!
131             if ($params->{w2} > 1) {
132             for (my $i = 0; $i < @probs ; $i++) {
133             if ($probs[$i]->[2] > 0.5) {
134             # assumes model M1: three w's, w0, w1 and w2 (positive selection)
135             printf "position %d: (%g prob, %g omega, %g mean w)\n",
136             $i, $probs[$i]->[2], $params->{w2}, $probs[$i]->[3];
137             }
138             }
139             } else { print "No positive selection found!\n"; }
140              
141             =head1 FEEDBACK
142              
143             =head2 Mailing Lists
144              
145             User feedback is an integral part of the evolution of this and other
146             Bioperl modules. Send your comments and suggestions preferably to
147             the Bioperl mailing list. Your participation is much appreciated.
148              
149             bioperl-l@bioperl.org - General discussion
150             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
151              
152             =head2 Support
153              
154             Please direct usage questions or support issues to the mailing list:
155              
156             I
157              
158             rather than to the module maintainer directly. Many experienced and
159             reponsive experts will be able look at the problem and quickly
160             address it. Please include a thorough description of the problem
161             with code and data examples if at all possible.
162              
163             =head2 Reporting Bugs
164              
165             Report bugs to the Bioperl bug tracking system to help us keep track
166             of the bugs and their resolution. Bug reports can be submitted via the
167             web:
168              
169             https://github.com/bioperl/bioperl-live/issues
170              
171             =head1 AUTHOR - Jason Stajich, Aaron Mackey
172              
173             Email jason-at-bioperl.org
174             Email amackey-at-virginia.edu
175              
176             =head1 CONTRIBUTORS
177              
178             Albert Vilella avilella-AT-gmail-DOT-com
179             Sendu Bala bix@sendu.me.uk
180             Dave Messina dmessina@cpan.org
181              
182             =head1 TODO
183              
184             RST parsing -- done, Avilella contributions bug#1506, added by jason 1.29
185             -- still need to parse in joint probability and non-syn changes
186             at site table
187              
188             =head1 APPENDIX
189              
190             The rest of the documentation details each of the object methods.
191             Internal methods are usually preceded with a _
192              
193             =cut
194              
195             # Let the code begin...
196              
197             package Bio::Tools::Phylo::PAML;
198 1     1   594 use vars qw($RSTFILENAME);
  1         1  
  1         34  
199 1     1   3 use strict;
  1         1  
  1         18  
200              
201             # Object preamble - inherits from Bio::Root::Root
202              
203 1     1   2 use base qw(Bio::Root::Root Bio::Root::IO Bio::AnalysisParserI);
  1         8  
  1         280  
204              
205             BEGIN {
206 1     1   14 $RSTFILENAME = 'rst'; # where to get the RST data from
207             }
208              
209             # other objects used:
210 1     1   4 use IO::String;
  1         1  
  1         15  
211 1     1   3 use File::Spec;
  1         1  
  1         11  
212 1     1   219 use Bio::TreeIO;
  1         1  
  1         21  
213 1     1   352 use Bio::Tools::Phylo::PAML::Result;
  1         1  
  1         24  
214 1     1   277 use Bio::LocatableSeq;
  1         2  
  1         22  
215 1     1   4 use Bio::PrimarySeq;
  1         1  
  1         15  
216 1     1   265 use Bio::Matrix::PhylipDist;
  1         1  
  1         21  
217 1     1   325 use Bio::Tools::Phylo::PAML::ModelResult;
  1         740  
  1         7999  
218              
219             =head2 new
220              
221             Title : new
222             Usage : my $obj = Bio::Tools::Phylo::PAML->new(%args);
223             Function: Builds a new Bio::Tools::Phylo::PAML object
224             Returns : Bio::Tools::Phylo::PAML
225             Args : Hash of options: -file, -fh, -dir
226             -file (or -fh) should contain the contents of the PAML
227             outfile;
228             -dir is the (optional) name of the directory in
229             which the PAML program was run (and includes other
230             PAML-generated files from which we can try to gather data)
231              
232             =cut
233              
234             sub new {
235              
236 20     20 1 42 my ( $class, @args ) = @_;
237              
238 20         52 my $self = $class->SUPER::new(@args);
239 20         56 $self->_initialize_io(@args);
240 20         74 my ($dir) = $self->_rearrange( [qw(DIR)], @args );
241 20 100       49 $self->{_dir} = $dir if defined $dir;
242 20         46 return $self;
243             }
244              
245             =head2 Implement Bio::AnalysisParserI interface
246              
247             =cut
248              
249             =head2 next_result
250              
251             Title : next_result
252             Usage : $result = $obj->next_result();
253             Function: Returns the next result available from the input, or
254             undef if there are no more results.
255             Example :
256             Returns : a Bio::Tools::Phylo::PAML::Result object
257             Args : none
258              
259             =cut
260              
261             sub next_result {
262              
263 20     20 1 2219 my ($self) = @_;
264 20         24 my %data;
265              
266             # parse the RST file, if it doesn't exist or if dir is not set
267             # this will just skip the parsing
268 20         39 $self->_parse_rst();
269 20         20 my $idlookup; # a hashreference to SEQID (number) ==> 'SEQUENCENAME'
270             # get the various codon and other sequence summary data, if necessary:
271             $self->_parse_summary
272 20 50 33     65 unless ( $self->{'_summary'} && !$self->{'_summary'}->{'multidata'} );
273              
274             # OK, depending on seqtype and runmode now, one of a few things can happen:
275 20         31 my $seqtype = $self->{'_summary'}->{'seqtype'};
276 20 100 100     63 if ( $seqtype eq 'CODONML' || $seqtype eq 'AAML' ) {
    100          
    50          
277 16         15 my $has_model_line = 0;
278 16         32 while ( defined( $_ = $self->_readline ) ) {
279 73 100 100     704 if ( $seqtype eq 'CODONML'
    100 100        
    100 100        
    100 100        
    100 100        
    50 66        
    50          
    50          
280             && m/^pairwise comparison, codon frequencies:/ )
281             {
282              
283             # runmode = -2, CODONML
284 8         24 $self->debug("pairwise Ka/Ks\n");
285 8         15 $self->_pushback($_);
286 8         14 %data = $self->_parse_PairwiseCodon;
287 8         11 last;
288             }
289             elsif ( $seqtype eq 'AAML' && m/^ML distances of aa seqs\.$/ ) {
290 1         3 $self->_pushback($_);
291              
292             # get AA distances
293 1         2 %data = ( '-AAMLdistmat' => $self->_parse_aa_dists() );
294              
295             # $self->_pushback($_);
296             # %data = $self->_parse_PairwiseAA;
297             # last;
298             }
299             elsif (
300             m/^Model\s+(\d+)/
301             || ( ( !$has_model_line && m/^TREE/ )
302             && $seqtype eq 'CODONML'
303             && ($self->{'_summary'}->{'version'} !~ /4/))
304             # last bit to keep PAML >= 4 from being caught here
305             # bug 2482. Not sure this is the right fix, but tests
306             # pass and the bug's test case passes.
307             )
308             {
309 9         20 $self->_pushback($_);
310 9         20 my $model = $self->_parse_NSsitesBatch;
311 9         14 push @{ $data{'-NSsitesresults'} }, $model;
  9         19  
312 9         22 $has_model_line = 1;
313             }
314             elsif (m/for each branch/) {
315 2         7 my %branch_dnds = $self->_parse_branch_dnds;
316 2 50       5 if ( !defined $data{'-trees'} ) {
317 0         0 $self->warn(
318             "No trees have been loaded, can't do anything\n");
319 0         0 next;
320             }
321 2         3 my ($tree) = @{ $data{'-trees'} };
  2         3  
322 2 50 33     17 if ( !$tree
      33        
323             || !ref($tree)
324             || !$tree->isa('Bio::Tree::Tree') )
325             {
326 0         0 $self->warn("no tree object already stored!\n");
327 0         0 next;
328             }
329              
330             # These need to be added to the Node/branches
331 2         5 while ( my ( $k, $v ) = each %branch_dnds ) {
332              
333             # we can probably do better by caching at some point
334 22         15 my @nodes;
335 22         44 for my $id ( split( /\.\./, $k ) ) {
336             my @nodes_L =
337 98         200 map { $tree->find_node( -id => $_ ) }
338 44         36 @{ $idlookup->{$id} };
  44         55  
339 44 100       91 my $n =
340             @nodes_L < 2
341             ? shift(@nodes_L)
342             : $tree->get_lca(@nodes_L);
343 44 50       70 if ( !$n ) {
344 0         0 $self->warn("no node for $n\n");
345             }
346 44 100 66     61 unless ( $n->is_Leaf && $n->id ) {
347 30         39 $n->id($id);
348             }
349 44         64 push @nodes, $n;
350             }
351 22         23 my ( $parent, $child ) = @nodes;
352 22         67 while ( my ( $kk, $vv ) = each %$v ) {
353 198         217 $child->add_tag_value( $kk, $vv );
354             }
355             }
356             }
357             elsif (m/^TREE/) {
358              
359             # runmode = 0
360 3         6 $self->_pushback($_);
361 3         10 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
362              
363             #last;
364             }
365             elsif (m/Heuristic tree search by stepwise addition$/) {
366              
367             # runmode = 3
368 0         0 $self->throw(
369             -class => 'Bio::Root::NotImplemented',
370             -text => "StepwiseAddition not yet implemented!"
371             );
372              
373             # $self->_pushback($_);
374             # %data = $self->_parse_StepwiseAddition;
375             # last;
376              
377             }
378             elsif (m/Heuristic tree search by NNI perturbation$/) {
379              
380             # runmode = 4
381 0         0 $self->throw(
382             -class => 'Bio::Root::NotImplemented',
383             -text => "NNI Perturbation not yet implemented!"
384             );
385              
386             # $self->_pushback($_);
387             # %data = $self->_parse_Perturbation;
388             # last;
389              
390             }
391             elsif (m/^stage 0:/) {
392              
393             # runmode = (1 or 2)
394 0         0 $self->throw(
395             -class => 'Bio::Root::NotImplemented',
396             -text => "StarDecomposition not yet implemented!"
397             );
398              
399 0         0 $self->_pushback($_);
400 0         0 %data = $self->_parse_StarDecomposition;
401 0         0 last;
402             }
403             }
404             }
405             elsif ( $seqtype eq 'BASEML' ) {
406 2         5 while ( defined( $_ = $self->_readline ) ) {
407 3 100       10 if (/^Distances:/) {
    50          
408 2         5 $self->_pushback($_);
409 2         7 my ( $kappa, $alpha ) = $self->_parse_nt_dists();
410 2         10 %data = (
411             '-kappa_distmat' => $kappa,
412             '-alpha_distmat' => $alpha
413             );
414             }
415             elsif (/^TREE/) {
416 1         3 $self->_pushback($_);
417 1         3 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
418             }
419             }
420             }
421             elsif ( $seqtype eq 'YN00' ) {
422 2         4 while ( $_ = $self->_readline ) {
423 3 100       9 if (
424             m/^Estimation by the method|\(B\) Yang & Nielsen \(2000\) method/
425             )
426             {
427 2         5 $self->_pushback($_);
428 2         5 %data = $self->_parse_YN_Pairwise;
429 2         4 last;
430             }
431             }
432             }
433 20 50       37 if (%data) {
434 20         34 $data{'-version'} = $self->{'_summary'}->{'version'};
435 20         25 $data{'-seqs'} = $self->{'_summary'}->{'seqs'};
436 20         26 $data{'-patterns'} = $self->{'_summary'}->{'patterns'};
437 20         24 $data{'-ngmatrix'} = $self->{'_summary'}->{'ngmatrix'};
438 20         29 $data{'-codonpos'} = $self->{'_summary'}->{'codonposition'};
439 20         26 $data{'-codonfreq'} = $self->{'_summary'}->{'codonfreqs'};
440 20         30 $data{'-model'} = $self->{'_summary'}->{'model'};
441 20         25 $data{'-seqfile'} = $self->{'_summary'}->{'seqfile'};
442 20         24 $data{'-aadistmat'} = $self->{'_summary'}->{'aadistmat'};
443 20         22 $data{'-stats'} = $self->{'_summary'}->{'stats'};
444 20         21 $data{'-aafreq'} = $self->{'_summary'}->{'aafreqs'};
445 20         21 $data{'-ntfreq'} = $self->{'_summary'}->{'ntfreqs'};
446 20         24 $data{'-input_params'} = $self->{'_summary'}->{'inputparams'};
447 20         22 $data{'-rst'} = $self->{'_rst'}->{'rctrted_seqs'};
448 20         23 $data{'-rst_persite'} = $self->{'_rst'}->{'persite'};
449 20         17 $data{'-rst_trees'} = $self->{'_rst'}->{'trees'};
450 20         177 return Bio::Tools::Phylo::PAML::Result->new(%data);
451             }
452             else {
453 0         0 return;
454             }
455             }
456              
457             sub _parse_summary {
458 20     20   16 my ($self) = @_;
459              
460             # Depending on whether verbose > 0 or not, and whether the result
461             # set comes from a multi-data run, the first few lines could be
462             # various things; we're going to throw away any sequence data
463             # here, since we'll get it later anyways
464              
465             # multidata ? : \n\nData set 1\n
466             # verbose ? : cleandata ? : \nBefore deleting alignment gaps. \d sites\n
467             # [ sequence printout ]
468             # \nAfter deleting gaps. \d sites\n"
469             # : [ sequence printout ]
470             # CODONML (in paml 3.12 February 2002) <<-- what we want to see!
471              
472 20         71 my $SEQTYPES = qr( (?: (?: CODON | AA | BASE | CODON2AA ) ML ) | YN00 )x;
473 20         20 my $line;
474 20 50       43 $self->{'_already_parsed_seqs'} = $self->{'_already_parsed_seqs'} ? 1 : 0;
475 20         18 my @lines;
476 20         39 while ( $_ = $self->_readline ) {
477 185         168 push @lines, $_;
478 185 100 100     2490 if (m/^($SEQTYPES) \s+ # seqtype: CODONML, AAML, BASEML, CODON2AAML, YN00, etc
    100 100        
    100 66        
    50          
    100          
    50          
479             (?: \(in \s+ ([^\)]+?) \s* \) \s* )? # version: "paml 3.12 February 2002"; not present < 3.1 or YN00
480             (\S+) \s* # tree filename
481             (?: (.+?) )? # model description (not there in YN00)
482             \s* $ # trim any trailing space
483             /ox
484             )
485             {
486 20         26 @{ $self->{'_summary'} }{qw(seqtype version seqfile model)} =
  20         115  
487             ( $1, $2, $3, $4 );
488              
489             # in 4.3, the model is on its own line
490 20 100       52 if ( !defined $self->{'_summary'}->{'model'} ) {
491 8         15 my $model_line = $self->_readline;
492 8         14 chomp $model_line;
493 8 100       22 if ($model_line =~ /^Model:/) {
494 6         10 $self->{'_summary'}->{'model'} = $model_line;
495             }
496             }
497              
498             defined $self->{'_summary'}->{'model'}
499 20 100       88 && $self->{'_summary'}->{'model'} =~ s/Model:\s+//;
500             $self->_pushback($_)
501 20 50       41 if $self->{'_summary'}->{'seqtype'} eq 'AAMODEL';
502 20         27 last;
503             }
504             elsif ((m/\s+?\d+?\s+?\d+?/) && ( $self->{'_already_parsed_seqs'} != 1 )) {
505 6         13 $self->_parse_seqs;
506             }
507             elsif (m/^Data set \d$/) {
508 1         3 $self->{'_summary'} = {};
509 1         3 $self->{'_summary'}->{'multidata'}++;
510             }
511             elsif (m/^Before\s+deleting\s+alignment\s+gaps/) { #Gap
512 0         0 my ($phylip_header) = $self->_readline;
513 0         0 $self->_parse_seqs;
514             }
515             elsif ( ( @lines >= 3 ) && ( $self->{'_already_parsed_seqs'} != 1 ) )
516             { #No gap
517             # don't start parsing seqs yet if we're on a blank line
518             # (gives another opportunity to match one of the other regexes)
519 5 100       16 unless (/^\n$/) {
520 2         6 $self->_parse_seqs;
521             }
522             }
523             elsif ( (/Printing out site pattern counts/)
524             && ( $self->{'_already_parsed_seqs'} != 1 ) ) {
525 0         0 $self->_parse_patterns;
526             }
527             }
528              
529 20 50       46 unless ( defined $self->{'_summary'}->{'seqtype'} ) {
530 0         0 $self->throw(
531             -class => 'Bio::Root::NotImplemented',
532             -text => 'Unknown format of PAML output did not see seqtype'
533             );
534             }
535 20         32 my $seqtype = $self->{'_summary'}->{'seqtype'};
536 20 100       60 if ( $seqtype eq "CODONML" ) {
    100          
    50          
    100          
    50          
537 14         30 $self->_parse_inputparams(); # settings from the .ctl file
538             # that get printed
539 14         32 $self->_parse_patterns(); # codon patterns - not very interesting
540 14         26 $self->_parse_seqs(); # the sequences data used for analysis
541 14         24 $self->_parse_codoncts(); # counts and distributions of codon/nt
542             # usage
543 14         31 $self->_parse_codon_freqs(); # codon frequencies
544 14         30 $self->_parse_distmat(); # NG distance matrices
545             }
546             elsif ( $seqtype eq "AAML" ) {
547 2         7 $self->_parse_inputparams;
548 2         5 $self->_parse_patterns();
549 2         17 $self->_parse_seqs(); # the sequences data used for analysis
550 2         6 $self->_parse_aa_freqs(); # AA frequencies
551             # get AA distances
552 2         6 $self->{'_summary'}->{'aadistmat'} = $self->_parse_aa_dists();
553              
554             }
555             elsif ( $seqtype eq "CODON2AAML" ) {
556 0         0 $self->throw(
557             -class => 'Bio::Root::NotImplemented',
558             -text => 'CODON2AAML parsing not yet implemented!'
559             );
560             }
561             elsif ( $seqtype eq "BASEML" ) {
562 2         5 $self->_parse_patterns();
563 2         5 $self->_parse_seqs();
564 2         6 $self->_parse_nt_freqs();
565              
566             }
567             elsif ( $seqtype eq "YN00" ) {
568 2         6 $self->_parse_codon_freqs();
569 2         5 $self->_parse_codoncts();
570 2         5 $self->_parse_distmat(); # NG distance matrices
571             }
572             else {
573 0         0 $self->throw(
574             -class => 'Bio::Root::NotImplemented',
575             -text => 'Unknown seqtype, not yet implemented!',
576             -value => $seqtype
577             );
578             }
579              
580             }
581              
582             sub _parse_inputparams {
583 16     16   19 my ($self) = @_;
584 16         30 while ( defined( $_ = $self->_readline ) ) {
585 39 100 66     213 if (/^((?:Codon frequencies)|(?:Site-class models))\s*:\s+(.+)/) {
    100          
    100          
586 10         21 my ( $param, $val ) = ( $1, $2 );
587 10         32 $self->{'_summary'}->{'inputparams'}->{$param} = $val;
588             }
589             elsif (/^\s+$/) {
590 4         6 next;
591             }
592             elsif ( /^ns\s+=\s+/ || /^Frequencies/ ) {
593 16         38 $self->_pushback($_);
594 16         21 last;
595             }
596             }
597             }
598              
599             sub _parse_codon_freqs {
600 16     16   14 my ($self) = @_;
601 16         18 my ( $okay, $done ) = ( 0, 0 );
602              
603 16         32 while ( defined( $_ = $self->_readline ) ) {
604 1495 100       8945 if (/^Nei|\(A\) Nei/) { $self->_pushback($_); last }
  2         15  
  2         3  
605 1493 100       1670 last if ($done);
606 1479 100       2491 next if (/^\s+/);
607             next
608 922 100 100     2778 unless ( $okay || /^Codon position x base \(3x4\) table\, overall/ );
609 56         58 $okay = 1;
610 56 100       174 if (s/^position\s+(\d+):\s+//) {
611 42         60 my $pos = $1;
612 42         108 s/\s+$//;
613 42         86 my @bases = split;
614 42         54 foreach my $str (@bases) {
615 168         229 my ( $base, $freq ) = split( /:/, $str, 2 );
616 168         326 $self->{'_summary'}->{'codonposition'}->[ $pos - 1 ]->{$base} =
617             $freq;
618             }
619 42 100       122 $done = 1 if $pos == 3;
620             }
621             }
622 16         14 $done = 0;
623 16         30 while ( defined( $_ = $self->_readline ) ) {
624 37 100       96 if (/^Nei\s\&\sGojobori|\(A\)\sNei-Gojobori/) {
625 8         16 $self->_pushback($_);
626 8         9 last;
627             }
628 29 100       43 last if ($done);
629 21 100       55 if (/^Codon frequencies under model, for use in evolver/) {
630 8         15 while ( defined( $_ = $self->_readline ) ) {
631 136 100       261 last if (/^\s+$/);
632 128         206 s/^\s+//;
633 128         257 s/\s+$//;
634 128         91 push @{ $self->{'_summary'}->{'codonfreqs'} }, [split];
  128         437  
635             }
636 8         15 $done = 1;
637             }
638             }
639             }
640              
641             sub _parse_aa_freqs {
642 2     2   3 my ($self) = @_;
643 2         4 my ( $okay, $done, $header ) = ( 0, 0, 0 );
644 2         3 my (@bases);
645 2 50       1 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  2         18  
646 2         5 while ( defined( $_ = $self->_readline ) ) {
647 34 100 66     92 if ( /^TREE/ || /^AA distances/ ) {
648 1         2 $self->_pushback($_);
649 1         2 last;
650             }
651 33 100       39 last if ($done);
652 32 100 100     108 next if ( /^\s+$/ || /^\(Ambiguity/ );
653 20 100       63 if (/^Frequencies\./) {
    50          
    100          
    100          
    100          
654 2         5 $okay = 1;
655             }
656             elsif ( !$okay ) { # skip till we see 'Frequencies.
657 0         0 next;
658             }
659             elsif ( !$header ) {
660 2         6 s/^\s+//; # remove leading whitespace
661 2         7 @bases = split; # get an array of the all the aa names
662 2         3 $header = 1;
663 2         4 $self->{'_summary'}->{'aafreqs'} = {}; # reset/clear values
664 2         3 next;
665             }
666             elsif (
667             /^\#\s+constant\s+sites\:\s+
668             (\d+)\s+ # constant sites
669             \(\s*([\d\.]+)\s*\%\s*\)/x
670             )
671             {
672 2         7 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
673 2         5 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
674             }
675             elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/x) {
676 1         2 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
677 1         2 $done = 1; # done for sure
678             }
679             else {
680 13         70 my ( $seqname, @freqs ) = split;
681 13         16 my $basect = 0;
682 13         14 foreach my $f (@freqs) {
683              
684             # this will also store 'Average'
685             $self->{'_summary'}->{'aafreqs'}->{$seqname}
686 260         302 ->{ $bases[ $basect++ ] } = $f;
687             }
688             }
689             }
690             }
691              
692             # This is for parsing the automatic tree output
693              
694             sub _parse_StarDecomposition {
695 0     0   0 my ($self) = @_;
696 0         0 my %data;
697              
698 0         0 return %data;
699             }
700              
701             sub _parse_aa_dists {
702 3     3   3 my ($self) = @_;
703 3         10 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
704 3         3 my ( %matrix, @names, @values );
705 3 50       2 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  3         8  
706 3         3 my $type = '';
707 3         5 while ( defined( $_ = $self->_readline ) ) {
708 23 100       28 last if $done;
709 21 50       27 if (/^TREE/) { $self->_pushback($_); last; }
  0         0  
  0         0  
710 21 100       46 if (/^\s+$/) {
711 2 50       3 last if ($seen);
712 2         4 next;
713             }
714 19 100       29 if (/^(AA|ML) distances/) {
715 3         5 $okay = 1;
716 3         18 $type = $1;
717 3         8 next;
718             }
719 16         41 s/\s+$//g; # remove trailing space
720 16 50       21 if ($okay) {
721 16         36 my ( $seqname, @vl ) = split;
722 16         14 $seen = 1;
723 16         10 my $i = 0;
724              
725             # hacky workaround to problem with 3.14 aaml
726 16 50 100     37 if (
      66        
727             $type eq 'ML'
728             && !@names
729             && # first entry
730             @vl
731             )
732             { # not empty
733 0         0 push @names, $self->{'_summary'}->{'seqs'}->[0]->display_id;
734             }
735 16         18 for my $s (@names) {
736 35 50       42 last unless @vl;
737 35         56 $matrix{$seqname}->{$s} = $matrix{$s}->{$seqname} = shift @vl;
738             }
739 16         15 push @names, $seqname;
740              
741 16         26 $matrix{$seqname}->{$seqname} = 0;
742             }
743 16 100       33 $done = 1 if ( scalar @names == $numseqs );
744             }
745 3         3 my %dist;
746 3         3 my $i = 0;
747 3         4 @values = ();
748 3         4 foreach my $lname (@names) {
749 16         13 my @row;
750 16         9 my $j = 0;
751 16         16 foreach my $rname (@names) {
752 86         69 my $v = $matrix{$lname}->{$rname};
753 86 50       89 $v = $matrix{$rname}->{$lname} unless defined $v;
754 86         58 push @row, $v;
755 86         116 $dist{$lname}{$rname} = [ $i, $j++ ];
756             }
757 16         12 $i++;
758 16         15 push @values, \@row;
759             }
760             return new Bio::Matrix::PhylipDist(
761 3         23 -program => $self->{'_summary'}->{'seqtype'},
762             -matrix => \%dist,
763             -names => \@names,
764             -values => \@values
765             );
766             }
767              
768             sub _parse_patterns {
769 18     18   17 my ($self) = @_;
770 18         22 my ( $patternct, @patterns, $ns, $ls );
771 18 50       31 return if exists $self->{'_summary'}->{'patterns'};
772              
773 18         35 while ( defined( $_ = $self->_readline ) ) {
774 99 100 66     416 if ( /^Codon\s+(usage|position)/ || /Model/ ) {
    100          
    100          
    100          
775 11         18 $self->_pushback($_);
776 11         11 last;
777             }
778             elsif ($patternct) {
779              
780             # last unless ( @patterns == $patternct );
781 52 100       109 last if (/^\s+$/);
782 45         77 s/^\s+//;
783 45         215 push @patterns, split;
784             }
785             elsif (/^ns\s+\=\s*(\d+)\s+ls\s+\=\s*(\d+)/) {
786 18         49 ( $ns, $ls ) = ( $1, $2 );
787             }
788             elsif (/^\# site patterns \=\s*(\d+)/) {
789 7         16 $patternct = $1;
790             }
791             else {
792              
793             # $self->debug("Unknown line: $_");
794             }
795             }
796 18         82 $self->{'_summary'}->{'patterns'} = {
797             -patterns => \@patterns,
798             -ns => $ns,
799             -ls => $ls
800             };
801             }
802              
803             sub _parse_seqs {
804              
805             # this should in fact be packed into a Bio::SimpleAlign object instead of
806             # an array but we'll stay with this for now
807 26     26   31 my ($self) = @_;
808              
809             # Use this flag to deal with paml 4 vs 3 differences
810             # In PAML 4 the sequences precede the CODONML|BASEML|AAML
811             # while in PAML3 the files start off with this
812 26 100       53 return 1 if $self->{'_already_parsed_seqs'};
813 18         21 my ( @firstseq, @seqs );
814 18         29 while ( defined( $_ = $self->_readline ) ) {
815 91 100       210 if (/^(Printing|After|TREE|Codon)/) {
816 5         11 $self->_pushback($_);
817 5         5 last;
818             }
819 86 100 100     274 last if ( /^\s+$/ && @seqs > 0 );
820 73 100       136 next if (/^\s+$/);
821 67 100       385 next if (/^\d+\s+$/);
822              
823             # we are reading PHYLIP format
824 66         188 my ( $name, $seqstr ) = split( /\s+/, $_, 2 );
825 66         3520 $seqstr =~ s/\s+//g; # remove whitespace
826 66 100       100 unless (@firstseq) {
827 13         566 @firstseq = split( //, $seqstr );
828 13         87 push @seqs,
829             Bio::LocatableSeq->new(
830             -display_id => $name,
831             -seq => $seqstr
832             );
833             }
834             else {
835              
836 53         47 my $i = 0;
837 53         41 my $v;
838 53         132 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
839              
840             # replace the '.' with the correct seq from the
841 2741         1604 substr( $seqstr, $v, 1, $firstseq[$v] );
842 2741         2811 $i = $v;
843             }
844 53         162 push @seqs,
845             Bio::LocatableSeq->new(
846             -display_id => $name,
847             -seq => $seqstr
848             );
849             }
850             }
851 18 100       33 if ( @seqs > 0 ) {
852 13         25 $self->{'_summary'}->{'seqs'} = \@seqs;
853 13         15 $self->{'_already_parsed_seqs'} = 1;
854             }
855 18         267 1;
856             }
857              
858       16     sub _parse_codoncts { }
859              
860             sub _parse_distmat {
861 16     16   17 my ($self) = @_;
862 16         13 my @results;
863 16         15 my $ver = 3.14;
864 16         13 my $firstseq, my $secondseq;
865              
866 16         30 while ( defined( $_ = $self->_readline ) ) {
867 24 100       69 next if /^\s+$/;
868              
869             # We need to get the names of the sequences if this is from YN00:
870 16 100       32 if (/^\(A\)\sNei-Gojobori\s\(1986\)\smethod/) {
871 1         1 $ver = 3.15;
872 1         3 while ( defined( $_ = $self->_readline ) ) {
873 9 100       29 if ($_ =~ m/.*\d+?\.\d+?\s*\(.*/) {
874 1         2 $secondseq = $_;
875 1         1 last;
876             }
877 8         13 $firstseq = $_;
878             }
879             }
880 16         16 last;
881             }
882              
883             #return unless (/^Nei\s*\&\s*Gojobori/);
884              
885             # skip the next 3 lines
886 16 100       37 if ( $self->{'_summary'}->{'seqtype'} eq 'CODONML' ) {
887 14         26 $self->_readline;
888 14         23 $self->_readline;
889 14         19 $self->_readline;
890             }
891 16         20 my $seqct = 0;
892 16         16 my @seqs;
893 16 100       31 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
894 2 100       6 if ($firstseq) {
895 1         5 $firstseq =~ s/(.+?)\s+.*/$1/;
896 1         3 $secondseq =~ s/(.+?)\s+.*/$1/;
897 1         1 chomp $firstseq;
898 1         2 chomp $secondseq;
899 1         7 push @seqs, Bio::PrimarySeq->new( -display_id => $firstseq );
900 1         4 push @seqs, Bio::PrimarySeq->new( -display_id => $secondseq );
901             }
902             }
903 16         31 while ( defined( $_ = $self->_readline ) ) {
904 98 50 66     259 last if ( /^\s+$/ && exists $self->{'_summary'}->{'ngmatrix'} );
905 82 50 33     257 next if ( /^\s+$/ || /^NOTE:/i );
906 82         74 chomp;
907              
908 82         63 my ( $seq, $rest );
909 82 100       106 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
910 12         39 ( $seq, $rest ) = split( /\s+/, $_, 2 );
911             }
912             else {
913 70         271 $_ =~ m/(.+?)\s*(-*\d+?\.\d+?.*)/;
914 70         77 $seq = $1;
915 70         73 $rest = $2;
916             }
917 82 100       106 $rest = '' unless defined $rest; # get rid of empty messages
918 82         64 my $j = 0;
919 82 100       120 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' ) {
920 12         37 push @seqs, Bio::PrimarySeq->new( -display_id => $seq );
921             }
922 82   100     406 while ($rest
923             && $rest =~
924             /(\-?\d+(\.\d+)?)\s*\(\-?(\d+(\.\d+)?)\s+(\-?\d+(\.\d+)?)\)/g )
925             {
926 218         1379 $self->{'_summary'}->{'ngmatrix'}->[ $j++ ]->[$seqct] = {
927             'omega' => $1,
928             'dN' => $3,
929             'dS' => $5
930             };
931             }
932 82         160 $seqct++;
933             }
934 16 100 66     43 if ( $self->{'_summary'}->{'seqtype'} eq 'YN00' && @seqs ) {
935 2         4 $self->{'_summary'}->{'seqs'} = \@seqs;
936             }
937              
938 16         148 1;
939             }
940              
941             sub _parse_PairwiseCodon {
942 8     8   6 my ($self) = @_;
943 8         8 my @result;
944 8         8 my ( $a, $b, $log, $model, $t, $kappa, $omega, $fixedkappa );
945             # check to see if we have a fixed kappa:
946 8 100       40 if ( $self->{'_summary'}->{'model'} =~ /kappa = (\d+?\.\d+?) fixed/) {
947 1         2 $fixedkappa = $1;
948             }
949 8         12 while ( defined( $_ = $self->_readline ) ) {
950 680 100       2057 if (/^pairwise comparison, codon frequencies\:\s*(\S+)\./) {
    100          
    100          
    100          
    50          
    0          
951 8         16 $model = $1;
952             }
953             # 1st line of a pair block, e.g.
954             # 2 (all_c7259) ... 1 (all_s57600)
955             elsif (/^(\d+)\s+\((\S+)\)\s+\.\.\.\s+(\d+)\s+\((\S+)\)/) {
956 112         221 ( $a, $b ) = ( $1, $3 );
957             }
958             # 2nd line of a pair block, e.g.
959             # lnL = -126.880601
960             elsif (/^lnL\s+\=\s*(\-?\d+(\.\d+)?)/) {
961 112         120 $log = $1;
962 112 50       133 if ( defined( $_ = $self->_readline ) ) {
963             # 3rd line of a pair block, e.g.
964             # 0.19045 2.92330 0.10941
965 112         223 s/^\s+//;
966 112         245 ( $t, $kappa, $omega ) = split;
967             # if there was a fixed kappa, there will only be two values here ($t, $omega) and $kappa = $fixedkappa.
968 112 100       271 if ($omega eq "") {
969 1         2 $omega = $kappa;
970 1         2 $kappa = $fixedkappa;
971             }
972             }
973             }
974             # 5th line of a pair block, e.g.
975             # t= 0.1904 S= 5.8 N= 135.2 dN/dS= 0.1094 dN= 0.0476 dS= 0.4353
976             # OR lines like (note last field; this includes a fix for bug #3040)
977             # t= 0.0439 S= 0.0 N= 141.0 dN/dS= 0.1626 dN= 0.0146 dS= nan
978             elsif (m/^t\=\s*(\d+(\.\d+)?)\s+/)
979             {
980             # Breaking out each piece individually so that you can see
981             # what each regexp actually looks for
982 112         78 my $parse_string = $_;
983 112         248 $parse_string =~ m/.*t\s*\=\s*(\d+?\.\d+?)\s/;
984 112         126 my $temp_t = $1;
985 112         172 $parse_string =~ m/\sS\s*\=\s*(\d+?\.\d+?)\s/;
986 112         97 my $temp_S = $1;
987 112         142 $parse_string =~ m/\sN\s*\=\s*(\d+?\.\d+?)\s/;
988 112         94 my $temp_N = $1;
989 112         157 $parse_string =~ m/\sdN\/dS\s*\=\s*(\d+?\.\d+?)\s/;
990 112         94 my $temp_omega = $1;
991 112         168 $parse_string =~ m/\sdN\s*\=\s*(\d+?\.\d+?)\s/;
992 112         108 my $temp_dN = $1;
993 112         182 $parse_string =~ m/\sdS\s*\=\s*(.+)\s/;
994 112         88 my $temp_dS = $1;
995 112 50 33     1025 $result[ $b - 1 ]->[ $a - 1 ] = {
    50 33        
996             'lnL' => $log,
997             't' => defined $t && length($t) ? $t : $temp_t,
998             'S' => $temp_S,
999             'N' => $temp_N,
1000             'kappa' => $kappa,
1001             'omega' => defined $omega && length($omega) ? $omega : $temp_omega,
1002             'dN' => $temp_dN,
1003             'dS' => $temp_dS
1004             };
1005             }
1006             # 4th line of a pair block (which is blank)
1007             elsif (/^\s+$/) {
1008 336         469 next;
1009             }
1010             elsif (/^\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)/) {
1011             }
1012             else {
1013 0         0 $self->debug("unknown line: $_");
1014             }
1015             }
1016 8         26 return ( -mlmatrix => \@result );
1017             }
1018              
1019             sub _parse_YN_Pairwise {
1020 2     2   4 my ($self) = @_;
1021 2         3 my @result;
1022 2         5 while ( defined( $_ = $self->_readline ) ) {
1023 11 100       26 last if (/^seq\.\s+seq\./);
1024             }
1025 2         6 while ( defined( $_ = $self->_readline ) ) {
1026 51 100       295 if (
    100          
    100          
1027             m/^\s+(\d+)\s+ # seq #
1028             (\d+)\s+ # seq #
1029             (\d+(\.\d+))\s+ # S
1030             (\d+(\.\d+))\s+ # N
1031             (\d+(\.\d+))\s+ # t
1032             (\d+(\.\d+))\s+ # kappa
1033             (\d+(\.\d+))\s+ # omega
1034             \-??(\d+(\.\d+))\s+ # dN
1035             \+\-\s+
1036             \-??(\d+(\.\d+))\s+ # dN SE
1037             \-??(\d+(\.\d+))\s+ # dS
1038             \+\-\s+
1039             \-??(\d+(\.\d+))\s+ # dS SE
1040             /ox
1041             )
1042             {
1043              
1044 43         275 $result[ $2 - 1 ]->[ $1 - 1 ] = {
1045             'S' => $3,
1046             'N' => $5,
1047             't' => $7,
1048             'kappa' => $9,
1049             'omega' => $11,
1050             'dN' => $13,
1051             'dN_SE' => $15,
1052             'dS' => $17,
1053             'dS_SE' => $19,
1054             };
1055             }
1056             elsif (/^\s+$/) {
1057 4         6 next;
1058             }
1059             elsif (/^\(C\) LWL85, LPB93 & LWLm methods/) {
1060 1         3 $self->_pushback($_);
1061 1         1 last;
1062             }
1063             }
1064 2         8 return ( -mlmatrix => \@result );
1065             }
1066              
1067             sub _parse_Forestry {
1068 13     13   15 my ($self) = @_;
1069 13         27 my ( $instancecount, $num_param, $loglikelihood, $score, $done,
1070             $treelength ) = ( 0, 0, 0, 0, 0, 0 );
1071 13         16 my $okay = 0;
1072 13         15 my ( @ids, %match, @branches, @trees );
1073 13         23 while ( defined( $_ = $self->_readline ) ) {
1074 175 50       208 last if $done;
1075 175 100 33     1716 if (s/^TREE\s+\#\s*\d+:\s+//) {
    100 33        
    100 33        
    100 66        
    100          
    100          
    100          
1076 13         58 ($score) = (s/MP\s+score\:\s+(\S+)\s+$//);
1077 13         92 @ids = /(\d+)[\,\)]/g;
1078             }
1079             elsif (/^Node\s+\&/
1080             || /^\s+N37/
1081             || /^(CODONML|AAML|YN00|BASEML)/
1082             || /^\*\*/
1083             || /^Detailed output identifying parameters/ )
1084             {
1085 11         32 $self->_pushback($_);
1086 11         12 $done = 1;
1087 11         19 last;
1088             }
1089             elsif (/^tree\s+length\s+\=\s+(\S+)/) {
1090 13         28 $treelength = $1; # not going to store this for now
1091             # as it is directly calculated from
1092             # $tree->total_branch_length;
1093             }
1094             elsif (/^\s*lnL\(.+np\:\s*(\d+)\)\:\s+(\S+)/) {
1095              
1096             # elsif( /^\s*lnL\(.+\)\:\s+(\S+)/ ) {
1097 13         42 ( $num_param, $loglikelihood ) = ( $1, $2 );
1098             }
1099             elsif (/^\(/) {
1100 26         218 s/([\,:])\s+/$1/g;
1101 26         136 my $treestr = IO::String->new($_);
1102 26         937 my $treeio = Bio::TreeIO->new(
1103             -fh => $treestr,
1104             -format => 'newick'
1105             );
1106 26         52 my $tree = $treeio->next_tree;
1107 26 50       44 if ($tree) {
1108 26         39 $tree->score($loglikelihood);
1109 26         59 $tree->id("num_param:$num_param");
1110 26 100       49 if ( $okay > 0 ) {
1111              
1112             # we don't save the trees with the number labels
1113 13 50 33     54 if ( !%match && @ids ) {
1114 13         12 my $i = 0;
1115 13         127 for my $m (/([^():,]+):/g) {
1116 66         115 $match{ shift @ids } = [$m];
1117             }
1118 13         16 my %grp;
1119 13         33 while ( my $br = shift @branches ) {
1120 120         104 my ( $parent, $child ) = @$br;
1121 120 100       131 if ( $match{$child} ) {
1122 93         54 push @{ $match{$parent} }, @{ $match{$child} };
  93         91  
  93         184  
1123             }
1124             else {
1125 27         45 push @branches, $br;
1126             }
1127             }
1128 13 50       29 if ( $self->verbose > 1 ) {
1129 0         0 for my $k ( sort { $a <=> $b } keys %match ) {
  0         0  
1130             $self->debug( "$k -> ",
1131 0         0 join( ",", @{ $match{$k} } ), "\n" );
  0         0  
1132             }
1133             }
1134             }
1135              
1136             # Associate SEs to nodes using tags
1137 13 100       30 if ( defined( $self->{_SEs} ) ) {
1138 3         15 my @SEs = split( " ", $self->{_SEs} );
1139 3         5 my $i = 0;
1140 3         12 foreach my $parent_id ( map { /\d+\.\.(\d+)/ }
  33         54  
1141             split( " ", $self->{_branch_ids} ) )
1142             {
1143 33         23 my @nodes;
1144 33         24 my @node_ids = @{ $match{$parent_id} };
  33         43  
1145             my @nodes_L =
1146 33         33 map { $tree->find_node( -id => $_ ) } @node_ids;
  48         115  
1147 33 100       67 my $n =
1148             @nodes_L < 2
1149             ? shift(@nodes_L)
1150             : $tree->get_lca(@nodes_L);
1151 33 50       49 if ( !$n ) {
1152 0         0 $self->warn(
1153             "no node could be found for node in SE assignation (no lca?)"
1154             );
1155             }
1156 33         57 $n->add_tag_value( 'SE', $SEs[$i] );
1157 33         51 $i++;
1158             }
1159             }
1160 13         19 push @trees, $tree;
1161             }
1162             }
1163 26         70 $okay++;
1164             }
1165             elsif (/^SEs for parameters/) {
1166 3         7 my $se_line = $self->_readline;
1167 3         8 $se_line =~ s/\n//;
1168 3         7 $self->{_SEs} = $se_line;
1169             }
1170             elsif (/^\s*\d+\.\.\d+/) {
1171 13         39 push @branches, map { [ split( /\.\./, $_ ) ] } split;
  93         166  
1172 13         19 my $ids = $_;
1173 13         29 $ids =~ s/\n//;
1174 13         33 $self->{_branch_ids} = $ids;
1175             }
1176             }
1177 13         60 return \@trees, \%match;
1178             }
1179              
1180             sub _parse_NSsitesBatch {
1181 9     9   10 my $self = shift;
1182 9         11 my ( %data, $idlookup );
1183 9         11 my ( $okay, $done ) = ( 0, 0 );
1184 9         15 while ( defined( $_ = $self->_readline ) ) {
1185 120 100       144 last if $done;
1186 115 100       335 next if /^\s+$/;
1187 68 50 100     142 next unless ( $okay || /^Model\s+\d+/ || /^TREE/ );
      66        
1188              
1189 68 100       401 if (/^Model\s+(\d+)/) {
    100          
    100          
    100          
    100          
    100          
    100          
    50          
    50          
    100          
1190 7 50       12 if ($okay) {
1191              
1192             # this only happens if $okay was already 1 and
1193             # we hit a Model line
1194 0         0 $self->_pushback($_);
1195 0         0 $done = 1;
1196             }
1197             else {
1198 7         11 chomp;
1199 7         17 $data{'-model_num'} = $1;
1200 7         27 ( $data{'-model_description'} ) = (/\:\s+(.+)/);
1201 7         14 $okay = 1;
1202             }
1203             }
1204             elsif (/^Time used\:\s+(\S+)/) {
1205 7         21 $data{'-time_used'} = $1;
1206 7         15 $done = 1;
1207             }
1208             elsif (/^kappa\s+\(ts\/tv\)\s+\=\s+(\S+)/) {
1209 9         49 $data{'-kappa'} = $1;
1210             }
1211             elsif (/^TREE/) {
1212 9         15 $self->_pushback($_);
1213 9         22 ( $data{'-trees'}, $idlookup ) = $self->_parse_Forestry;
1214 9 50 50     26 if ( defined $data{'-trees'}
1215 9         23 && scalar @{ $data{'-trees'} } )
1216             {
1217 9         20 $data{'-likelihood'} = $data{'-trees'}->[0]->score;
1218             }
1219 9         21 $okay = 1;
1220             }
1221             elsif (/^omega\s+\(dn\/ds\)\s+\=\s+(\S+)/i) {
1222              
1223             # for M0 (single ratio for the entire tree)
1224             # explicitly put '1.00000' rather than '1', because \d+\.\d{5}
1225             # is reported in all other cases.
1226 1         3 my @p = (q/1.00000/); # since there is only one class,
1227 1         2 my @w = $1;
1228 1         3 $data{'-dnds_site_classes'} = {
1229             'p' => \@p,
1230             'w' => \@w
1231             };
1232              
1233             # since no K=X is provided, put 1 here
1234 1         3 $data{q/-num_site_classes/} = 1;
1235             }
1236             elsif (
1237             /^(Naive Empirical Bayes)|(Bayes Empirical Bayes)|(Positively\sselected\ssites)/i
1238             )
1239             {
1240 6         13 $self->_pushback($_);
1241 6         13 my ( $sites, $neb, $beb ) = $self->_parse_Pos_selected_sites;
1242 6         12 $data{'-pos_sites'} = $sites;
1243 6         8 $data{'-neb_sites'} = $neb;
1244 6         11 $data{'-beb_sites'} = $beb;
1245             }
1246             elsif (/^dN/i) {
1247 15 100       54 if (/K\=(\d+)/) {
    50          
1248 7         17 $data{'-num_site_classes'} = $1;
1249 7         16 while ( $_ = $self->_readline ) {
1250 12 100       41 unless ( $_ =~ /^\s+$/ ) {
1251 7         13 $self->_pushback($_);
1252 7         9 last;
1253             }
1254             }
1255 7 100       15 if (/^site class/) {
1256 1         3 $self->_readline;
1257 1         2 my $tmp = $self->_readline;
1258 1         6 my @p = $tmp =~ /(\d+\.\d{5})/g;
1259 1         3 $tmp = $self->_readline;
1260 1         4 my @b_w = $tmp =~ /(\d+\.\d{5})/g;
1261 1         2 $tmp = $self->_readline;
1262 1         5 my @f_w = $tmp =~ /(\d+\.\d{5})/g;
1263 1         2 my @w;
1264              
1265 1         4 foreach my $i ( 0 .. $#b_w ) {
1266 4         13 push @w,
1267             {
1268             q/background/ => $b_w[$i],
1269             q/foreground/ => $f_w[$i]
1270             };
1271             }
1272 1         5 $data{'-dnds_site_classes'} = {
1273             q/p/ => \@p,
1274             q/w/ => \@w
1275             };
1276             }
1277             else {
1278 6         11 my $tmp = $self->_readline;
1279 6         35 my @p = $tmp =~ /(\d+\.\d{5})/g;
1280 6         12 $tmp = $self->_readline;
1281 6         25 my @w = $tmp =~ /(\d+\.\d{5})/g;
1282 6         28 $data{'-dnds_site_classes'} = {
1283             'p' => \@p,
1284             'w' => \@w
1285             };
1286             }
1287             }
1288             elsif (/for each branch/) {
1289 8         20 my %branch_dnds = $self->_parse_branch_dnds;
1290 8 50       21 if ( !defined $data{'-trees'} ) {
1291 0         0 $self->warn(
1292             "No trees have been loaded, can't do anything\n");
1293 0         0 next;
1294             }
1295 8         7 my ($tree) = @{ $data{'-trees'} };
  8         14  
1296 8 50 33     48 if ( !$tree
      33        
1297             || !ref($tree)
1298             || !$tree->isa('Bio::Tree::Tree') )
1299             {
1300 0         0 $self->warn("no tree object already stored!\n");
1301 0         0 next;
1302             }
1303              
1304             # These need to be added to the Node/branches
1305 8         26 while ( my ( $k, $v ) = each %branch_dnds ) {
1306              
1307             # we can probably do better by caching at some point
1308 56         38 my @nodes;
1309 56         114 for my $id ( split( /\.\./, $k ) ) {
1310             my @nodes_L =
1311 241         741 map { $tree->find_node( -id => $_ ) }
1312 112         75 @{ $idlookup->{$id} };
  112         148  
1313 112 100       224 my $n =
1314             @nodes_L < 2
1315             ? shift(@nodes_L)
1316             : $tree->get_lca(@nodes_L);
1317 112 50       154 if ( !$n ) {
1318 0         0 $self->warn(
1319             "no node could be found for $id (no lca?)");
1320             }
1321 112 100 66     173 unless ( $n->is_Leaf && $n->id ) {
1322 72         105 $n->id($id);
1323             }
1324 112         173 push @nodes, $n;
1325             }
1326 56         79 my ( $parent, $child ) = @nodes;
1327 56         185 while ( my ( $kk, $vv ) = each %$v ) {
1328 504         538 $child->add_tag_value( $kk, $vv );
1329             }
1330             }
1331             }
1332             }
1333             elsif (/^Parameters in beta:/) {
1334 0         0 $_ = $self->_readline; # need the next line
1335 0 0       0 if (/p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1336 0         0 $data{'-shape_params'} = {
1337             'shape' => 'beta',
1338             'p' => $1,
1339             'q' => $2
1340             };
1341             }
1342             else {
1343 0         0 $self->warn("unparseable beta parameters: $_");
1344             }
1345             }
1346             elsif (/^Parameters in beta\&w\>1:/) {
1347              
1348             # Parameters in beta&w>1:
1349             # p0= 1.00000 p= 0.07642 q= 0.85550
1350             # (p1= 0.00000) w= 1.00000
1351 0         0 $_ = $self->_readline; # need the next line
1352 0         0 my ( $p0, $p, $q, $p1, $w );
1353 0 0       0 if (/p0\=\s+(\S+)\s+p\=\s+(\S+)\s+q\=\s+(\S+)/) {
1354 0         0 $p0 = $1;
1355 0         0 $p = $2;
1356 0         0 $q = $3;
1357             }
1358             else {
1359 0         0 $self->warn("unparseable beta parameters: $_");
1360             }
1361 0         0 $_ = $self->_readline; # need the next line
1362 0 0       0 if (/\(p1\=\s+(\S+)\)\s+w\=\s*(\S+)/) {
1363 0         0 $p1 = $1;
1364 0         0 $w = $2;
1365 0         0 $data{'-shape_params'} = {
1366             'shape' => 'beta',
1367             'p0' => $p0,
1368             'p' => $p,
1369             'q' => $q,
1370             'p1' => $p1,
1371             'w' => $w
1372             };
1373             }
1374             else {
1375 0         0 $self->warn("unparseable beta parameters: $_");
1376             }
1377             }
1378             elsif (/^alpha\s+\(gamma\)\s+\=\s+(\S+)/) {
1379 3         6 my $gamma = $1;
1380 3         7 $_ = $self->_readline;
1381 3         4 my ( @r, @f );
1382 3 50       16 if (s/^r\s+\(\s*\d+\)\:\s+//) {
1383 3         8 @r = split;
1384             }
1385 3         5 $_ = $self->_readline;
1386 3 50       17 if (s/^f\s*\:\s+//) {
1387 3         8 @f = split;
1388             }
1389 3         23 $data{'-shape_params'} = {
1390             'shape' => 'alpha',
1391             'gamma' => $gamma,
1392             'r' => \@r,
1393             'f' => \@f
1394             };
1395             }
1396             }
1397 9         105 return new Bio::Tools::Phylo::PAML::ModelResult(%data);
1398             }
1399              
1400             sub _parse_Pos_selected_sites {
1401 6     6   6 my $self = shift;
1402 6         7 my $okay = 0;
1403 6         22 my (%sites) = (
1404             'default' => [],
1405             'neb' => [],
1406             'beb' => []
1407             );
1408 6         8 my $type = 'default';
1409 6         12 while ( defined( $_ = $self->_readline ) ) {
1410 116 100 100     390 next if ( /^\s+$/ || /^\s+Pr\(w\>1\)/ );
1411 80 100 66     230 if ( /^Time used/ || /^TREE/ ) {
1412 4         8 $self->_pushback($_);
1413 4         6 last;
1414             }
1415 76 100 100     556 if (/^Naive Empirical Bayes/i) {
    100 100        
    100 100        
    100          
    100          
    100          
1416 5         10 $type = 'neb';
1417             }
1418             elsif (/^Bayes Empirical Bayes/i) {
1419 2         5 $type = 'beb';
1420             }
1421             elsif (/^Positively selected sites/) {
1422 4         9 $okay = 1;
1423             }
1424             elsif ( $okay
1425             && /^\s+(\d+)\s+(\S+)\s+(\-?\d+(?:\.\d+)?)(\**)\s+(\-?\d+(?:\.\d+)?)\s+\+\-\s+(\-?\d+(?:\.\d+)?)/
1426             )
1427             {
1428 9         11 my $signif = $4;
1429 9 50       10 $signif = '' unless defined $signif;
1430 9         6 push @{ $sites{$type} }, [ $1, $2, $3, $signif, $5, $6 ];
  9         42  
1431             }
1432             elsif ( $okay
1433             && /^\s+(\d+)\s+(\S+)\s+(\-?\d*(?:.\d+))(\**)\s+(\-?\d+(?:\.\d+)?)/
1434             )
1435             {
1436 8         12 my $signif = $4;
1437 8 50       11 $signif = '' unless defined $signif;
1438 8         6 push @{ $sites{$type} }, [ $1, $2, $3, $signif, $5 ];
  8         39  
1439             }
1440             elsif ( $okay && /^\s+(\d+)\s+(\S)\s+([\d\.\-\+]+)(\**)/ ) {
1441 8         28 my $signif = $4;
1442 8 50       13 $signif = '' unless defined $signif;
1443 8         5 push @{ $sites{$type} }, [ $1, $2, $3, $signif ];
  8         33  
1444             }
1445             }
1446 6         17 return ( $sites{'default'}, $sites{'neb'}, $sites{'beb'} );
1447             }
1448              
1449             sub _parse_branch_dnds {
1450 10     10   12 my $self = shift;
1451 10         15 my ($okay) = (0);
1452 10         9 my %branch_dnds;
1453             my @header;
1454 10         16 while ( defined( $_ = $self->_readline ) ) {
1455 136 100       318 next if (/^\s+$/);
1456 98 50 66     163 next unless ( $okay || /^\s+branch\s+t/ );
1457 98 100       246 if (/^\s+branch\s+(.+)/) {
    100          
1458 10         30 s/^\s+//;
1459 10         61 @header = split( /\s+/, $_ );
1460 10         19 $okay = 1;
1461             }
1462             elsif (/^\s*(\d+\.\.\d+)/) {
1463 78         86 my $branch = $1;
1464 78         118 s/^\s+//;
1465 78         58 my $i = 0;
1466              
1467             # fancyness just maps the header names like 't' or 'dN'
1468             # into the hash so we get at the end of the day
1469             # 't' => 0.067
1470             # 'dN'=> 0.001
1471 78         150 $branch_dnds{$branch} = { map { $header[ $i++ ] => $_ } split };
  702         1049  
1472             }
1473             else {
1474 10         19 $self->_pushback($_);
1475 10         13 last;
1476             }
1477             }
1478 10         58 return %branch_dnds;
1479             }
1480              
1481             #baseml stuff
1482             sub _parse_nt_freqs {
1483 2     2   3 my ($self) = @_;
1484 2         3 my ( $okay, $done, $header ) = ( 0, 0, 0 );
1485 2         2 my (@bases);
1486 2 50       2 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  2         6  
1487 2         4 while ( defined( $_ = $self->_readline ) ) {
1488 28 50 33     76 if ( /^TREE/ || /^Distances/ ) { $self->_pushback($_); last }
  0         0  
  0         0  
1489 28 100       35 last if ($done);
1490 26 100 66     82 next if ( /^\s+$/ || /^\(Ambiguity/ );
1491 16 100       60 if (/^Frequencies\./) {
    50          
    100          
    100          
    100          
1492 2         4 $okay = 1;
1493             }
1494             elsif ( !$okay ) { # skip till we see 'Frequencies.
1495 0         0 next;
1496             }
1497             elsif ( !$header ) {
1498 2         5 s/^\s+//; # remove leading whitespace
1499 2         6 @bases = split; # get an array of the all the aa names
1500 2         2 $header = 1;
1501 2         3 $self->{'_summary'}->{'ntfreqs'} = {}; # reset/clear values
1502 2         4 next;
1503             }
1504             elsif (
1505             /^\#\s+constant\s+sites\:\s+
1506             (\d+)\s+ # constant sites
1507             \(\s*([\d\.]+)\s*\%\s*\)/ox
1508             )
1509             {
1510 2         12 $self->{'_summary'}->{'stats'}->{'constant_sites'} = $1;
1511 2         6 $self->{'_summary'}->{'stats'}->{'constant_sites_percentage'} = $2;
1512             }
1513             elsif (/^ln\s+Lmax\s+\(unconstrained\)\s+\=\s+(\S+)/ox) {
1514 2         5 $self->{'_summary'}->{'stats'}->{'loglikelihood'} = $1;
1515 2         3 $done = 1; # done for sure
1516             }
1517             else {
1518 8         18 my ( $seqname, @freqs ) = split;
1519 8         10 my $basect = 0;
1520 8         11 foreach my $f (@freqs) {
1521              
1522             # this will also store 'Average'
1523             $self->{'_summary'}->{'ntfreqs'}->{$seqname}
1524 32         52 ->{ $bases[ $basect++ ] } = $f;
1525             }
1526             }
1527             }
1528             }
1529              
1530             sub _parse_nt_dists {
1531 2     2   2 my ($self) = @_;
1532 2         3 my ( $okay, $seen, $done ) = ( 0, 0, 0 );
1533 2         1 my ( %matrix, @names );
1534 2 50       2 my $numseqs = scalar @{ $self->{'_summary'}->{'seqs'} || [] };
  2         4  
1535 2         3 my $type = '';
1536 2         4 while ( defined( $_ = $self->_readline ) ) {
1537 13 50       18 if (/^TREE/) { $self->_pushback($_); last; }
  0         0  
  0         0  
1538 13 100       15 last if $done;
1539 12 100       19 next if (/^This matrix is not used in later/);
1540 10 100       23 if (/^\s+$/) {
1541 2 50       4 last if ($seen);
1542 2         4 next;
1543             }
1544 8 100       18 if (/^Distances:(\S+)\s+\(([^\)]+)\)\s+\(alpha set at (\-?\d+\.\d+)\)/)
1545             {
1546 2         1 $okay = 1;
1547 2         6 $type = $1;
1548 2         5 next;
1549             }
1550 6         17 s/\s+$//g; # remove trailing space
1551 6 50       10 if ($okay) {
1552 6         12 my ( $seqname, $vl ) = split( /\s+/, $_, 2 );
1553 6         7 $seen = 1;
1554 6         3 my $i = 0;
1555 6 100       10 if ( defined $vl ) {
1556 4         16 while ( $vl =~ /(\-?\d+\.\d+)\s*\(\s*(\-?\d+\.\d+)\s*\)\s*/g ) {
1557 6         9 my ( $kappa, $alpha ) = ( $1, $2 );
1558             $matrix{$seqname}{ $names[$i] } =
1559 6         17 $matrix{ $names[$i] }{$seqname} = [ $kappa, $alpha ];
1560              
1561 6         13 $i++;
1562             }
1563 4 50       7 unless ($i) {
1564 0         0 $self->warn("no matches for $vl\n");
1565             }
1566             }
1567 6         6 push @names, $seqname;
1568 6         18 $matrix{$seqname}->{$seqname} = [ 0, 0 ];
1569             }
1570 6 100       18 $done = 1 if ( scalar @names == $numseqs );
1571             }
1572 2         3 my %dist;
1573 2         2 my $i = 0;
1574 2         4 my ( @kvalues, @avalues );
1575 2         3 foreach my $lname (@names) {
1576 6         3 my ( @arow, @krow );
1577 6         5 my $j = 0;
1578 6         5 foreach my $rname (@names) {
1579 18         16 my $v = $matrix{$lname}{$rname};
1580              
1581 18         16 push @krow, $v->[0]; # kappa values
1582 18         12 push @arow, $v->[1]; # alpha
1583 18         28 $dist{$lname}{$rname} = [ $i, $j++ ];
1584             }
1585 6         5 $i++;
1586 6         5 push @kvalues, \@krow;
1587 6         8 push @avalues, \@arow;
1588             }
1589             return (
1590             Bio::Matrix::PhylipDist->new(
1591             -program => $self->{'_summary'}->{'seqtype'},
1592             -matrix => \%dist,
1593             -names => \@names,
1594             -values => \@kvalues
1595             ),
1596             Bio::Matrix::PhylipDist->new(
1597 2         32 -program => $self->{'_summary'}->{'seqtype'},
1598             -matrix => \%dist,
1599             -names => \@names,
1600             -values => \@avalues
1601             )
1602             );
1603             }
1604              
1605             # BASEML
1606             sub _parse_rate_parametes {
1607 0     0   0 my $self = shift;
1608 0         0 my (%rate_parameters);
1609 0         0 while ( defined( $_ = $self->_readline ) ) {
1610 0 0       0 if (/^Rate\s+parameters:\s+/) {
    0          
    0          
    0          
    0          
1611 0         0 s/\s+$//;
1612 0         0 $rate_parameters{'rate_parameters'} = [ split( /\s+/, $_ ) ];
1613             }
1614             elsif (/^Base\s+frequencies:\s+/) {
1615 0         0 s/\s+$//;
1616 0         0 $rate_parameters{'base_frequencies'} = [ split( /\s+/, $_ ) ];
1617             }
1618             elsif (
1619             m/^Rate\s+matrix\s+Q,\s+Average\s+Ts\/Tv\s+(\([^\)+]+\))?\s*\=\s+
1620             (\-?\d+\.\d+)/x
1621             )
1622             {
1623 0         0 $rate_parameters{'average_TsTv'} = $1;
1624 0         0 while ( defined( $_ = $self->_readline ) ) {
1625              
1626             # short circuit
1627 0 0       0 last if (/^\s+$/);
1628 0 0       0 if (/^alpha/) {
1629 0         0 $self->_pushback($_);
1630 0         0 last;
1631             }
1632 0         0 s/^\s+//;
1633 0         0 s/\s+$//;
1634 0         0 push @{ $rate_parameters{'rate_matrix_Q'} }, [split];
  0         0  
1635             }
1636             }
1637             elsif (/^alpha\s+\(gamma,\s+K=\s*(\d+)\s*\)\s*\=\s*(\-?\d+\.\d+)/) {
1638 0         0 $rate_parameters{'K'} = $1;
1639 0         0 $rate_parameters{'alpha'} = $2;
1640             }
1641             elsif (s/^(r|f):\s+//) {
1642 0         0 my ($p) = $1;
1643 0         0 s/\s+$//;
1644 0         0 $rate_parameters{$p} = [split];
1645             }
1646             }
1647             }
1648              
1649             # RST parsing
1650             sub _parse_rst {
1651 20     20   20 my ($self) = @_;
1652 20 50 66     83 return unless $self->{'_dir'} && -d $self->{'_dir'} && -r $self->{'_dir'};
      66        
1653              
1654 1         10 my $rstfile = File::Spec->catfile( $self->{'_dir'}, $RSTFILENAME );
1655 1 50 33     27 return unless -e $rstfile && !-z $rstfile;
1656 1         5 my $rstio = Bio::Root::IO->new( -file => $rstfile );
1657              
1658             # define whatever data structures you need to store the data
1659             # key points are to reuse existing bioperl objs (like Bio::Seq)
1660             # where appropriate
1661 1         2 my ( @firstseq, @seqs, @trees, @per_site_prob );
1662 0         0 my $count;
1663 1         3 while ( defined( $_ = $rstio->_readline ) ) {
1664              
1665             # implement the parsing here
1666 616 100 66     2536 if (/^TREE\s+\#\s+(\d+)/) {
    100          
    100          
    50          
    100          
1667 1         2 while ( defined( $_ = $rstio->_readline ) ) {
1668 10 100       20 if (/tree\s+with\s+node\s+labels\s+for/) {
1669 1         3 my $tree = Bio::TreeIO->new(
1670             -noclose => 1,
1671             -fh => $rstio->_fh,
1672             -format => 'newick'
1673             )->next_tree;
1674              
1675             # cleanup leading/trailing whitespace
1676 1         2 for my $n ( $tree->get_nodes ) {
1677 12         13 my $id = $n->id;
1678 12         14 $id =~ s/^\s+//;
1679 12         18 $id =~ s/\s+$//;
1680 12         13 $n->id($id);
1681 12 100       12 if ( defined( my $blen = $n->branch_length ) ) {
1682 11         10 $blen =~ s/^\s+//;
1683 11         8 $blen =~ s/\s+$//;
1684 11         12 $n->branch_length($blen);
1685             }
1686             }
1687 1         2 push @trees, $tree;
1688 1         4 last;
1689             }
1690             }
1691             }
1692             elsif (/^Prob\sof\sbest\scharacter\sat\seach\snode,\slisted\sby\ssite/)
1693             {
1694 1         3 $self->{'_rst'}->{'persite'} = [];
1695 1         3 while ( defined( $_ = $rstio->_readline ) ) {
1696 135 100 100     484 next if ( /^Site/i || /^\s+$/ );
1697 131 100       472 if (s/^\s+(\d+)\s+(\d+)\s+([^:]+)\s*:\s*(.+)//) {
    50          
1698 130         232 my ( $sitenum, $freq, $extant, $ancestral ) =
1699             ( $1, $2, $3, $4 );
1700 130         79 my ( @anc_site, @extant_site );
1701 130         131 @extant_site = {};
1702 130         358 while ( $extant =~ s/^([A-Z\-]{3})\s+\(([A-Z*])\)\s+//g ) {
1703 910         3234 push @extant_site, { 'codon' => $1, 'aa' => $2 };
1704             }
1705 130         399 while (
1706             $ancestral =~ s/^([A-Z\-]{3})\s+([A-Z*])\s+ # codon AA
1707             (\S+)\s+ # Prob
1708             \(([A-Z*])\s+(\S+)\)\s*//xg # AA Prob
1709             )
1710             {
1711 650         3252 push @anc_site,
1712             {
1713             'codon' => $1,
1714             'aa' => $2,
1715             'prob' => $3,
1716             'Yang95_aa' => $4,
1717             'Yang95_aa_prob' => $5
1718             };
1719             }
1720              
1721             # saving persite
1722 130         471 $self->{'_rst'}->{'persite'}->[$sitenum] =
1723             [ @extant_site, @anc_site ];
1724             }
1725             elsif (/^Summary\sof\schanges\salong\sbranches\./) {
1726 1         3 last;
1727             }
1728             }
1729             }
1730             elsif (/^Check\sroot\sfor\sdirections\sof\schange\./
1731             || /^Summary\sof\schanges\salong\sbranches\./ )
1732             {
1733 1         1 my ( @branches, @branch2node, $branch, $node );
1734 1         2 my $tree = $trees[-1];
1735 1 50       3 if ( !$tree ) {
1736 0         0 $self->warn("No tree built before parsing Branch changes\n");
1737 0         0 last;
1738             }
1739             my @nodes = (
1740 12         10 map { $_->[0] }
1741 30         22 sort { $a->[1] <=> $b->[1] }
1742 1         4 map { [ $_, $_->id =~ /^(\d+)\_?/ ] } $tree->get_nodes
  12         15  
1743             );
1744 1         4 unshift @nodes,
1745             undef; # fake first node so that index will match nodeid
1746 1         3 while ( defined( $_ = $rstio->_readline ) ) {
1747 93 100       202 next if /^\s+$/;
1748 57 100       204 if (m/^List\sof\sextant\sand\sreconstructed\ssequences/) {
    100          
    50          
1749 1         3 $rstio->_pushback($_);
1750 1         3 last;
1751             }
1752             elsif (/^Branch\s+(\d+):\s+(\d+)\.\.(\d+)\s+/) {
1753 11         8 my ( $left, $right );
1754 11         19 ( $branch, $left, $right ) = ( $1, $2, $3 );
1755 11         12 ($node) = $nodes[$right];
1756 11 50       16 if ( !$node ) {
1757 0         0 $self->warn(
1758             "cannot find $right in $tree ($branch $left..$right)\n"
1759             );
1760 0         0 last;
1761             }
1762 11 50       31 if (/\(n=\s*(\S+)\s+s=\s*(\S+)\)/) {
1763 11         18 $node->add_tag_value( 'n', $1 );
1764 11         12 $node->add_tag_value( 's', $2 );
1765             }
1766 11         42 $branch2node[$branch] = $right;
1767             }
1768             elsif (
1769             /^\s+(\d+)\s+([A-Z*])\s+(\S+)\s+\-\>\s+([A-Z*])\s+(\S+)?/)
1770             {
1771 45         67 my ( $site, $anc, $aprob, $derived, $dprob ) =
1772             ( $1, $2, $3, $4, $5 );
1773 45 50       51 if ( !$node ) {
1774 0         0 $self->warn("no branch line was previously parsed!");
1775 0         0 next;
1776             }
1777 45         105 my %c = (
1778             'site' => $site,
1779             'anc_aa' => $anc,
1780             'anc_prob' => $aprob,
1781             'derived_aa' => $derived,
1782             );
1783 45 100       71 $c{'derived_prob'} = $dprob if defined $dprob;
1784 45         61 $node->add_tag_value( 'changes', \%c );
1785             }
1786             }
1787             }
1788             elsif (
1789             /^Overall\s+accuracy\s+of\s+the\s+(\d+)\s+ancestral\s+sequences:/)
1790             {
1791 0         0 my $line = $rstio->_readline;
1792 0         0 $line =~ s/^\s+//;
1793 0         0 $line =~ s/\s+$//;
1794 0         0 my @overall_site = split( /\s+/, $line );
1795              
1796             # skip next 2 lines, want the third
1797 0         0 for ( 1 .. 3 ) {
1798 0         0 $line = $rstio->_readline;
1799             }
1800 0         0 $line =~ s/^\s+//;
1801 0         0 $line =~ s/\s+$//;
1802 0         0 my @overall_seq = split( /\s+/, $line );
1803 0 0 0     0 if ( @overall_seq != @overall_site
1804             || @overall_seq != @seqs )
1805             {
1806 0         0 $self->warn(
1807             "out of sync somehow seqs, site scores don't match\n");
1808 0         0 $self->warn("@seqs @overall_seq @overall_site\n");
1809             }
1810 0         0 for (@seqs) {
1811 0         0 $_->description(
1812             sprintf(
1813             "overall_accuracy_site=%s overall_accuracy_seq=%s",
1814             shift @overall_site,
1815             shift @overall_seq
1816             )
1817             );
1818             }
1819             }
1820             elsif (m/^List of extant and reconstructed sequences/o) {
1821 3         2 my $seqcount = 0;
1822 3         7 while ( defined( $_ = $rstio->_readline ) ) {
1823 311 100       370 last if (/^Overall accuracy of the/);
1824 309 100       479 if (/^\s+$/) {
1825 61 50 33     78 last if $seqcount && $seqcount == @seqs;
1826 61         81 next;
1827             }
1828 248 50       447 if (/^\s*(\d+)\s+(\d+)\s+$/) { $seqcount = $1; next }
  0         0  
  0         0  
1829              
1830             # runmode = (0)
1831             # this should in fact be packed into a Bio::SimpleAlign object
1832             # instead of an array but we'll stay with this for now
1833 248 100       435 if (/^node/) {
1834 20         60 my ( $name, $num, $seqstr ) = split( /\s+/, $_, 3 );
1835 20         21 $name .= $num;
1836 20         650 $seqstr =~ s/\s+//g; # remove whitespace
1837 20 100       35 unless (@firstseq) {
1838 1         52 @firstseq = split( //, $seqstr );
1839 1         9 push @seqs,
1840             Bio::LocatableSeq->new(
1841             -display_id => $name,
1842             -seq => $seqstr
1843             );
1844             }
1845             else {
1846 19         11 my $i = 0;
1847 19         14 my $v;
1848 19         42 while ( ( $v = index( $seqstr, '.', $i ) ) >= $i ) {
1849              
1850             # replace the '.' with the correct seq from the
1851 0         0 substr( $seqstr, $v, 1, $firstseq[$v] );
1852 0         0 $i = $v;
1853             }
1854 19         52 $self->debug("adding seq $seqstr\n");
1855 19         56 push @seqs,
1856             Bio::LocatableSeq->new(
1857             -display_id => $name,
1858             -seq => $seqstr
1859             );
1860             }
1861             }
1862             }
1863 3         7 $self->{'_rst'}->{'rctrted_seqs'} = \@seqs;
1864             }
1865             else {
1866             }
1867             }
1868 1         3 $self->{'_rst'}->{'trees'} = \@trees;
1869 1         17 return;
1870             }
1871              
1872             1;