File Coverage

blib/lib/Bio/Phylo/Parsers/Newick.pm
Criterion Covered Total %
statement 181 188 96.2
branch 73 80 91.2
condition 107 128 83.5
subroutine 13 14 92.8
pod n/a
total 374 410 91.2


line stmt bran cond sub pod time code
1             package Bio::Phylo::Parsers::Newick;
2 24     24   641 use warnings;
  24         43  
  24         753  
3 24     24   132 use strict;
  24         45  
  24         515  
4 24     24   105 use base 'Bio::Phylo::Parsers::Abstract';
  24         39  
  24         6334  
5 24     24   155 no warnings 'recursion';
  24         44  
  24         44087  
6              
7             =head1 NAME
8              
9             Bio::Phylo::Parsers::Newick - Parser used by Bio::Phylo::IO, no serviceable parts inside
10              
11             =head1 DESCRIPTION
12              
13             This module parses tree descriptions in parenthetical format. It is called by the
14             L facade, don't call it directly. Several additional flags can be
15             passed to the Bio::Phylo::IO parse and parse_tree functions to influence how to deal
16             with complex newick strings:
17              
18             -keep => [ ...list of taxa names... ]
19              
20             The C<-keep> flag allows you to only retain certain taxa of interest, ignoring others
21             while building the tree object.
22              
23             -ignore_comments => 1,
24              
25             This will treat comments in square brackets as if they are a normal taxon name character,
26             this so that names such as C are parsed
27             "successfully". (Note: square brackets should NOT be used in this way as it will break
28             many parsers).
29              
30             -keep_whitespace => 1,
31              
32             This will treat unescaped whitespace as if it is a normal taxon name character. Normally,
33             whitespace is only retained inside quoted strings (e.g. C<'Homo sapiens'>), otherwise it
34             is the convention to use underscores (C). This is because some programs
35             introduce whitespace to prettify a newick string, e.g. to indicate indentation/depth,
36             in which case you almost certainly want to ignore it. This is the default behaviour. The
37             option to keep it is provided for dealing with incorrectly formatted data.
38              
39             =cut
40              
41 83     83   289 sub _return_is_scalar { 1 }
42              
43              
44             sub _simplify {
45             # Simplify a Newick tree string by removing unneeded nodes. The leaves to
46             # keep are given as $ids, an arrayref of terminal node IDs. Note that only
47             # cherries are simplified to keep the function fast. Ternary or higher order
48             # branches are left alone. Quoted strings should be handled properly.
49 56     56   122 my ($string, $ids) = @_;
50 56         106 my %id_hash = map { $_ => undef } @$ids;
  106         248  
51              
52             # Setup some regular expressions:
53             # 1/ ID is anything but these characters (except when quoted): , ; : ( ) " '
54 56         172 my $id_re_simple = qr/[^)(,:"';]+/;
55 56         103 my $id_re_squote = qr/[^']+/;
56 56         95 my $id_re_dquote = qr/[^']+/;
57 56         273 my $id_re = qr/ (?: $id_re_simple | '$id_re_squote' | "$id_re_dquote" ) /x;
58             # 2/ Distance is a real number (regexp taken from Regexp::Common $RE{num}{real})
59 56         108 my $dist_re = qr/(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))/;
60             # 3/ A pair of ID and distance (both optional)
61 56         264 my $pair_re = qr/ ($id_re)? (?: \: ($dist_re) )? /x;
62             # 4/ Cherry
63 56         426 my $cherry_re = qr/ ( \( $pair_re , $pair_re \) $pair_re ) /x;
64             # 5/ Whitespaces
65 56         115 my $ws_re = qr/ \s+ /msx;
66              
67             # Remove spaces and newlines (no spaces allowed in node names)
68 56         205 $string =~ s/$ws_re//g;
69              
70             # Prune cherries
71 56         87 my $prev_string = '';
72 56         133 while (not $string eq $prev_string) {
73 134         194 $prev_string = $string;
74 134         1378 $string =~ s/ $cherry_re / _prune_cherry($1, $2, $3, $4, $5, $6, $7, \%id_hash) /gex;
  108         251  
75             }
76 56         167 __PACKAGE__->_logger->debug("simplified string by removing unneeded nodes");
77 56         392 return $string;
78             }
79              
80              
81             sub _prune_cherry {
82 108     108   437 my ($match, $id1, $dist1, $id2, $dist2, $idp, $distp, $id_hash) = @_;
83 108         169 my $repl;
84 108   100     299 my $id1_exists = defined $id1 && exists $id_hash->{$id1};
85 108   100     216 my $id2_exists = defined $id2 && exists $id_hash->{$id2};
86 108 100 100     228 if ( $id1_exists && $id2_exists ) {
87             # Keep both leaves
88 27         38 $repl = $match;
89             } else {
90             # There are from zero to one leaves to keep. Delete one of them.
91 81 100       146 my ($id, $dist) = $id1_exists ? ($id1, $dist1) : ($id2, $dist2);
92 81 100 100     162 if ( defined($dist) || defined($distp) ) {
93 64   100     316 $dist = ':'.(($dist||0) + ($distp||0));
      100        
94             }
95 81 100 50     194 $id ||= '' if not defined $id;
96 81 100 50     142 $dist ||= '' if not defined $dist;
97 81         116 $repl = $id.$dist;
98             }
99 108         450 return $repl;
100             }
101              
102              
103             sub _parse {
104 84     84   176 my $self = shift;
105 84         364 my $fh = $self->_handle;
106 84         340 my $forest = $self->_factory->create_forest;
107              
108 84         188 my $string;
109 84         733 while (<$fh>) {
110 106         280 chomp;
111 106         467 $string .= $_;
112             }
113              
114 84         481 my $ids = $self->_args->{'-keep'};
115 84         334 my $ignore = $self->_args->{'-ignore_comments'};
116 84         280 my $whitespace = $self->_args->{'-keep_whitespace'};
117 84         346 my $quotes = $self->_args->{'-ignore_quotes'};
118              
119             # remove comments, split on tree descriptions
120 84         168 my $counter = 1;
121              
122 84         347 for my $newick ( $self->_split($string,$ignore,$whitespace,$quotes) ) {
123 104         467 $self->_logger->debug("going to process newick string " . $counter++);
124             # simplify tree
125 104 100       928 if ($ids) {
126 1         6 $newick = _simplify($string, $ids);
127             }
128            
129             # parse trees
130 104         394 my $tree = $self->_parse_string($newick);
131              
132             # adding labels to untagged nodes
133 104 50       567 if ( $self->_args->{'-label'} ) {
134 0         0 my $i = 1;
135             $tree->visit(
136             sub {
137 0     0   0 my $n = shift;
138 0 0       0 $n->set_name( 'n' . $i++ ) unless $n->get_name;
139             }
140 0         0 );
141             }
142 104         641 $forest->insert($tree);
143             }
144 84         380 return $forest;
145             }
146              
147             =begin comment
148              
149             Type : Parser
150             Title : _split($string)
151             Usage : my @strings = $newick->_split($string);
152             Function: Creates an array of (decommented) tree descriptions
153             Returns : A Bio::Phylo::Forest::Tree object.
154             Args : $string = concatenated tree descriptions
155              
156             =end comment
157              
158             =cut
159              
160             sub _split {
161 84     84   335 my ( $self, $string, $ignore, $whitespace, $quotes ) = @_;
162 84         342 my $log = $self->_logger;
163 84         233 my ( $QUOTED, $COMMENTED ) = ( 0, 0 );
164 84         176 my $decommented = '';
165 84         222 my @trees;
166 84         526 TOKEN: for my $i ( 0 .. length($string) ) {
167 79115         114565 my $token = substr( $string, $i, 1 );
168              
169             # detect apostrophe as ' between two letters
170 79115 100       130223 my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
171 79115 100       127852 my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
172 79115   33     153620 my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
173 79115 50       118978 $log->debug("detected apostrophe") if $apostr;
174              
175 79115 100 100     614928 if ( !$QUOTED && !$COMMENTED && $token eq "'" && ! $quotes && ! $apostr ) {
    100 100        
    100 66        
    100 66        
      100        
      100        
      100        
      100        
      100        
      66        
      66        
      100        
      66        
      66        
      33        
176 3         6 $QUOTED++;
177             }
178             elsif ( !$QUOTED && !$COMMENTED && $token eq "[" && ! $ignore ) {
179 2         5 $COMMENTED++;
180 2         10 $log->debug("quote level changed to $COMMENTED");
181 2         5 next TOKEN;
182             }
183             elsif ( !$QUOTED && $COMMENTED && $token eq "]" && ! $ignore ) {
184 2         4 $COMMENTED--;
185 2         4 next TOKEN;
186             }
187             elsif ($QUOTED
188             && !$COMMENTED
189             && $token eq "'"
190             && substr( $string, $i, 2 ) ne "''" && ! $quotes && ! $apostr )
191             {
192 3         9 $QUOTED--;
193             }
194 79111 100 100     199377 if ( !$QUOTED && $token eq ' ' && ! $whitespace ) {
      66        
195 18         27 next TOKEN;
196             }
197 79093 100       126529 $decommented .= $token unless $COMMENTED;
198 79093 100 100     273749 if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq ';' ) {
      100        
199 104         367 push @trees, $decommented;
200 104         236 $decommented = '';
201             }
202              
203             }
204 84         717 $log->debug("removed comments, split on tree descriptions");
205 84         469 $log->debug("found ".scalar(@trees)." tree descriptions");
206 84         359 return @trees;
207             }
208              
209             =begin comment
210              
211             Type : Parser
212             Title : _parse_string($string)
213             Usage : my $tree = $newick->_parse_string($string);
214             Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick
215             string.
216             Returns : A Bio::Phylo::Forest::Tree object.
217             Args : $string = a newick tree description
218              
219             =end comment
220              
221             =cut
222              
223             sub _parse_string {
224 104     104   291 my ( $self, $string ) = @_;
225 104         704 my $fac = $self->_factory;
226 104         331 $self->_logger->debug("going to parse tree string '$string'");
227 104         886 my $tree = $fac->create_tree;
228 104         274 my $remainder = $string;
229 104         222 my $token;
230             my @tokens;
231 104         527 while ( ( $token, $remainder ) = $self->_next_token($remainder) ) {
232 16732 100 66     48117 last if ( !defined $token || !defined $remainder );
233 16628         34316 $self->_logger->debug("fetched token '$token'");
234              
235 16628         39245 push @tokens, $token;
236             }
237 104         221 my $i;
238 104         469 for ( $i = $#tokens ; $i >= 0 ; $i-- ) {
239 104 50       427 last if $tokens[$i] eq ';';
240             }
241 104         1321 my $root = $fac->create_node;
242 104         820 $tree->insert($root);
243 104         1922 $self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] );
244 104         1415 $self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] );
245 104         3087 return $tree;
246             }
247              
248             sub _parse_clade {
249 4044     4044   27004 my ( $self, $tree, $root, @tokens ) = @_;
250 4044         9628 my $fac = $self->_factory;
251 4044         8074 $self->_logger->debug("recursively parsing clade '@tokens'");
252 4044         6957 my ( @clade, $depth, @remainder );
253 4044         9208 TOKEN: for my $i ( 0 .. $#tokens ) {
254 230305 100 100     492717 if ( $tokens[$i] eq '(' ) {
    100          
    100          
255 25265 100       33112 if ( not defined $depth ) {
256 1907         2447 $depth = 1;
257 1907         3258 next TOKEN;
258             }
259             else {
260 23358         25326 $depth++;
261             }
262             }
263             elsif ( $tokens[$i] eq ',' && $depth == 1 ) {
264 2033         10876 my $node = $fac->create_node;
265 2033         6657 $root->set_child($node);
266 2033         5858 $tree->insert($node);
267 2033         6899 $self->_parse_node_data( $node, @clade );
268 2033         8901 $self->_parse_clade( $tree, $node, @clade );
269 2033         8825 @clade = ();
270 2033         3800 next TOKEN;
271             }
272             elsif ( $tokens[$i] eq ')' ) {
273 25265         27040 $depth--;
274 25265 100       34756 if ( $depth == 0 ) {
275 1907         5572 @remainder = @tokens[ ( $i + 1 ) .. $#tokens ];
276 1907         10528 my $node = $fac->create_node;
277 1907         6281 $root->set_child($node);
278 1907         5574 $tree->insert($node);
279 1907         6294 $self->_parse_node_data( $node, @clade );
280 1907         7192 $self->_parse_clade( $tree, $node, @clade );
281 1907         20958 last TOKEN;
282             }
283             }
284 224458         309481 push @clade, $tokens[$i];
285             }
286             }
287              
288             sub _parse_node_data {
289 3829     3829   38253 my ( $self, $node, @clade ) = @_;
290 3829         9710 $self->_logger->debug("parsing name and branch length for node");
291 3829         5905 my @tail;
292 3829         9160 PARSE_TAIL: for ( my $i = $#clade ; $i >= 0 ; $i-- ) {
293 11832 100       26025 if ( $clade[$i] eq ')' ) {
    100          
294 1800         5026 @tail = @clade[ ( $i + 1 ) .. $#clade ];
295 1800         3650 last PARSE_TAIL;
296             }
297             elsif ( $i == 0 ) {
298 2029         5597 @tail = @clade;
299             }
300             }
301            
302 3829 50 100     14594 if ( defined($tail[-1]) and $tail[-1] =~ /(\[.+\])$/ and scalar @tail != 1 ) {
      66        
303 0         0 my $anno = $1;
304 0         0 $self->_logger->info("discarding branch comment $anno");
305 0         0 $tail[-1] =~ s/\Q$anno\E//;
306             }
307              
308             # name only
309 3829 100       10808 if ( scalar @tail == 1 ) {
    100          
    100          
310 317         1375 $node->set_name( $tail[0] );
311             }
312             elsif ( scalar @tail == 2 ) {
313 245         711 $node->set_branch_length( $tail[-1] );
314             }
315             elsif ( scalar @tail == 3 ) {
316 3075         9968 $node->set_name( $tail[0] );
317 3075         9160 $node->set_branch_length( $tail[-1] );
318             }
319             }
320              
321             sub _next_token {
322 16732     16732   26200 my ( $self, $string ) = @_;
323 16732         28268 $self->_logger->debug("tokenizing string '$string'");
324 16732         35811 my $ignore = $self->_args->{'-ignore_comments'};
325 16732         20910 my $QUOTED = 0;
326 16732         18947 my $COMMENTED = 0;
327 16732         19574 my $token = '';
328 16732         36024 my $TOKEN_DELIMITER = qr/[():,;]/;
329 16732         32404 TOKEN: for my $i ( 0 .. length($string) ) {
330 86182         140163 $token .= substr( $string, $i, 1 );
331 86182         174473 $self->_logger->debug("growing token: '$token'");
332            
333             # detect apostrophe as ' between two letters
334 86182 100       187491 my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
335 86182 100       161850 my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
336 86182   33     175867 my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
337 86182 50       134870 $self->_logger->debug("detected apostrophe") if $apostr;
338              
339             # if -ignore_comments was specified the string can still contain comments
340             # that can contain token delimiters, so we still need to track
341             # whether we are inside a comment
342 86182 100 100     183907 if ( $ignore && $token =~ /\[$/ ) {
343 267         432 $COMMENTED++;
344             }
345 86182 100 100     167629 if ( $ignore && $token =~ /\]$/ ) {
346 267         429 $COMMENTED--;
347 267         704 next TOKEN;
348             }
349 85915 100 100     370369 if ( !$QUOTED && !$COMMENTED && $token =~ $TOKEN_DELIMITER ) {
      100        
350 16628         23385 my $length = length($token);
351 16628 100       25717 if ( $length == 1 ) {
352 9486         19122 $self->_logger->debug("single char token: '$token'");
353 9486         44375 return $token, substr( $string, ( $i + 1 ) );
354             }
355             else {
356 7142         14669 $self->_logger->debug(
357             sprintf( "range token: %s",
358             substr( $token, 0, $length - 1 ) )
359             );
360 7142         48290 return substr( $token, 0, $length - 1 ),
361             substr( $token, $length - 1, 1 )
362             . substr( $string, ( $i + 1 ) );
363             }
364             }
365 69287 100 100     317260 if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq "'" && ! $apostr ) {
    100 100        
      66        
      66        
      100        
      66        
      66        
366 3         12 $QUOTED++;
367             }
368             elsif ($QUOTED && !$COMMENTED
369             && substr( $string, $i, 1 ) eq "'"
370             && substr( $string, $i, 2 ) ne "''" && ! $apostr)
371             {
372 3         11 $QUOTED--;
373             }
374             }
375             }
376              
377             # podinherit_insert_token
378              
379             =head1 SEE ALSO
380              
381             There is a mailing list at L
382             for any user or developer questions and discussions.
383              
384             =over
385              
386             =item L
387              
388             The newick parser is called by the L object.
389             Look there to learn how to parse newick strings.
390              
391             =item L
392              
393             Also see the manual: L and L.
394              
395             =back
396              
397             =head1 CITATION
398              
399             If you use Bio::Phylo in published research, please cite it:
400              
401             B, B, B, B
402             and B, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
403             I B<12>:63.
404             L
405              
406             =cut
407              
408             1;