File Coverage

blib/lib/Bio/Phylo/Parsers/Newick.pm

Criterion	Covered	Total	%
statement	181	188	96.2
branch	73	80	91.2
condition	107	128	83.5
subroutine	13	14	92.8
pod			n/a
total	374	410	91.2

line	stmt	bran	cond	sub	time	code
1						package Bio::Phylo::Parsers::Newick;
2	24			24	453	use warnings;
	24				45
	24				695
3	24			24	108	use strict;
	24				42
	24				459
4	24			24	100	use base 'Bio::Phylo::Parsers::Abstract';
	24				42
	24				6200
5	24			24	155	no warnings 'recursion';
	24				43
	24				42759
6
7						=head1 NAME
8
9						Bio::Phylo::Parsers::Newick - Parser used by Bio::Phylo::IO, no serviceable parts inside
10
11						=head1 DESCRIPTION
12
13						This module parses tree descriptions in parenthetical format. It is called by the
14						L<Bio::Phylo::IO> facade, don't call it directly. Several additional flags can be
15						passed to the Bio::Phylo::IO parse and parse_tree functions to influence how to deal
16						with complex newick strings:
17
18						-keep => [ ...list of taxa names... ]
19
20						The C<-keep> flag allows you to only retain certain taxa of interest, ignoring others
21						while building the tree object.
22
23						-ignore_comments => 1,
24
25						This will treat comments in square brackets as if they are a normal taxon name character,
26						this so that names such as C<Choristoneura diversana\|BC ZSM Lep 23401[05/*> are parsed
27						"successfully". (Note: square brackets should NOT be used in this way as it will break
28						many parsers).
29
30						-keep_whitespace => 1,
31
32						This will treat unescaped whitespace as if it is a normal taxon name character. Normally,
33						whitespace is only retained inside quoted strings (e.g. C<'Homo sapiens'>), otherwise it
34						is the convention to use underscores (C<Homo_sapiens>). This is because some programs
35						introduce whitespace to prettify a newick string, e.g. to indicate indentation/depth,
36						in which case you almost certainly want to ignore it. This is the default behaviour. The
37						option to keep it is provided for dealing with incorrectly formatted data.
38
39						=cut
40
41	83			83	293	sub _return_is_scalar { 1 }
42
43
44						sub _simplify {
45						# Simplify a Newick tree string by removing unneeded nodes. The leaves to
46						# keep are given as $ids, an arrayref of terminal node IDs. Note that only
47						# cherries are simplified to keep the function fast. Ternary or higher order
48						# branches are left alone. Quoted strings should be handled properly.
49	56			56	132	my ($string, $ids) = @_;
50	56				103	my %id_hash = map { $_ => undef } @$ids;
	106				239
51
52						# Setup some regular expressions:
53						# 1/ ID is anything but these characters (except when quoted): , ; : ( ) " '
54	56				167	my $id_re_simple = qr/[^)(,:"';]+/;
55	56				103	my $id_re_squote = qr/[^']+/;
56	56				97	my $id_re_dquote = qr/[^']+/;
57	56				288	my $id_re = qr/ (?: $id_re_simple \| '$id_re_squote' \| "$id_re_dquote" ) /x;
58						# 2/ Distance is a real number (regexp taken from Regexp::Common $RE{num}{real})
59	56				101	my $dist_re = qr/(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))\|))/;
60						# 3/ A pair of ID and distance (both optional)
61	56				241	my $pair_re = qr/ ($id_re)? (?: \: ($dist_re) )? /x;
62						# 4/ Cherry
63	56				334	my $cherry_re = qr/ ( $ $pair_re , $pair_re $ $pair_re ) /x;
64						# 5/ Whitespaces
65	56				102	my $ws_re = qr/ \s+ /msx;
66
67						# Remove spaces and newlines (no spaces allowed in node names)
68	56				193	$string =~ s/$ws_re//g;
69
70						# Prune cherries
71	56				85	my $prev_string = '';
72	56				121	while (not $string eq $prev_string) {
73	134				205	$prev_string = $string;
74	134				1307	$string =~ s/ $cherry_re / _prune_cherry($1, $2, $3, $4, $5, $6, $7, \%id_hash) /gex;
	108				256
75						}
76	56				159	__PACKAGE__->_logger->debug("simplified string by removing unneeded nodes");
77	56				373	return $string;
78						}
79
80
81						sub _prune_cherry {
82	108			108	441	my ($match, $id1, $dist1, $id2, $dist2, $idp, $distp, $id_hash) = @_;
83	108				153	my $repl;
84	108		100		286	my $id1_exists = defined $id1 && exists $id_hash->{$id1};
85	108		100		225	my $id2_exists = defined $id2 && exists $id_hash->{$id2};
86	108	100	100		245	if ( $id1_exists && $id2_exists ) {
87						# Keep both leaves
88	27				44	$repl = $match;
89						} else {
90						# There are from zero to one leaves to keep. Delete one of them.
91	81	100			139	my ($id, $dist) = $id1_exists ? ($id1, $dist1) : ($id2, $dist2);
92	81	100	100		179	if ( defined($dist) \|\| defined($distp) ) {
93	64		100		304	$dist = ':'.(($dist\|\|0) + ($distp\|\|0));
			100
94						}
95	81	100	50		200	$id \|\|= '' if not defined $id;
96	81	100	50		154	$dist \|\|= '' if not defined $dist;
97	81				129	$repl = $id.$dist;
98						}
99	108				442	return $repl;
100						}
101
102
103						sub _parse {
104	84			84	177	my $self = shift;
105	84				376	my $fh = $self->_handle;
106	84				296	my $forest = $self->_factory->create_forest;
107
108	84				176	my $string;
109	84				595	while (<$fh>) {
110	106				233	chomp;
111	106				408	$string .= $_;
112						}
113
114	84				411	my $ids = $self->_args->{'-keep'};
115	84				220	my $ignore = $self->_args->{'-ignore_comments'};
116	84				205	my $whitespace = $self->_args->{'-keep_whitespace'};
117	84				189	my $quotes = $self->_args->{'-ignore_quotes'};
118
119						# remove comments, split on tree descriptions
120	84				157	my $counter = 1;
121
122	84				290	for my $newick ( $self->_split($string,$ignore,$whitespace,$quotes) ) {
123	104				401	$self->_logger->debug("going to process newick string " . $counter++);
124						# simplify tree
125	104	100			305	if ($ids) {
126	1				3	$newick = _simplify($string, $ids);
127						}
128
129						# parse trees
130	104				353	my $tree = $self->_parse_string($newick);
131
132						# adding labels to untagged nodes
133	104	50			412	if ( $self->_args->{'-label'} ) {
134	0				0	my $i = 1;
135						$tree->visit(
136						sub {
137	0			0	0	my $n = shift;
138	0	0			0	$n->set_name( 'n' . $i++ ) unless $n->get_name;
139						}
140	0				0	);
141						}
142	104				547	$forest->insert($tree);
143						}
144	84				337	return $forest;
145						}
146
147						=begin comment
148
149						Type : Parser
150						Title : _split($string)
151						Usage : my @strings = $newick->_split($string);
152						Function: Creates an array of (decommented) tree descriptions
153						Returns : A Bio::Phylo::Forest::Tree object.
154						Args : $string = concatenated tree descriptions
155
156						=end comment
157
158						=cut
159
160						sub _split {
161	84			84	252	my ( $self, $string, $ignore, $whitespace, $quotes ) = @_;
162	84				281	my $log = $self->_logger;
163	84				211	my ( $QUOTED, $COMMENTED ) = ( 0, 0 );
164	84				144	my $decommented = '';
165	84				145	my @trees;
166	84				320	TOKEN: for my $i ( 0 .. length($string) ) {
167	85317				108684	my $token = substr( $string, $i, 1 );
168
169						# detect apostrophe as ' between two letters
170	85317	100			121081	my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
171	85317	100			123455	my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
172	85317		33		143991	my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
173	85317	50			114419	$log->debug("detected apostrophe") if $apostr;
174
175	85317	100	100		612818	if ( !$QUOTED && !$COMMENTED && $token eq "'" && ! $quotes && ! $apostr ) {
		100	100
		100	66
		100	66
			100
			100
			100
			100
			100
			66
			66
			100
			66
			66
			33
176	3				5	$QUOTED++;
177						}
178						elsif ( !$QUOTED && !$COMMENTED && $token eq "[" && ! $ignore ) {
179	2				3	$COMMENTED++;
180	2				10	$log->debug("quote level changed to $COMMENTED");
181	2				6	next TOKEN;
182						}
183						elsif ( !$QUOTED && $COMMENTED && $token eq "]" && ! $ignore ) {
184	2				4	$COMMENTED--;
185	2				6	next TOKEN;
186						}
187						elsif ($QUOTED
188						&& !$COMMENTED
189						&& $token eq "'"
190						&& substr( $string, $i, 2 ) ne "''" && ! $quotes && ! $apostr )
191						{
192	3				5	$QUOTED--;
193						}
194	85313	100	100		195225	if ( !$QUOTED && $token eq ' ' && ! $whitespace ) {
			66
195	18				25	next TOKEN;
196						}
197	85295	100			121982	$decommented .= $token unless $COMMENTED;
198	85295	100	100		270849	if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq ';' ) {
			100
199	104				373	push @trees, $decommented;
200	104				234	$decommented = '';
201						}
202
203						}
204	84				568	$log->debug("removed comments, split on tree descriptions");
205	84				411	$log->debug("found ".scalar(@trees)." tree descriptions");
206	84				307	return @trees;
207						}
208
209						=begin comment
210
211						Type : Parser
212						Title : _parse_string($string)
213						Usage : my $tree = $newick->_parse_string($string);
214						Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick
215						string.
216						Returns : A Bio::Phylo::Forest::Tree object.
217						Args : $string = a newick tree description
218
219						=end comment
220
221						=cut
222
223						sub _parse_string {
224	104			104	247	my ( $self, $string ) = @_;
225	104				321	my $fac = $self->_factory;
226	104				294	$self->_logger->debug("going to parse tree string '$string'");
227	104				801	my $tree = $fac->create_tree;
228	104				237	my $remainder = $string;
229	104				203	my $token;
230						my @tokens;
231	104				392	while ( ( $token, $remainder ) = $self->_next_token($remainder) ) {
232	18523	100	66		56037	last if ( !defined $token \|\| !defined $remainder );
233	18419				36752	$self->_logger->debug("fetched token '$token'");
234
235	18419				44712	push @tokens, $token;
236						}
237	104				217	my $i;
238	104				420	for ( $i = $#tokens ; $i >= 0 ; $i-- ) {
239	104	50			340	last if $tokens[$i] eq ';';
240						}
241	104				982	my $root = $fac->create_node;
242	104				692	$tree->insert($root);
243	104				1975	$self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] );
244	104				1286	$self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] );
245	104				3645	return $tree;
246						}
247
248						sub _parse_clade {
249	4442			4442	35341	my ( $self, $tree, $root, @tokens ) = @_;
250	4442				11214	my $fac = $self->_factory;
251	4442				9732	$self->_logger->debug("recursively parsing clade '@tokens'");
252	4442				8271	my ( @clade, $depth, @remainder );
253	4442				11006	TOKEN: for my $i ( 0 .. $#tokens ) {
254	316654	100	100		716953	if ( $tokens[$i] eq '(' ) {
		100
		100
255	34793	100			47046	if ( not defined $depth ) {
256	2106				3199	$depth = 1;
257	2106				3996	next TOKEN;
258						}
259						else {
260	32687				35416	$depth++;
261						}
262						}
263						elsif ( $tokens[$i] eq ',' && $depth == 1 ) {
264	2232				13320	my $node = $fac->create_node;
265	2232				8163	$root->set_child($node);
266	2232				7307	$tree->insert($node);
267	2232				8438	$self->_parse_node_data( $node, @clade );
268	2232				10595	$self->_parse_clade( $tree, $node, @clade );
269	2232				12624	@clade = ();
270	2232				4620	next TOKEN;
271						}
272						elsif ( $tokens[$i] eq ')' ) {
273	34793				36607	$depth--;
274	34793	100			48287	if ( $depth == 0 ) {
275	2106				7211	@remainder = @tokens[ ( $i + 1 ) .. $#tokens ];
276	2106				13298	my $node = $fac->create_node;
277	2106				7505	$root->set_child($node);
278	2106				6777	$tree->insert($node);
279	2106				8448	$self->_parse_node_data( $node, @clade );
280	2106				8983	$self->_parse_clade( $tree, $node, @clade );
281	2106				34363	last TOKEN;
282						}
283						}
284	310210				440971	push @clade, $tokens[$i];
285						}
286						}
287
288						sub _parse_node_data {
289	4227			4227	56980	my ( $self, $node, @clade ) = @_;
290	4227				12000	$self->_logger->debug("parsing name and branch length for node");
291	4227				6184	my @tail;
292	4227				11043	PARSE_TAIL: for ( my $i = $#clade ; $i >= 0 ; $i-- ) {
293	13225	100			29688	if ( $clade[$i] eq ')' ) {
		100
294	1999				6968	@tail = @clade[ ( $i + 1 ) .. $#clade ];
295	1999				4443	last PARSE_TAIL;
296						}
297						elsif ( $i == 0 ) {
298	2228				6506	@tail = @clade;
299						}
300						}
301
302	4227	50	100		18621	if ( defined($tail[-1]) and $tail[-1] =~ /(\[.+\])$/ and scalar @tail != 1 ) {
			66
303	0				0	my $anno = $1;
304	0				0	$self->_logger->info("discarding branch comment $anno");
305	0				0	$tail[-1] =~ s/\Q$anno\E//;
306						}
307
308						# name only
309	4227	100			13054	if ( scalar @tail == 1 ) {
		100
		100
310	317				986	$node->set_name( $tail[0] );
311						}
312						elsif ( scalar @tail == 2 ) {
313	245				674	$node->set_branch_length( $tail[-1] );
314						}
315						elsif ( scalar @tail == 3 ) {
316	3473				13133	$node->set_name( $tail[0] );
317	3473				11128	$node->set_branch_length( $tail[-1] );
318						}
319						}
320
321						sub _next_token {
322	18523			18523	28911	my ( $self, $string ) = @_;
323	18523				31344	$self->_logger->debug("tokenizing string '$string'");
324	18523				41069	my $ignore = $self->_args->{'-ignore_comments'};
325	18523				24019	my $QUOTED = 0;
326	18523				21813	my $COMMENTED = 0;
327	18523				22104	my $token = '';
328	18523				41896	my $TOKEN_DELIMITER = qr/[():,;]/;
329	18523				38898	TOKEN: for my $i ( 0 .. length($string) ) {
330	93180				137919	$token .= substr( $string, $i, 1 );
331	93180				169601	$self->_logger->debug("growing token: '$token'");
332
333						# detect apostrophe as ' between two letters
334	93180	100			184040	my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
335	93180	100			163328	my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
336	93180		33		177277	my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
337	93180	50			133121	$self->_logger->debug("detected apostrophe") if $apostr;
338
339						# if -ignore_comments was specified the string can still contain comments
340						# that can contain token delimiters, so we still need to track
341						# whether we are inside a comment
342	93180	100	100		166848	if ( $ignore && $token =~ /\[$/ ) {
343	267				390	$COMMENTED++;
344						}
345	93180	100	100		164600	if ( $ignore && $token =~ /\]$/ ) {
346	267				383	$COMMENTED--;
347	267				543	next TOKEN;
348						}
349	92913	100	100		406501	if ( !$QUOTED && !$COMMENTED && $token =~ $TOKEN_DELIMITER ) {
			100
350	18419				26563	my $length = length($token);
351	18419	100			27330	if ( $length == 1 ) {
352	10481				20687	$self->_logger->debug("single char token: '$token'");
353	10481				51884	return $token, substr( $string, ( $i + 1 ) );
354						}
355						else {
356	7938				16293	$self->_logger->debug(
357						sprintf( "range token: %s",
358						substr( $token, 0, $length - 1 ) )
359						);
360	7938				56400	return substr( $token, 0, $length - 1 ),
361						substr( $token, $length - 1, 1 )
362						. substr( $string, ( $i + 1 ) );
363						}
364						}
365	74494	100	100		320521	if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq "'" && ! $apostr ) {
		100	100
			66
			66
			100
			66
			66
366	3				7	$QUOTED++;
367						}
368						elsif ($QUOTED && !$COMMENTED
369						&& substr( $string, $i, 1 ) eq "'"
370						&& substr( $string, $i, 2 ) ne "''" && ! $apostr)
371						{
372	3				7	$QUOTED--;
373						}
374						}
375						}
376
377						# podinherit_insert_token
378
379						=head1 SEE ALSO
380
381						There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
382						for any user or developer questions and discussions.
383
384						=over
385
386						=item L<Bio::Phylo::IO>
387
388						The newick parser is called by the L<Bio::Phylo::IO> object.
389						Look there to learn how to parse newick strings.
390
391						=item L<Bio::Phylo::Manual>
392
393						Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.
394
395						=back
396
397						=head1 CITATION
398
399						If you use Bio::Phylo in published research, please cite it:
400
401						B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
402						and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
403						I<BMC Bioinformatics> B<12>:63.
404						L<http://dx.doi.org/10.1186/1471-2105-12-63>
405
406						=cut
407
408						1;