File Coverage

blib/lib/Bio/Phylo/Parsers/Newick.pm

Criterion	Covered	Total	%
statement	181	188	96.2
branch	73	80	91.2
condition	107	128	83.5
subroutine	13	14	92.8
pod			n/a
total	374	410	91.2

line	stmt	bran	cond	sub	time	code
1						package Bio::Phylo::Parsers::Newick;
2	24			24	641	use warnings;
	24				43
	24				753
3	24			24	132	use strict;
	24				45
	24				515
4	24			24	105	use base 'Bio::Phylo::Parsers::Abstract';
	24				39
	24				6334
5	24			24	155	no warnings 'recursion';
	24				44
	24				44087
6
7						=head1 NAME
8
9						Bio::Phylo::Parsers::Newick - Parser used by Bio::Phylo::IO, no serviceable parts inside
10
11						=head1 DESCRIPTION
12
13						This module parses tree descriptions in parenthetical format. It is called by the
14						L facade, don't call it directly. Several additional flags can be
15						passed to the Bio::Phylo::IO parse and parse_tree functions to influence how to deal
16						with complex newick strings:
17
18						-keep => [ ...list of taxa names... ]
19
20						The C<-keep> flag allows you to only retain certain taxa of interest, ignoring others
21						while building the tree object.
22
23						-ignore_comments => 1,
24
25						This will treat comments in square brackets as if they are a normal taxon name character,
26						this so that names such as C are parsed
27						"successfully". (Note: square brackets should NOT be used in this way as it will break
28						many parsers).
29
30						-keep_whitespace => 1,
31
32						This will treat unescaped whitespace as if it is a normal taxon name character. Normally,
33						whitespace is only retained inside quoted strings (e.g. C<'Homo sapiens'>), otherwise it
34						is the convention to use underscores (C). This is because some programs
35						introduce whitespace to prettify a newick string, e.g. to indicate indentation/depth,
36						in which case you almost certainly want to ignore it. This is the default behaviour. The
37						option to keep it is provided for dealing with incorrectly formatted data.
38
39						=cut
40
41	83			83	289	sub _return_is_scalar { 1 }
42
43
44						sub _simplify {
45						# Simplify a Newick tree string by removing unneeded nodes. The leaves to
46						# keep are given as $ids, an arrayref of terminal node IDs. Note that only
47						# cherries are simplified to keep the function fast. Ternary or higher order
48						# branches are left alone. Quoted strings should be handled properly.
49	56			56	122	my ($string, $ids) = @_;
50	56				106	my %id_hash = map { $_ => undef } @$ids;
	106				248
51
52						# Setup some regular expressions:
53						# 1/ ID is anything but these characters (except when quoted): , ; : ( ) " '
54	56				172	my $id_re_simple = qr/[^)(,:"';]+/;
55	56				103	my $id_re_squote = qr/[^']+/;
56	56				95	my $id_re_dquote = qr/[^']+/;
57	56				273	my $id_re = qr/ (?: $id_re_simple \| '$id_re_squote' \| "$id_re_dquote" ) /x;
58						# 2/ Distance is a real number (regexp taken from Regexp::Common $RE{num}{real})
59	56				108	my $dist_re = qr/(?:(?i)(?:[+-]?)(?:(?=[.]?[0123456789])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))\|))/;
60						# 3/ A pair of ID and distance (both optional)
61	56				264	my $pair_re = qr/ ($id_re)? (?: \: ($dist_re) )? /x;
62						# 4/ Cherry
63	56				426	my $cherry_re = qr/ ( $ $pair_re , $pair_re $ $pair_re ) /x;
64						# 5/ Whitespaces
65	56				115	my $ws_re = qr/ \s+ /msx;
66
67						# Remove spaces and newlines (no spaces allowed in node names)
68	56				205	$string =~ s/$ws_re//g;
69
70						# Prune cherries
71	56				87	my $prev_string = '';
72	56				133	while (not $string eq $prev_string) {
73	134				194	$prev_string = $string;
74	134				1378	$string =~ s/ $cherry_re / _prune_cherry($1, $2, $3, $4, $5, $6, $7, \%id_hash) /gex;
	108				251
75						}
76	56				167	__PACKAGE__->_logger->debug("simplified string by removing unneeded nodes");
77	56				392	return $string;
78						}
79
80
81						sub _prune_cherry {
82	108			108	437	my ($match, $id1, $dist1, $id2, $dist2, $idp, $distp, $id_hash) = @_;
83	108				169	my $repl;
84	108		100		299	my $id1_exists = defined $id1 && exists $id_hash->{$id1};
85	108		100		216	my $id2_exists = defined $id2 && exists $id_hash->{$id2};
86	108	100	100		228	if ( $id1_exists && $id2_exists ) {
87						# Keep both leaves
88	27				38	$repl = $match;
89						} else {
90						# There are from zero to one leaves to keep. Delete one of them.
91	81	100			146	my ($id, $dist) = $id1_exists ? ($id1, $dist1) : ($id2, $dist2);
92	81	100	100		162	if ( defined($dist) \|\| defined($distp) ) {
93	64		100		316	$dist = ':'.(($dist\|\|0) + ($distp\|\|0));
			100
94						}
95	81	100	50		194	$id \|\|= '' if not defined $id;
96	81	100	50		142	$dist \|\|= '' if not defined $dist;
97	81				116	$repl = $id.$dist;
98						}
99	108				450	return $repl;
100						}
101
102
103						sub _parse {
104	84			84	176	my $self = shift;
105	84				364	my $fh = $self->_handle;
106	84				340	my $forest = $self->_factory->create_forest;
107
108	84				188	my $string;
109	84				733	while (<$fh>) {
110	106				280	chomp;
111	106				467	$string .= $_;
112						}
113
114	84				481	my $ids = $self->_args->{'-keep'};
115	84				334	my $ignore = $self->_args->{'-ignore_comments'};
116	84				280	my $whitespace = $self->_args->{'-keep_whitespace'};
117	84				346	my $quotes = $self->_args->{'-ignore_quotes'};
118
119						# remove comments, split on tree descriptions
120	84				168	my $counter = 1;
121
122	84				347	for my $newick ( $self->_split($string,$ignore,$whitespace,$quotes) ) {
123	104				467	$self->_logger->debug("going to process newick string " . $counter++);
124						# simplify tree
125	104	100			928	if ($ids) {
126	1				6	$newick = _simplify($string, $ids);
127						}
128
129						# parse trees
130	104				394	my $tree = $self->_parse_string($newick);
131
132						# adding labels to untagged nodes
133	104	50			567	if ( $self->_args->{'-label'} ) {
134	0				0	my $i = 1;
135						$tree->visit(
136						sub {
137	0			0	0	my $n = shift;
138	0	0			0	$n->set_name( 'n' . $i++ ) unless $n->get_name;
139						}
140	0				0	);
141						}
142	104				641	$forest->insert($tree);
143						}
144	84				380	return $forest;
145						}
146
147						=begin comment
148
149						Type : Parser
150						Title : _split($string)
151						Usage : my @strings = $newick->_split($string);
152						Function: Creates an array of (decommented) tree descriptions
153						Returns : A Bio::Phylo::Forest::Tree object.
154						Args : $string = concatenated tree descriptions
155
156						=end comment
157
158						=cut
159
160						sub _split {
161	84			84	335	my ( $self, $string, $ignore, $whitespace, $quotes ) = @_;
162	84				342	my $log = $self->_logger;
163	84				233	my ( $QUOTED, $COMMENTED ) = ( 0, 0 );
164	84				176	my $decommented = '';
165	84				222	my @trees;
166	84				526	TOKEN: for my $i ( 0 .. length($string) ) {
167	79115				114565	my $token = substr( $string, $i, 1 );
168
169						# detect apostrophe as ' between two letters
170	79115	100			130223	my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
171	79115	100			127852	my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
172	79115		33		153620	my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
173	79115	50			118978	$log->debug("detected apostrophe") if $apostr;
174
175	79115	100	100		614928	if ( !$QUOTED && !$COMMENTED && $token eq "'" && ! $quotes && ! $apostr ) {
		100	100
		100	66
		100	66
			100
			100
			100
			100
			100
			66
			66
			100
			66
			66
			33
176	3				6	$QUOTED++;
177						}
178						elsif ( !$QUOTED && !$COMMENTED && $token eq "[" && ! $ignore ) {
179	2				5	$COMMENTED++;
180	2				10	$log->debug("quote level changed to $COMMENTED");
181	2				5	next TOKEN;
182						}
183						elsif ( !$QUOTED && $COMMENTED && $token eq "]" && ! $ignore ) {
184	2				4	$COMMENTED--;
185	2				4	next TOKEN;
186						}
187						elsif ($QUOTED
188						&& !$COMMENTED
189						&& $token eq "'"
190						&& substr( $string, $i, 2 ) ne "''" && ! $quotes && ! $apostr )
191						{
192	3				9	$QUOTED--;
193						}
194	79111	100	100		199377	if ( !$QUOTED && $token eq ' ' && ! $whitespace ) {
			66
195	18				27	next TOKEN;
196						}
197	79093	100			126529	$decommented .= $token unless $COMMENTED;
198	79093	100	100		273749	if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq ';' ) {
			100
199	104				367	push @trees, $decommented;
200	104				236	$decommented = '';
201						}
202
203						}
204	84				717	$log->debug("removed comments, split on tree descriptions");
205	84				469	$log->debug("found ".scalar(@trees)." tree descriptions");
206	84				359	return @trees;
207						}
208
209						=begin comment
210
211						Type : Parser
212						Title : _parse_string($string)
213						Usage : my $tree = $newick->_parse_string($string);
214						Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick
215						string.
216						Returns : A Bio::Phylo::Forest::Tree object.
217						Args : $string = a newick tree description
218
219						=end comment
220
221						=cut
222
223						sub _parse_string {
224	104			104	291	my ( $self, $string ) = @_;
225	104				704	my $fac = $self->_factory;
226	104				331	$self->_logger->debug("going to parse tree string '$string'");
227	104				886	my $tree = $fac->create_tree;
228	104				274	my $remainder = $string;
229	104				222	my $token;
230						my @tokens;
231	104				527	while ( ( $token, $remainder ) = $self->_next_token($remainder) ) {
232	16732	100	66		48117	last if ( !defined $token \|\| !defined $remainder );
233	16628				34316	$self->_logger->debug("fetched token '$token'");
234
235	16628				39245	push @tokens, $token;
236						}
237	104				221	my $i;
238	104				469	for ( $i = $#tokens ; $i >= 0 ; $i-- ) {
239	104	50			427	last if $tokens[$i] eq ';';
240						}
241	104				1321	my $root = $fac->create_node;
242	104				820	$tree->insert($root);
243	104				1922	$self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] );
244	104				1415	$self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] );
245	104				3087	return $tree;
246						}
247
248						sub _parse_clade {
249	4044			4044	27004	my ( $self, $tree, $root, @tokens ) = @_;
250	4044				9628	my $fac = $self->_factory;
251	4044				8074	$self->_logger->debug("recursively parsing clade '@tokens'");
252	4044				6957	my ( @clade, $depth, @remainder );
253	4044				9208	TOKEN: for my $i ( 0 .. $#tokens ) {
254	230305	100	100		492717	if ( $tokens[$i] eq '(' ) {
		100
		100
255	25265	100			33112	if ( not defined $depth ) {
256	1907				2447	$depth = 1;
257	1907				3258	next TOKEN;
258						}
259						else {
260	23358				25326	$depth++;
261						}
262						}
263						elsif ( $tokens[$i] eq ',' && $depth == 1 ) {
264	2033				10876	my $node = $fac->create_node;
265	2033				6657	$root->set_child($node);
266	2033				5858	$tree->insert($node);
267	2033				6899	$self->_parse_node_data( $node, @clade );
268	2033				8901	$self->_parse_clade( $tree, $node, @clade );
269	2033				8825	@clade = ();
270	2033				3800	next TOKEN;
271						}
272						elsif ( $tokens[$i] eq ')' ) {
273	25265				27040	$depth--;
274	25265	100			34756	if ( $depth == 0 ) {
275	1907				5572	@remainder = @tokens[ ( $i + 1 ) .. $#tokens ];
276	1907				10528	my $node = $fac->create_node;
277	1907				6281	$root->set_child($node);
278	1907				5574	$tree->insert($node);
279	1907				6294	$self->_parse_node_data( $node, @clade );
280	1907				7192	$self->_parse_clade( $tree, $node, @clade );
281	1907				20958	last TOKEN;
282						}
283						}
284	224458				309481	push @clade, $tokens[$i];
285						}
286						}
287
288						sub _parse_node_data {
289	3829			3829	38253	my ( $self, $node, @clade ) = @_;
290	3829				9710	$self->_logger->debug("parsing name and branch length for node");
291	3829				5905	my @tail;
292	3829				9160	PARSE_TAIL: for ( my $i = $#clade ; $i >= 0 ; $i-- ) {
293	11832	100			26025	if ( $clade[$i] eq ')' ) {
		100
294	1800				5026	@tail = @clade[ ( $i + 1 ) .. $#clade ];
295	1800				3650	last PARSE_TAIL;
296						}
297						elsif ( $i == 0 ) {
298	2029				5597	@tail = @clade;
299						}
300						}
301
302	3829	50	100		14594	if ( defined($tail[-1]) and $tail[-1] =~ /(\[.+\])$/ and scalar @tail != 1 ) {
			66
303	0				0	my $anno = $1;
304	0				0	$self->_logger->info("discarding branch comment $anno");
305	0				0	$tail[-1] =~ s/\Q$anno\E//;
306						}
307
308						# name only
309	3829	100			10808	if ( scalar @tail == 1 ) {
		100
		100
310	317				1375	$node->set_name( $tail[0] );
311						}
312						elsif ( scalar @tail == 2 ) {
313	245				711	$node->set_branch_length( $tail[-1] );
314						}
315						elsif ( scalar @tail == 3 ) {
316	3075				9968	$node->set_name( $tail[0] );
317	3075				9160	$node->set_branch_length( $tail[-1] );
318						}
319						}
320
321						sub _next_token {
322	16732			16732	26200	my ( $self, $string ) = @_;
323	16732				28268	$self->_logger->debug("tokenizing string '$string'");
324	16732				35811	my $ignore = $self->_args->{'-ignore_comments'};
325	16732				20910	my $QUOTED = 0;
326	16732				18947	my $COMMENTED = 0;
327	16732				19574	my $token = '';
328	16732				36024	my $TOKEN_DELIMITER = qr/[():,;]/;
329	16732				32404	TOKEN: for my $i ( 0 .. length($string) ) {
330	86182				140163	$token .= substr( $string, $i, 1 );
331	86182				174473	$self->_logger->debug("growing token: '$token'");
332
333						# detect apostrophe as ' between two letters
334	86182	100			187491	my $prev = $i > 0 ? substr( $string, $i-1, 1 ) : 0;
335	86182	100			161850	my $next = $i< length($string) ? substr( $string, $i+1, 1 ) : 0;
336	86182		33		175867	my $apostr = substr( $string, $i, 1 ) eq "'" && $prev=~/[a-z]/i && $next=~/[a-z]/i;
337	86182	50			134870	$self->_logger->debug("detected apostrophe") if $apostr;
338
339						# if -ignore_comments was specified the string can still contain comments
340						# that can contain token delimiters, so we still need to track
341						# whether we are inside a comment
342	86182	100	100		183907	if ( $ignore && $token =~ /\[$/ ) {
343	267				432	$COMMENTED++;
344						}
345	86182	100	100		167629	if ( $ignore && $token =~ /\]$/ ) {
346	267				429	$COMMENTED--;
347	267				704	next TOKEN;
348						}
349	85915	100	100		370369	if ( !$QUOTED && !$COMMENTED && $token =~ $TOKEN_DELIMITER ) {
			100
350	16628				23385	my $length = length($token);
351	16628	100			25717	if ( $length == 1 ) {
352	9486				19122	$self->_logger->debug("single char token: '$token'");
353	9486				44375	return $token, substr( $string, ( $i + 1 ) );
354						}
355						else {
356	7142				14669	$self->_logger->debug(
357						sprintf( "range token: %s",
358						substr( $token, 0, $length - 1 ) )
359						);
360	7142				48290	return substr( $token, 0, $length - 1 ),
361						substr( $token, $length - 1, 1 )
362						. substr( $string, ( $i + 1 ) );
363						}
364						}
365	69287	100	100		317260	if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq "'" && ! $apostr ) {
		100	100
			66
			66
			100
			66
			66
366	3				12	$QUOTED++;
367						}
368						elsif ($QUOTED && !$COMMENTED
369						&& substr( $string, $i, 1 ) eq "'"
370						&& substr( $string, $i, 2 ) ne "''" && ! $apostr)
371						{
372	3				11	$QUOTED--;
373						}
374						}
375						}
376
377						# podinherit_insert_token
378
379						=head1 SEE ALSO
380
381						There is a mailing list at L
382						for any user or developer questions and discussions.
383
384						=over
385
386						=item L
387
388						The newick parser is called by the L object.
389						Look there to learn how to parse newick strings.
390
391						=item L
392
393						Also see the manual: L and L.
394
395						=back
396
397						=head1 CITATION
398
399						If you use Bio::Phylo in published research, please cite it:
400
401						B, B, B, B
402						and B, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
403						I B<12>:63.
404						L
405
406						=cut
407
408						1;