File Coverage

blib/lib/Bio/Phylo/Parsers/Nexus.pm

Criterion	Covered	Total	%
statement	558	677	82.4
branch	195	282	69.1
condition	114	195	58.4
subroutine	45	53	84.9
pod			n/a
total	912	1207	75.5

line	stmt	bran	cond	sub	time	code
1						package Bio::Phylo::Parsers::Nexus;
2	6			6	39	use strict;
	6				12
	6				183
3	6			6	30	use base 'Bio::Phylo::Parsers::Abstract';
	6				12
	6				1757
4	6			6	54	use Bio::Phylo::Factory;
	6				13
	6				26
5	6			6	36	use Bio::Phylo::IO 'parse';
	6				14
	6				369
6	6			6	40	use Bio::Phylo::Util::CONSTANT qw':objecttypes looks_like_instance';
	6				15
	6				1318
7	6			6	42	use Bio::Phylo::Util::Exceptions 'throw';
	6				10
	6				3241
8
9						# TODO: handle mixed? distances, splits, bipartitions
10						my $TAXA = _TAXA_;
11						my $MATRIX = _MATRIX_;
12
13						# useful regular expressions
14						my $COMMENT = qr|^\[|; # crude, only checks first char, use after tokenizing!
15						my $QUOTES_OR_BRACKETS =
16						qr/[\[\]'"]/mox; # catch all for opening/closing square brackets and quotes
17						my $OPENING_QUOTE_OR_BRACKET =
18						qr/^(.*?)([\['"].*)$/mox; # capturing regex for opening sq. br. & q.
19
20						# this is a dispatch table whose sub references are invoked
21						# during parsing. the keys match the tokens upon which the
22						# respective subs are called. Underscored (private) fields are for parsing
23						# context. The fields of this table comprise the default state of the
24						# parser object.
25						my %defaults = (
26						'_lines' => undef,
27						'_current' => undef,
28						'_previous' => undef,
29						'_begin' => undef,
30						'_ntax' => undef,
31						'_nchar' => undef,
32						'_gap' => undef,
33						'_missing' => undef,
34						'_i' => undef,
35						'_tree' => undef,
36						'_trees' => undef,
37						'_treename' => undef,
38						'_treestart' => undef,
39						'_row' => undef,
40						'_matrixtype' => undef,
41						'_found' => 0,
42						'_linemode' => 0,
43						'_taxlabels' => [],
44						'_tokens' => [],
45						'_context' => [],
46						'_translate' => [],
47						'_symbols' => [],
48						'_charlabels' => [],
49						'_statelabels' => [],
50						'_charstatelabels' => [],
51						'_tmpstatelabels' => [],
52						'_comments' => [],
53						'_treenames' => [],
54						'_matrixrowlabels' => [],
55						'_matrix' => {},
56						'_charset' => {},
57						'_taxset' => {},
58						'begin' => \&_begin,
59						'taxa' => \&_taxa,
60						'title' => \&_title,
61						'dimensions' => \&_dimensions,
62						'ntax' => \&_ntax,
63						'taxlabels' => \&_taxlabels,
64						'blockid' => \&_blockid,
65						'data' => \&_data,
66						'characters' => \&_characters,
67						'codons' => \&_codons,
68						'nchar' => \&_nchar,
69						'format' => \&_format,
70						'datatype' => \&_datatype,
71						'matchchar' => \&_matchchar,
72						'gap' => \&_gap,
73						'missing' => \&_missing,
74						'charlabels' => \&_charlabels,
75						'statelabels' => \&_statelabels,
76						'charstatelabels' => \&_charstatelabels,
77						'symbols' => \&_symbols,
78						'items' => \&_items,
79						'matrix' => \&_matrix,
80						'charset' => \&_charset,
81						'taxset' => \&_taxset,
82						'trees' => \&_trees,
83						'translate' => \&_translate,
84						'tree' => \&_tree,
85						'utree' => \&_tree,
86						'end' => \&_end,
87						'endblock' => \&_end,
88						'#nexus' => \&_nexus,
89						'link' => \&_link,
90						';' => \&_semicolon,
91						'interleave' => \&_interleave,
92						);
93
94						=head1 NAME
95
96						Bio::Phylo::Parsers::Nexus - Parser used by Bio::Phylo::IO, no serviceable parts inside
97
98						=head1 DESCRIPTION
99
100						This module parses nexus files. It is called by the L<Bio::Phylo::IO> module,
101						there is no direct usage. The parser can handle files and strings with multiple
102						tree, taxon, and characters blocks whose links are defined using Mesquite's
103						"TITLE = 'some_name'" and "LINK TAXA = 'some_name'" tokens.
104
105						The parser returns a reference to an array containing one or more taxa, trees
106						and matrices objects. Nexus comments are stripped, private nexus blocks (and the
107						'assumptions' block) are skipped. It currently doesn't handle 'mixed' data.
108
109						=begin comment
110
111						Type : Constructor
112						Title : _new
113						Usage : my $nexus = Bio::Phylo::Parsers::Nexus->_new;
114						Function: Initializes a Bio::Phylo::Parsers::Nexus object.
115						Returns : A Bio::Phylo::Parsers::Nexus object.
116						Args : none.
117
118						=end comment
119
120						=cut
121
122						sub _process_defaults {
123	15			15	31	my $self = shift;
124	15				274	for my $key ( keys %defaults ) {
125	990	100			1717	if ( looks_like_instance( $defaults{$key}, 'ARRAY' ) ) {
		100
126	180				414	$self->{$key} = [];
127						}
128						elsif ( looks_like_instance( $defaults{$key}, 'HASH' ) ) {
129	45				130	$self->{$key} = {};
130						}
131						else {
132	765				1961	$self->{$key} = $defaults{$key};
133						}
134						}
135	15				75	return $self;
136						}
137
138						=begin comment
139
140						Type : Wrapper
141						Title : _from_handle(\*FH)
142						Usage : $nexus->_from_handle(\*FH);
143						Function: Does all the parser magic, from a file handle
144						Returns : ARRAY
145						Args : \*FH = file handle
146
147						=end comment
148
149						=cut
150
151						sub _parse {
152	15			15	37	my $self = shift;
153	15				63	$self->_process_defaults;
154	15				105	$self->_logger->info("going to parse nexus data");
155	15				68	$self->{'_lines'} = $self->_stringify(@_);
156	15				71	$self->{'_tokens'} = $self->_tokenize( $self->{'_lines'} );
157
158						# iterate over tokens, dispatch methods from %{ $self } table
159						# This is the meat of the parsing, from here everything else is called.
160	15				63	$self->_logger->info("tokenized and split data, going to parse blocks");
161	15				37	my $i = 0;
162	15				32	my $private_block;
163	15				50	my $token_queue = [ undef, undef, undef ];
164	6			6	47	no strict 'refs';
	6				16
	6				38463
165	15				32	TOKEN_LINE: for my $token_line ( @{ $self->{'_tokens'} } ) {
	15				54
166	313	100			644	if ( not $self->{'_linemode'} ) {
		50
167	265				341	RAW_TOKEN: for my $raw_token ( @{$token_line} ) {
	265				495
168	1060	100			4593	if ( $raw_token =~ qr/^\[/ ) {
169	10				16	push @{ $self->{'_comments'} }, $raw_token;
	10				29
170	10				22	next RAW_TOKEN;
171						}
172	1050				2212	my $lower_case_token = lc($raw_token);
173	1050				1795	push @$token_queue, $lower_case_token;
174	1050				1400	shift @$token_queue;
175	1050	100	66		3829	if ( exists $self->{$lower_case_token} and not $private_block )
		100	66
176						{
177	415	50			1258	if ( ref $self->{$lower_case_token} eq 'CODE' ) {
178	415				670	$self->{'_previous'} = $self->{'_current'};
179	415				598	$self->{'_current'} = $lower_case_token;
180
181						# pull code ref from dispatch table
182	415				584	my $c = $self->{$lower_case_token};
183
184						# invoke as object method
185	415				1418	$self->$c($raw_token);
186	414				982	next RAW_TOKEN;
187						}
188						}
189						elsif ( $self->{'_current'} and not $private_block ) {
190	634				1084	my $c = $self->{ $self->{'_current'} };
191	634				1211	$self->$c($raw_token);
192	634				1181	next RAW_TOKEN;
193						}
194
195						# $self->{'_begin'} is switched 'on' by &_begin(), and 'off'
196						# again by any one of the appropriate subsequent tokens, i.e.
197						# taxa, data, characters and trees
198	1	0	33		4	if ( $self->{'_begin'}
			33
199						and not exists $self->{$lower_case_token}
200						and not $private_block )
201						{
202	0				0	$private_block = $raw_token;
203	0				0	next RAW_TOKEN;
204						}
205
206						# jump over private block content
207	1	50	33		4	if ( $private_block
			33
208						and $token_queue->[-2] eq 'end'
209						and $token_queue->[-1] eq ';' )
210						{
211	0				0	$private_block = 0;
212	0				0	$self->_logger->info(
213						"Skipped private $private_block block");
214	0				0	next RAW_TOKEN;
215						}
216						else {
217	1				3	next RAW_TOKEN;
218						}
219						}
220						}
221						elsif ( $self->{'_linemode'} ) {
222	48				92	my $c = $self->{ $self->{'_current'} };
223	48				64	push @{$token_queue}, $token_line;
	48				86
224	48				67	shift @$token_queue;
225	48				108	$self->$c($token_line);
226	47				123	next TOKEN_LINE;
227						}
228						}
229	13				73	return $self->_post_process(@_);
230						}
231
232						# makes array reference of strings, one string per line, from input
233						# file handle or string;
234						sub _stringify {
235	15			15	33	my $self = shift;
236	15				49	$self->_logger->info("going to split nexus data on lines");
237	15				41	my %opts = @_;
238	15				34	my @lines;
239	15				74	my $handle = $self->_handle;
240	15				116	while (<$handle>) {
241	435				624	my $line = $_;
242	435				1607	push @lines, grep { /\S/ } split( /\n|\r|\r\n/, $line );
	379				1187
243	435				986	$self->_logger->debug("read line: $line");
244						}
245	15				65	return \@lines;
246						}
247
248						=begin comment
249
250						Type : Method
251						Title : _tokenize()
252						Usage : $nexus->_tokenize($lines);
253						Function: Tokenizes lines in $lines array ref
254						Returns : Two dimensional ARRAY
255						Args : An array ref of lines (e.g. read from an input file);
256						Comments: This method accepts an array ref holding lines that may contain
257						single quotes, double quotes or square brackets. Line breaks and
258						spaces inside these quoted/bracketed fragments are ignored, otherwise
259						it is split, e.g.:
260
261						[
262						[ '#NEXUS' ],
263						[ 'BEGIN TAXA; [taxablock comment]' ],
264						[ 'DIMENSIONS NTAX=3;' ],
265						[ 'TAXLABELS "Taxon \' A" \'Taxon B\' TAXON[comment]C' ],
266						...etc...
267						]
268
269						becomes:
270						[
271						[ '#NEXUS' ],
272						[
273						'BEGIN',
274						'TAXA',
275						';',
276						'[taxablock comment]'
277						],
278						[
279						'DIMENSIONS',
280						'NTAX',
281						'=',
282						'3',
283						';'
284						],
285						[
286						'TAXLABELS',
287						'"Taxon \' A"',
288						'\'Taxon B\'',
289						'TAXON',
290						'[comment]',
291						'C'
292						],
293						...etc...
294						]
295
296
297						=end comment
298
299						=cut
300
301						sub _tokenize {
302	15			15	129	my ( $self, $lines ) = @_;
303	15				50	$self->_logger->info("going to split lines on tokens");
304	15				50	my ( $extract, $INSIDE_QUOTE, $continue ) = ( '', 0, 0 );
305	15				77	my ( @tokens, @split );
306	15				97	my $CLOSING_BRACKET_MIDLINE = qr/^.*?(\])(.*)$/mox;
307	15				54	my $CONTEXT_QB_AT_START = qr/^([\['"])(.*)$/mox;
308	15				54	my $CONTEXT_CLOSER;
309						my $QuoteContext; # either " ' or [
310	15				0	my $QuoteStartLine;
311	15				33	my $LineCount = 0;
312	15				80	my %CLOSE_CHAR = (
313						'"' => '"',
314						"'" => "'",
315						'[' => ']',
316						);
317	15				257	my %INVERSE_CLOSE_CHAR = (
318						'"' => '"',
319						"'" => "'",
320						']' => '[',
321						')' => '(',
322						);
323
324						# tokenize
325	15				34	LINE: for my $line ( @{$lines} ) {
	15				46
326	379				495	$LineCount++;
327	379				1092	TOKEN: while ( $line =~ /\S/ ) {
328
329						# line in file has no quoting/bracketing characters, and
330						# is no extension of a quoted/bracketed fragment starting
331						# on a previous line
332	408	100	100		2543	if ( $line !~ $QUOTES_OR_BRACKETS && !$INSIDE_QUOTE ) {
		100	100
		100	66
		100	66
		50	33
333	332	100			549	if ($continue) {
334	20				31	push @{ $tokens[-1] }, $line;
	20				46
335	20				33	$continue = 0;
336						}
337						else {
338	312				580	push @tokens, [$line];
339						}
340	332				458	my $logline = join( ' ', @{ $tokens[-1] } );
	332				647
341	332				495	chomp($logline);
342	332				669	$self->_logger->debug("Tokenized line $LineCount: $logline");
343	332				646	next LINE;
344						}
345
346						# line in file has opening quoting/bracketing characters, and
347						# is no extension of a quoted/bracketed fragment starting
348						# on a previous line
349						elsif ( $line =~ $OPENING_QUOTE_OR_BRACKET && !$INSIDE_QUOTE ) {
350	20				96	my ( $start, $quoted ) = ( $1, $2 );
351	20				49	push @tokens, [$start];
352	20				34	$line = $quoted;
353	20				35	$extract = $quoted;
354	20				26	$INSIDE_QUOTE++;
355	20				37	$continue = 1;
356	20				59	$QuoteContext = substr( $quoted, 0, 1 );
357	20				55	$self->_logger->debug("Line $LineCount contains $QuoteContext");
358	20				36	$QuoteStartLine = $LineCount;
359	20				290	$CONTEXT_QB_AT_START = qr/^(\Q$QuoteContext\E)(.*)$/;
360	20				55	my $context_closer = $CLOSE_CHAR{$QuoteContext};
361	20				191	$CONTEXT_CLOSER = qr/^(.*?)(\Q$context_closer\E)(.*)$/;
362	20				91	next TOKEN;
363						}
364
365						# line in file has no quoting/bracketing characters, and
366						# is an extension of a quoted/bracketed fragment starting
367						# on a previous line
368						elsif ( $line !~ $CONTEXT_CLOSER && $INSIDE_QUOTE ) {
369	30				68	$self->_logger->debug(
370						"Line $LineCount extends quote or comment");
371	30				39	$extract .= $line;
372	30				45	next LINE;
373						}
374						elsif ( $line =~ $CONTEXT_QB_AT_START && $INSIDE_QUOTE ) {
375	20				171	my ( $q, $remainder ) = ( $1, $1 . $2 );
376	20	100	100		100	if ( $q eq '"' || $q eq "'" ) {
		50
377	9	50			151	if ( $remainder =~ m/^($q[^$q]*?$q)(.*)$/ ) {
		0
378	9				45	$self->_logger->debug(
379						"Line $LineCount closes $INVERSE_CLOSE_CHAR{$q} with $q"
380						);
381	9				16	push @{ $tokens[-1] }, ($1);
	9				59
382	9				28	$line = $2;
383	9				19	$INSIDE_QUOTE--;
384	9				43	next TOKEN;
385						}
386						elsif ( $remainder =~ m/^$q[^$q]*$/ ) {
387	0				0	$extract .= $line;
388	0				0	$continue = 1;
389	0				0	next LINE;
390						}
391						}
392						elsif ( $q eq '[' ) {
393	11				33	for my $i ( 1 .. length($line) ) {
394	830	100			1180	$INSIDE_QUOTE++ if substr( $line, $i, 1 ) eq '[';
395	830	100	66		1710	if ( $i and !$INSIDE_QUOTE ) {
396	8				11	push @{ $tokens[-1] }, substr( $line, 0, $i );
	8				34
397	8				19	my $logqc = substr( $line, ( $i - 1 ), 1 );
398	8				25	$self->_logger->debug(
399						"Line $LineCount closes $INVERSE_CLOSE_CHAR{$logqc} with $logqc"
400						);
401	8				16	$line = substr( $line, $i );
402	8				27	next TOKEN;
403						}
404	822	100			1254	$INSIDE_QUOTE-- if substr( $line, $i, 1 ) eq ']';
405						}
406	3				8	$extract = $line;
407	3				5	$continue = 1;
408	3				9	next LINE;
409						}
410						}
411						elsif ( $line =~ $CONTEXT_CLOSER && $INSIDE_QUOTE ) {
412	6				26	my ( $start, $q, $remainder ) = ( $1, $2, $3 );
413	6				18	$self->_logger->debug(
414						"Line $LineCount closes $INVERSE_CLOSE_CHAR{$q} with $q");
415	6	50			32	$start = $extract . $start if $continue;
416	6	50	33		31	if ( $q eq '"' or $q eq "'" ) {
		50
417	0				0	push @{ $tokens[-1] }, $start;
	0				0
418	0				0	$line = $remainder;
419	0				0	next TOKEN;
420						}
421						elsif ( $q eq ']' ) {
422	6				17	for my $i ( 0 .. length($line) ) {
423	414	100			576	$INSIDE_QUOTE++ if substr( $line, $i, 1 ) eq '[';
424	414	100	100		837	if ( $i and !$INSIDE_QUOTE ) {
425	3				7	my $segment = substr( $line, 0, $i );
426	3	50			8	if ($continue) {
427	3				6	push @{ $tokens[-1] }, $extract . $segment;
	3				14
428						}
429						else {
430	0				0	push @{ $tokens[-1] }, $segment;
	0				0
431						}
432	3				8	$line = substr( $line, $i );
433	3				9	next TOKEN;
434						}
435	411	100			603	$INSIDE_QUOTE-- if substr( $line, $i, 1 ) eq ']';
436						}
437	3	50			7	if ($continue) {
438	3				9	$extract .= $line;
439						}
440						else {
441	0				0	$extract = $line;
442						}
443	3				5	$continue = 1;
444	3				8	next LINE;
445						}
446						}
447						}
448						}
449
450						# an exception here means that an opening quote symbol " ' [
451						# ($QuoteContext) was encountered at input file/string line $QuoteStartLine.
452						# This can happen if any of these symbols is used in an illegal
453						# way, e.g. by using double quotes as gap symbols in matrices.
454	15	50			47	if ($INSIDE_QUOTE) {
455	0				0	throw 'BadArgs' =>
456						"Unbalanced $QuoteContext starting at line $QuoteStartLine";
457						}
458
459						# final split: non-quoted/bracketed fragments are split on whitespace,
460						# others are preserved verbatim
461						$self->_logger->info(
462	15				50	"going to split non-quoted/commented fragments on whitespace");
463	15				47	foreach my $line (@tokens) {
464	332				422	my @line;
465	332				480	foreach my $word (@$line) {
466	372	100			1155	if ( $word !~ $QUOTES_OR_BRACKETS ) {
467	352				1664	$word =~ s/(=|;|,)/ $1 /g;
468	352				1464	push @line, grep { /\S/ } split /\s+/, $word;
	1470				3265
469						}
470						else {
471	20				47	push @line, $word;
472						}
473						}
474	332				701	push @split, \@line;
475						}
476	15				175	return \@split;
477						}
478
479						# link matrices and forests to taxa
480						sub _post_process {
481	13			13	31	my $self = shift;
482	13				30	my $taxa = [];
483	13				35	foreach my $block ( @{ $self->{'_context'} } ) {
	13				54
484	26	100	33		132	if ( $block->_type == $TAXA ) {
		50
485	13				27	push @{$taxa}, $block;
	13				36
486						}
487						elsif ( $block->_type != $TAXA and $block->can('set_taxa') ) {
488	13	50	33		113	if ( $taxa->[-1]
			33
489						and $taxa->[-1]->can('_type') == $TAXA
490						and not $block->get_taxa )
491						{
492	0				0	$block->set_taxa( $taxa->[-1] ); # XXX exception here?
493						}
494						}
495						}
496	13				37	my $blocks = $self->{'_context'};
497
498						# initialize object, note we have to
499						# force data type references to be empty
500	13				31	@{$taxa} = ();
	13				36
501	13				313	for my $key ( keys %defaults ) {
502	858	100			1487	if ( looks_like_instance( $defaults{$key}, 'ARRAY' ) ) {
		100
503	156				583	$self->{$key} = [];
504						}
505						elsif ( looks_like_instance( $defaults{$key}, 'HASH' ) ) {
506	39				105	$self->{$key} = {};
507						}
508						else {
509	663				1235	$self->{$key} = $defaults{$key};
510						}
511						}
512	13				56	return @{$blocks};
	13				104
513						}
514
515						=begin comment
516
517						The following subs are called by the dispatch table stored in the object when
518						their respective tokens are encountered.
519
520						=end comment
521
522						=cut
523
524						sub _nexus {
525	14			14	43	my $self = shift;
526	14	50			63	if ( uc( $_[0] ) eq '#NEXUS' ) {
527	14				46	$self->_logger->info("found nexus token");
528						}
529						}
530
531						sub _begin {
532	43			43	77	my $self = shift;
533	43				90	$self->{'_begin'} = 1;
534						}
535
536						sub _taxa {
537	18			18	37	my $self = shift;
538	18	100			55	if ( $self->{'_begin'} ) {
539	13				74	my $taxa = $self->_factory->create_taxa;
540	13				32	push @{ $self->{'_context'} }, $taxa;
	13				77
541	13				56	$self->_logger->info("starting taxa block");
542	13				35	$self->{'_begin'} = 0;
543						}
544						else {
545	5				19	$self->{'_current'} = 'link'; # because of 'link taxa = blah' construct
546						}
547						}
548
549						sub _charset {
550	17			17	23	my $self = shift;
551	17				19	my $token = shift;
552
553						# first thing after the CHARSET token is the set name
554	17	100	100		124	if ( $token !~ /CHARSET/i && ! $self->{'_charset'}->{'name'} ) {
		100	100
		100	100
		100
555	2				8	$self->{'_charset'}->{'name'} = $token;
556	2				8	$self->{'_charset'}->{'range'} = [];
557						}
558
559						# then there might be a mesquite-style matrix reference, e.g. (CHARACTERS = matrix_name)
560						elsif ( $token =~ m/^\(/ ) {
561	1				4	$self->{'_charset'}->{'matrix'} = '';
562						}
563						elsif ( defined $self->{'_charset'}->{'matrix'} && ! $self->{'_charset'}->{'matrix'} && $token !~ /(?:\(?CHARACTERS|=)/i ) {
564	1				6	$token =~ s/\)$//;
565	1				4	$self->{'_charset'}->{'matrix'} = $token;
566						}
567
568						# then come the indices
569						elsif ( $token =~ /(?:\d+|-)/ ) {
570	8				11	push @{ $self->{'_charset'}->{'range'} }, $token;
	8				18
571						}
572						}
573
574						sub _taxset {
575	19			19	24	my $self = shift;
576	19				23	my $token = shift;
577
578						# first thing after the TAXSET token is the set name
579	19	100	100		141	if ( $token !~ /TAXSET/i && ! $self->{'_taxset'}->{'name'} ) {
		100	100
		100	100
		100
580	2				6	$self->{'_taxset'}->{'name'} = $token;
581	2				6	$self->{'_taxset'}->{'range'} = [];
582						}
583
584						# then there might be a mesquite-style taxa reference, e.g. (TAXA = matrix_name)
585						elsif ( $token =~ m/^\(/ ) {
586	1				3	$self->{'_taxset'}->{'taxa'} = '';
587						}
588						elsif ( defined $self->{'_taxset'}->{'taxa'} && ! $self->{'_taxset'}->{'taxa'} && $token !~ /(?:\(?TAXA|=)/ ) {
589	1				8	$token =~ s/\)$//;
590	1				6	$self->{'_taxset'}->{'taxa'} = $token;
591						}
592
593						# then come the indices
594						elsif ( $token =~ /(?:\d+|-)/ ) {
595	10				14	push @{ $self->{'_taxset'}->{'range'} }, $token;
	10				21
596						}
597						}
598
599						sub _interleave {
600	0			0	0	my $self = shift;
601	0				0	my $token = shift;
602	0				0	$self->_logger->info("perhaps we'll need to parse interleaved");
603	0	0	0		0	if ( defined $token and uc($token) eq 'NO' ) {
604	0				0	$self->_logger->info("no, we don't need to parse interleaved");
605						}
606						}
607
608						sub _title {
609	25			25	47	my $self = shift;
610	25				50	my $token = shift;
611	25	100	66		127	if ( defined $token and uc($token) ne 'TITLE' ) {
612	11				22	my $title = $token;
613	11	50			47	if ( not $self->_current->get_name ) {
614	11				35	$self->_current->set_name($title);
615	11				45	$self->_logger->info("block has title '$title'");
616						}
617						}
618						}
619
620						sub _link {
621	2			2	6	my $self = shift;
622	2				7	my $token = shift;
623	2	50	33		34	if ( defined $token and $token !~ m/^(?:LINK|TAXA|=)$/i ) {
624	0				0	my $link = $token;
625	0	0			0	if ( not $self->_current->get_taxa ) {
626	0				0	foreach my $block ( @{ $self->{'_context'} } ) {
	0				0
627	0	0	0		0	if ( $block->get_name and $block->get_name eq $link ) {
628	0				0	$self->_current->set_taxa($block);
629	0				0	last;
630						}
631						}
632						$self->_logger->info(
633	0				0	"block links to taxa block with title '$link'");
634						}
635						}
636						}
637
638				22		sub _dimensions {
639
640						#my $self = shift;
641						}
642
643						sub _ntax {
644	39			39	60	my $self = shift;
645	39	100	66		213	if ( defined $_[0] and $_[0] =~ m/^\d+$/ ) {
646	13				39	$self->{'_ntax'} = shift;
647	13				34	my $ntax = $self->{'_ntax'};
648	13				49	$self->_logger->info("number of taxa: $ntax");
649						}
650						}
651
652						sub _taxlabels {
653	72			72	97	my $self = shift;
654	72	100	66		284	if ( defined $_[0] and uc( $_[0] ) ne 'TAXLABELS' ) {
		50	33
655	59				88	my $taxon = shift;
656	59				127	$self->_logger->debug("taxon: $taxon");
657	59				87	push @{ $self->{'_taxlabels'} }, $taxon;
	59				120
658						}
659						elsif ( defined $_[0] and uc( $_[0] ) eq 'TAXLABELS' ) {
660						$self->_current->set_generic(
661	13				56	'nexus_comments' => $self->{'_comments'} );
662	13				34	$self->{'_comments'} = [];
663	13				44	$self->_logger->info("starting taxlabels");
664						}
665						}
666
667						sub _blockid {
668	0			0	0	my $self = shift;
669	0	0	0		0	if ( defined $_[0] and uc( $_[0] ) ne 'BLOCKID' ) {
670	0				0	my $blockid = shift;
671	0				0	$self->_logger->debug("blockid: $blockid");
672	0				0	$self->_current->set_generic( 'blockid' => $blockid );
673						}
674						}
675
676						sub _data {
677	0			0	0	my $self = shift;
678	0	0			0	if ( $self->{'_begin'} ) {
679	0				0	$self->{'_begin'} = 0;
680	0				0	push @{ $self->{'_context'} }, $self->_factory->create_matrix;
	0				0
681	0				0	$self->_logger->info("starting data block");
682						}
683						}
684
685						sub _characters {
686	9			9	22	my $self = shift;
687	9	50			39	if ( $self->{'_begin'} ) {
688	9				23	$self->{'_begin'} = 0;
689	9				20	push @{ $self->{'_context'} }, $self->_factory->create_matrix;
	9				44
690	9				53	$self->_logger->info("starting characters block");
691						}
692						}
693
694						sub _nchar {
695	27			27	43	my $self = shift;
696	27	100	66		160	if ( defined $_[0] and $_[0] =~ m/^\d+$/ ) {
697	9				26	$self->{'_nchar'} = shift;
698	9				23	my $nchar = $self->{'_nchar'};
699	9				34	$self->_logger->info("number of characters: $nchar");
700						}
701						}
702
703				9		sub _format {
704
705						#my $self = shift;
706						}
707
708						sub _datatype {
709	27			27	41	my $self = shift;
710	27	100	66		141	if ( defined $_[0] and $_[0] !~ m/^(?:DATATYPE|=)/i ) {
711	9				27	my $datatype = shift;
712	9				34	$self->_current->set_type($datatype);
713	9				38	$self->_logger->info("datatype: $datatype");
714						}
715						}
716
717						sub _matchchar {
718	0			0	0	my $self = shift;
719	0	0	0		0	if ( defined $_[0] and $_[0] !~ m/^(?:MATCHCHAR|=)/i ) {
720	0				0	my $matchchar = shift;
721	0				0	$self->_current->set_matchchar($matchchar);
722	0				0	$self->_logger->info("matchchar: $matchchar");
723						}
724						}
725
726				0		sub _items {
727
728						#my $self = shift;
729						}
730
731						sub _gap {
732	27			27	48	my $self = shift;
733	27	100	66		138	if ( $_[0] !~ m/^(?:GAP|=)/i and !$self->{'_gap'} ) {
734	9				24	$self->{'_gap'} = shift;
735	9				25	my $gap = $self->{'_gap'};
736	9				31	$self->_current->set_gap($gap);
737	9				30	$self->_logger->info("gap character: $gap");
738	9				33	undef $self->{'_gap'};
739						}
740						}
741
742						sub _missing {
743	27			27	47	my $self = shift;
744	27	100	66		146	if ( $_[0] !~ m/^(?:MISSING|=)/i and !$self->{'_missing'} ) {
745	9				23	$self->{'_missing'} = shift;
746	9				22	my $missing = $self->{'_missing'};
747	9				31	$self->_current->set_missing($missing);
748	9				32	$self->_logger->info("missing character: $missing");
749	9				25	undef $self->{'_missing'};
750						}
751						}
752
753						sub _symbols {
754	15			15	25	my $self = shift;
755	15	100	66		96	if ( $_[0] !~ m/^(?:SYMBOLS|=)$/i and $_[0] =~ m/^"?(.+)"?$/ ) {
756	5				31	my $sym = $1;
757	5				23	$sym =~ s/"//g;
758	5				48	my @syms = grep { /\S+/ } split /\s+/, $sym;
	17				60
759	5				14	push @{ $self->{'_symbols'} }, @syms;
	5				22
760	5				22	$self->_logger->debug("recorded character state symbols '@syms'");
761						}
762						}
763
764						sub _charlabels {
765	0			0	0	my $self = shift;
766	0	0	0		0	if ( defined $_[0] and uc $_[0] ne 'CHARLABELS' ) {
767	0				0	push @{ $self->{'_charlabels'} }, shift;
	0				0
768						}
769						}
770
771						sub _charstatelabels {
772	18			18	30	my $self = shift;
773	18				31	my $token = shift;
774	18				48	$self->_logger->debug($token);
775	18	100	66		77	if ( defined $token and uc $token ne 'CHARSTATELABELS' ) {
776	16				29	push @{ $self->{'_charstatelabels'} }, $token;
	16				54
777						}
778						}
779
780						sub _statelabels {
781	0			0	0	my $self = shift;
782	0				0	my $token = shift;
783	0	0	0		0	if ( defined $token and uc $token ne 'STATELABELS' ) {
784	0	0			0	if ( $token eq ',' ) {
785	0				0	my $tmpstatelabels = $self->{'_tmpstatelabels'};
786	0				0	my $index = shift @{$tmpstatelabels};
	0				0
787	0				0	$self->{'_statelabels'}->[ $index - 1 ] = $tmpstatelabels;
788	0				0	$self->{'_tmpstatelabels'} = [];
789						}
790						else {
791	0				0	push @{ $self->{'_tmpstatelabels'} }, $token;
	0				0
792						}
793						}
794						}
795
796						# for data type, character labels, state labels
797						sub _add_matrix_metadata {
798	57			57	75	my $self = shift;
799	57				127	$self->_logger->info("adding matrix metadata");
800	57	100			133	if ( not defined $self->{'_matrixtype'} ) {
801	9				103	$self->{'_matrixtype'} = $self->_current->get_type;
802	9	50			19	if ( @{ $self->{'_charlabels'} } ) {
	9				35
803	0				0	$self->_current->set_charlabels( $self->{'_charlabels'} );
804	0				0	$self->_logger->debug("adding character labels");
805						}
806	9	50			17	if ( @{ $self->{'_statelabels'} } ) {
	9				34
807	0				0	$self->_current->set_statelabels( $self->{'_statelabels'} );
808	0				0	$self->_logger->debug("adding state labels");
809						}
810	9	50			17	if ( my @symbols = @{ $self->{'_symbols'} } ) {
	9				42
811	0				0	$self->_logger->debug("updating state lookup table");
812	0				0	my $to = $self->_current->get_type_object;
813	0				0	my $lookup = $to->get_lookup;
814	0	0			0	if ($lookup) {
815	0				0	for my $sym (@symbols) {
816	0	0			0	if ( not exists $lookup->{$sym} ) {
817	0				0	$lookup->{$sym} = [$sym];
818						}
819						}
820						}
821						}
822						}
823	57				83	return $self;
824						}
825
826						sub _add_tokens_to_row {
827	48			48	85	my ( $self, $tokens ) = @_;
828	48				68	my $rowname;
829	48				64	for my $token ( @{$tokens} ) {
	48				100
830	90				200	$self->_logger->debug("token: $token");
831	90	100			290	last if $token eq ';';
832
833						# mesquite sometimes writes multiline (but not interleaved)
834						# matrix rows (harrumph).
835	81	100	66		533	if ( not defined $rowname and $token !~ $COMMENT ) {
		50	33
836	39				54	my $taxa;
837	39	50			81	if ( $taxa = $self->_current->get_taxa ) {
		50
838	0	0			0	if ( my $taxon = $taxa->get_by_name($token) ) {
839	0				0	$rowname = $token;
840						}
841						else {
842	0				0	$rowname = $self->{'_matrixrowlabels'}->[-1];
843						}
844						}
845						elsif ( $taxa = $self->_find_last_seen_taxa_block ) {
846	39	50			144	if ( my $taxon = $taxa->get_by_name($token) ) {
847	39				70	$rowname = $token;
848						}
849						else {
850	0				0	$rowname = $self->{'_matrixrowlabels'}->[-1];
851						}
852						}
853						else {
854	0				0	$rowname = $token;
855						}
856	39	50			99	if ( not exists $self->{'_matrix'}->{$rowname} ) {
857	39				103	$self->{'_matrix'}->{$rowname} = [];
858	39				57	push @{ $self->{'_matrixrowlabels'} }, $rowname;
	39				104
859						}
860						}
861						elsif ( defined $rowname and $token !~ $COMMENT ) {
862	42				98	my $row = $self->{'_matrix'}->{$rowname};
863	42	100			84	if ( $self->{'_matrixtype'} =~ m/^continuous$/i ) {
864	6				8	push @{$row}, split( /\s+/, $token );
	6				21
865						}
866						else {
867	36				54	push @{$row}, split( //, $token );
	36				213
868						}
869						}
870						}
871						}
872
873						sub _find_last_seen_taxa_block {
874	85			85	152	my $self = shift;
875	85				135	my $name = shift;
876	85				118	for ( my $i = $#{ $self->{'_context'} } ; $i >= 0 ; $i-- ) {
	85				313
877	180	100			556	if ( $self->{'_context'}->[$i]->_type == $TAXA ) {
878	84	100			171	if ( $name ) {
879	2	100			7	if ( $self->{'_context'}->[$i]->get_name eq $name ) {
880	1				3	return $self->{'_context'}->[$i];
881						}
882						}
883						else {
884	82				279	return $self->{'_context'}->[$i];
885						}
886						}
887						}
888	2				7	return;
889						}
890
891						sub _find_last_seen_matrix {
892	4			4	10	my $self = shift;
893	4				12	my $name = shift;
894	4				10	for ( my $i = $#{ $self->{'_context'} } ; $i >= 0 ; $i-- ) {
	4				23
895	5	50			28	if ( $self->{'_context'}->[$i]->_type == $MATRIX ) {
896	5	100			16	if ( $name ) {
897	2	100			7	if ( $self->{'_context'}->[$i]->get_name eq $name ) {
898	1				5	return $self->{'_context'}->[$i];
899						}
900						}
901						else {
902	3				18	return $self->{'_context'}->[$i];
903						}
904						}
905						}
906	0				0	return;
907						}
908
909						sub _set_taxon {
910	74			74	154	my ( $self, $obj, $taxa ) = @_;
911
912						# first case: a taxon by $obj's name already exists
913	74	50			199	if ( my $taxon = $taxa->get_by_name( $obj->get_name ) ) {
914	74				248	$obj->set_taxon($taxon);
915	74				163	return $self;
916						}
917
918						# second case: no taxon by $obj's name exists yet
919						else {
920	0				0	my $taxon = $self->_factory->create_taxon( '-name' => $obj->get_name );
921	0				0	$taxa->insert($taxon);
922	0				0	$obj->set_taxon($taxon);
923	0				0	return $self;
924						}
925						}
926
927						sub _resolve_taxon {
928	132			132	219	my ( $self, $obj ) = @_;
929	132				263	my $container = $self->_current;
930
931						# first case: the object is actually already
932						# linked to a taxon
933	132	100			321	if ( my $taxon = $obj->get_taxon ) {
934	88				330	return $self;
935						}
936
937						# second case: the container is already linked
938						# to a taxa block, but the object isn't
939	44	100			198	if ( my $taxa = $container->get_taxa ) {
940	30				81	$self->_set_taxon( $obj, $taxa );
941						}
942
943						# third case: the container isn't explicitly linked,
944						# but a taxa block has been seen
945	44	100			155	if ( my $taxa = $self->_find_last_seen_taxa_block ) {
946	42				213	$container->set_taxa($taxa);
947	42				127	$self->_set_taxon( $obj, $taxa );
948						}
949
950						# final case: no taxa block exists
951						else {
952	2				10	my $taxa = $container->make_taxa;
953	2				5	pop @{ $self->{'_context'} };
	2				9
954	2				6	push @{ $self->{'_context'} }, $taxa, $container;
	2				6
955	2				11	$self->_set_taxon( $obj, $taxa );
956						}
957						}
958
959						sub _resolve_ambig {
960	39			39	92	my ( $self, $datum, $chars ) = @_;
961	39				149	my %brackets = (
962						'(' => ')',
963						'{' => '}',
964						);
965	39				111	my $to = $datum->get_type_object;
966	39				65	my @resolved;
967	39				64	my $in_set = 0;
968	39				73	my @set;
969						my $close;
970	39				57	for my $c ( @{$chars} ) {
	39				95
971
972	228	50	33		576	if ( not $in_set and not exists $brackets{$c} ) {
		0	0
		0	0
		0	0
973	228	50			487	push @resolved, $c if defined $c;
974						}
975						elsif ( not $in_set and exists $brackets{$c} ) {
976	0				0	$in_set++;
977	0				0	$close = $brackets{$c};
978						}
979						elsif ( $in_set and $c ne $close ) {
980	0				0	push @set, $c;
981						}
982						elsif ( $in_set and $c eq $close ) {
983	0				0	push @resolved, $to->get_symbol_for_states(@set);
984	0				0	@set = ();
985	0				0	$in_set = 0;
986	0				0	$close = undef;
987						}
988						}
989	39				120	return \@resolved;
990						}
991
992						sub _codons {
993	0			0	0	my $self = shift;
994	0				0	$self->_logger->info(shift);
995	0	0			0	if ( $self->{'_begin'} ) {
996	0				0	$self->{'_begin'} = 0;
997						}
998						}
999
1000						sub _matrix {
1001	57			57	88	my $self = shift;
1002	57				83	my $token = shift;
1003	57				151	$self->_add_matrix_metadata;
1004
1005						# first token: 'MATRIX', i.e. we're just starting to parse
1006						# the actual matrix. Here we need to switch to "linemode",
1007						# so that subsequently tokens will be array references (all
1008						# the tokens on a line). This is so that we can handle
1009						# interleaved matrices, which unfortunately need line breaks
1010						# in them.
1011	57	100	66		144	if ( not looks_like_instance( $token, 'ARRAY' ) and uc($token) eq 'MATRIX' )
		100	66
		50	33
1012						{
1013	9				30	$self->{'_linemode'} = 1;
1014	9				33	$self->_logger->info("starting matrix");
1015	9				19	return;
1016						}
1017
1018						# a row inside the matrix, after adding tokens to row, nothing
1019						# else to do
1020						elsif ( looks_like_instance( $token, 'ARRAY' )
1021	90				309	and not grep { /^;$/ } @{$token} )
	48				100
1022						{
1023	39				125	$self->_add_tokens_to_row($token);
1024	39				103	$self->_logger->info("adding tokens to row");
1025	39				71	return;
1026						}
1027
1028						# the last row of the matrix, after adding tokens to row,
1029						# instantiate & populate datum objects, link against taxa
1030						# objects
1031						elsif ( looks_like_instance( $token, 'ARRAY' )
1032	9				54	and grep { /^;$/ } @{$token} )
	9				32
1033						{
1034	9				37	$self->_add_tokens_to_row($token);
1035
1036						# link to taxa
1037	9				28	for my $row ( @{ $self->{'_matrixrowlabels'} } ) {
	9				35
1038
1039						# create new datum
1040	39				125	my $datum = $self->_factory->create_datum(
1041						'-type_object' => $self->_current->get_type_object,
1042						'-name' => $row,
1043						);
1044						my $char =
1045	39				200	$self->_resolve_ambig( $datum, $self->{'_matrix'}->{$row} );
1046	39				162	$datum->set_char($char);
1047
1048						# insert new datum in matrix
1049	39				107	$self->_current->insert($datum);
1050
1051						# link to taxon
1052	39				144	$self->_resolve_taxon($datum);
1053	39				168	my ( $length, $seq ) = ( $datum->get_length, $datum->get_char );
1054	39				149	$self->_logger->info("parsed $length characters for ${row}: $seq");
1055						}
1056
1057						# Let's avoid these!
1058	9	100	33		38	if ( $self->_current->get_nchar != $self->{'_nchar'} ) {
		50
1059						my ( $obs, $exp ) =
1060	1				4	( $self->_current->get_nchar, $self->{'_nchar'} );
1061	1				7	_bad_format("Observed and expected nchar mismatch: $obs vs. $exp");
1062						}
1063
1064						# ntax is only defined for "data" blocks (which have ntax token),
1065						# not for "characters" blocks (which should match up with taxa block)
1066						elsif ( defined $self->{'_ntax'}
1067						and $self->_current->get_ntax != $self->{'_ntax'} )
1068						{
1069	0				0	my ( $obs, $exp ) = ( $self->_current->get_ntax, $self->{'_ntax'} );
1070	0				0	_bad_format("Observed and expected ntax mismatch: $obs vs. $exp");
1071						}
1072
1073						# XXX matrix clean up here
1074	8				24	$self->{'_ntax'} = undef;
1075	8				19	$self->{'_nchar'} = undef;
1076	8				24	$self->{'_matrixtype'} = undef;
1077	8				57	$self->{'_matrix'} = {};
1078	8				28	$self->{'_matrixrowlabels'} = [];
1079	8				26	$self->{'_linemode'} = 0;
1080						}
1081						}
1082
1083						sub _bad_format {
1084	2			2	15	throw 'BadFormat' => shift;
1085						}
1086	408			408	1569	sub _current { shift->{'_context'}->[-1] }
1087
1088						sub _trees {
1089	5			5	14	my $self = shift;
1090	5	50			19	if ( $self->{'_begin'} ) {
1091	5				13	$self->{'_begin'} = 0;
1092	5				14	$self->{'_trees'} = '';
1093	5				14	$self->{'_treenames'} = [];
1094	5				9	push @{ $self->{'_context'} }, $self->_factory->create_forest;
	5				27
1095	5				28	$self->_logger->info("starting trees block");
1096						}
1097						}
1098
1099						sub _translate {
1100	171			171	240	my $self = shift;
1101	171				234	my $i = $self->{'_i'};
1102	171	100	100		404	if ( $i && $i == 1 )
1103						{ # actually, $i can be 0 according to BayesPhylogenies translation table
1104	5				20	$self->_logger->info("starting translation table");
1105						}
1106	171	100	100		971	if ( !defined($i) && $_[0] =~ m/^\d+$/ ) {
		100	66
			33
			33
1107	57				95	$self->{'_i'} = shift;
1108	57				141	$self->{'_translate'}->[ $self->{'_i'} ] = undef;
1109						}
1110						elsif (defined($i)
1111						&& exists $self->{'_translate'}->[$i]
1112						&& !defined $self->{'_translate'}->[$i]
1113						&& $_[0] ne ';' )
1114						{
1115	57				96	$self->{'_translate'}->[$i] = $_[0];
1116	57				144	$self->_logger->debug("Translation: $i => $_[0]");
1117	57				102	$self->{'_i'} = undef;
1118						}
1119						}
1120
1121						sub _tree {
1122	212			212	299	my $self = shift;
1123	212	100	100		470	if ( not $self->{'_treename'} and $_[0] !~ m/^(U?TREE\|\*)$/i ) {
1124	13				28	$self->{'_treename'} = $_[0];
1125						}
1126	212	100	66		424	if ( $_[0] eq '=' and not $self->{'_treestart'} ) {
1127	13				164	$self->{'_treestart'} = 1;
1128						}
1129	212	100	100		578	if ( $_[0] ne '=' and $self->{'_treestart'} ) {
1130	173				260	$self->{'_tree'} .= $_[0];
1131						}
1132
1133						# tr/// returns # of replacements, hence can be used to check
1134						# tree description is balanced
1135	212	100	100		847	if ( $self->{'_treestart'}
			100
1136						and $self->{'_tree'}
1137						and $self->{'_tree'} =~ tr/(/(/ == $self->{'_tree'} =~ tr/)/)/ )
1138						{
1139	13				25	my $translated = $self->{'_tree'};
1140	13				25	my $translate = $self->{'_translate'};
1141	13	100			33	my $start =
1142						exists $translate->[0]
1143						? 0
1144						: 1; # BayesPhylogenies starts translation table w. 0
1145	13				26	for my $i ( $start .. $#{$translate} ) {
	13				45
1146	93				2415	$translated =~ s/($\|,)$i(,\|$\|:)/$1$translate->[$i]$2/;
1147						}
1148						my ( $logtreename, $logtree ) =
1149	13				60	( $self->{'_treename'}, $self->{'_tree'} );
1150	13				57	$self->_logger->info("tree: $logtreename string: $logtree");
1151	13				55	$self->{'_trees'} .= $translated . ';';
1152	13				28	push @{ $self->{'_treenames'} }, $self->{'_treename'};
	13				36
1153
1154						# XXX tree cleanup here
1155	13				28	$self->{'_treestart'} = 0;
1156	13				24	$self->{'_tree'} = undef;
1157	13				36	$self->{'_treename'} = undef;
1158						}
1159						}
1160
1161						sub _end {
1162	33			33	73	my $self = shift;
1163	33				89	$self->{'_translate'} = [];
1164	33	100	100		190	if ( uc $self->{'_previous'} eq ';' and $self->{'_trees'} ) {
1165	5				39	my $forest = $self->_current;
1166						my $trees = parse(
1167						'-format' => 'newick',
1168	5				51	'-string' => $self->{'_trees'},
1169						'-as_project' => 0
1170						);
1171	5				52	for my $tree ( @{ $trees->get_entities } ) {
	5				28
1172	13				44	$forest->insert($tree);
1173						}
1174
1175						# set tree names
1176	5				78	for my $i ( 0 .. $#{ $self->{'_treenames'} } ) {
	5				32
1177	13				70	$forest->get_by_index($i)->set_name( $self->{'_treenames'}->[$i] );
1178						}
1179
1180						# link tips to taxa
1181	5				13	for my $tree ( @{ $forest->get_entities } ) {
	5				20
1182	13				28	for my $tip ( @{ $tree->get_terminals } ) {
	13				67
1183	93				185	$self->_resolve_taxon($tip);
1184						}
1185						}
1186
1187						# XXX trees cleanup here
1188	5				32	$self->{'_trees'} = '';
1189	5				83	$self->{'_treenames'} = [];
1190						}
1191						}
1192
1193						sub _semicolon {
1194	189			189	290	my $self = shift;
1195	189	50			1392	if ( uc $self->{'_previous'} eq 'MATRIX' ) {
		100
		100
		100
		100
		100
		50
		50
1196	0				0	$self->{'_matrixtype'} = undef;
1197	0				0	$self->{'_matrix'} = {};
1198	0				0	$self->{'_charlabels'} = [];
1199	0				0	$self->{'_statelabels'} = [];
1200	0				0	$self->{'_linemode'} = 0;
1201	0	0			0	if ( not $self->_current->get_ntax ) {
1202	0				0	my $taxon = {};
1203	0				0	foreach my $row ( @{ $self->_current->get_entities } ) {
	0				0
1204	0				0	$taxon->{ $row->get_taxon }++;
1205						}
1206	0				0	my $ntax = scalar keys %{$taxon};
	0				0
1207						}
1208						}
1209
1210						# finalize character set
1211						elsif ( uc $self->{'_previous'} eq 'CHARSET' ) {
1212	2				13	my $matrix = $self->_find_last_seen_matrix( $self->{'_charset'}->{'matrix'} );
1213	2				7	my $characters = $matrix->get_characters;
1214	2				7	my $set = $self->_factory->create_set( '-name' => $self->{'_charset'}->{'name'} );
1215	2				20	$characters->add_set($set);
1216	2				5	my $range = $self->{'_charset'}->{'range'};
1217	2				4	my @range;
1218	2	50			9	if ( ref($range) eq 'ARRAY' ) {
1219	2				5	while ( @{ $range } ) {
	6				12
1220	4				6	my $index = shift @{ $range };
	4				7
1221	4	100	66		17	if ( $range->[0] && $range->[0] eq '-' ) {
1222	2				4	shift @{ $range };
	2				4
1223	2				3	my $end = shift @{ $range };
	2				5
1224	2				10	push @range, ( $index - 1 ) .. ( $end - 1 );
1225						}
1226						else {
1227	2				6	push @range, ( $index - 1 );
1228						}
1229						}
1230	2				6	for my $i ( @range ) {
1231	6				19	my $character = $characters->get_by_index($i);
1232	6	50			11	if ( $character ) {
1233	6				20	$characters->add_to_set($character,$set);
1234						}
1235						else {
1236	0				0	throw 'API' => "No character at index $i";
1237						}
1238						}
1239						}
1240	2				8	$self->{'_charset'} = {};
1241						}
1242
1243						# finalize character state labels
1244						elsif ( uc $self->{'_previous'} eq 'CHARSTATELABELS' ) {
1245	2				21	my $matrix = $self->_find_last_seen_matrix;
1246	2				6	my @labels = @{ $self->{'_charstatelabels'} };
	2				12
1247	2	100			19	if ( $matrix->get_type =~ m/continuous/i ) {
1248	1				4	my @charlabels;
1249	1				5	my $charnum = 1;
1250	1				5	while (@labels) {
1251
1252						# expecting an index at the beginning of the statement
1253	2				6	my $index = shift @labels;
1254	2	50			10	$index != $charnum && _bad_format( "Expecting character number $charnum, observed $index in CHARSTATELABELS" );
1255
1256						# then the character label
1257	2				6	push @charlabels, shift @labels;
1258
1259						# then a comma
1260	2	100			6	if ( @labels ) {
1261	1	50			6	$labels[0] eq ',' ? shift @labels : _bad_format( "Expecting , observed $labels[0] in CHARSTATELABELS" );
1262						}
1263	2				8	$charnum++;
1264						}
1265	1				10	$matrix->set_charlabels(\@charlabels);
1266	1				5	$self->{'_charstatelabels'} = [];
1267						}
1268						else {
1269	1				5	my ( @charlabels, @statelabels );
1270	1				3	my $charnum = 1;
1271	1				5	while (@labels) {
1272
1273						# expecting an index at the beginning of the statement
1274	2				6	my $index = shift @labels;
1275	2	50			10	$index != $charnum && _bad_format( "Expecting character number $charnum, observed $index in CHARSTATELABELS" );
1276
1277						# then the character label
1278	2				7	push @charlabels, shift @labels;
1279
1280						# then a forward slash
1281	2				5	my $slash = shift @labels;
1282	2	50			8	$slash ne '/' && _bad_format( "Expecting /, observed $slash in CHARSTATELABELS" );
1283
1284						# then a list of state labels
1285	2				6	my @stateset;
1286	2		100		27	push @stateset, shift @labels while(@labels and $labels[0] ne ',');
1287	2				7	push @statelabels, \@stateset;
1288
1289						# then a comma
1290	2	100			8	if ( @labels ) {
1291	1	50			6	$labels[0] eq ',' ? shift @labels : _bad_format( "Expecting , observed $labels[0] in CHARSTATELABELS" );
1292						}
1293	2				8	$charnum++;
1294						}
1295	1				15	$matrix->set_charlabels(\@charlabels);
1296	1				4	$matrix->set_statelabels(\@statelabels);
1297	1				5	$self->{'_charstatelabels'} = [];
1298						}
1299						}
1300
1301						# finalize taxon set
1302						elsif ( uc $self->{'_previous'} eq 'TAXSET' ) {
1303	2				12	my $taxa = $self->_find_last_seen_taxa_block( $self->{'_taxset'}->{'taxa'} );
1304	2				8	my $set = $self->_factory->create_set( '-name' => $self->{'_taxset'}->{'name'} );
1305	2				17	$taxa->add_set($set);
1306	2				6	my $range = $self->{'_taxset'}->{'range'};
1307	2				8	my @range;
1308	2				5	while ( @{ $range } ) {
	8				18
1309	6				11	my $index = shift @{ $range };
	6				11
1310	6	100	100		23	if ( $range->[0] && $range->[0] eq '-' ) {
1311	2				4	shift @{ $range };
	2				5
1312	2				4	my $end = shift @{ $range };
	2				6
1313	2				13	push @range, ( $index - 1 ) .. ( $end - 1 );
1314						}
1315						else {
1316	4				8	push @range, ( $index - 1 );
1317						}
1318						}
1319	2				7	for my $i ( @range ) {
1320	8				29	my $taxon = $taxa->get_by_index($i);
1321	8	50			18	if ( $taxon ) {
1322	8				25	$taxa->add_to_set($taxon,$set);
1323						}
1324						else {
1325	0				0	_bad_format( "No taxon at index $i" );
1326						}
1327						}
1328	2				9	$self->{'_taxset'} = {};
1329						}
1330
1331						# finalize taxa labels
1332						elsif ( uc $self->{'_previous'} eq 'TAXLABELS' ) {
1333	13				26	foreach my $name ( @{ $self->{'_taxlabels'} } ) {
	13				38
1334	59				154	my $taxon = $self->_factory->create_taxon( '-name' => $name );
1335	59				156	$self->_current->insert($taxon);
1336						}
1337	13	100			48	if ( $self->_current->get_ntax != $self->{'_ntax'} ) {
1338						_bad_format(
1339						sprintf(
1340						'Mismatch between observed and expected ntax: %d vs %d',
1341	1				6	$self->_current->get_ntax, $self->{'_ntax'}
1342						)
1343						);
1344						}
1345
1346						# XXX taxa cleanup here
1347	12				36	$self->{'_ntax'} = undef;
1348	12				43	$self->{'_taxlabels'} = [];
1349						}
1350
1351						# finalize symbols list
1352						elsif ( uc $self->{'_previous'} eq 'SYMBOLS' ) {
1353	5				14	my $logsymbols = join( ' ', @{ $self->{'_symbols'} } );
	5				19
1354	5				20	$self->_logger->info("symbols: $logsymbols");
1355	5				17	$self->{'_symbols'} = [];
1356						}
1357
1358						# finalize character labels
1359						elsif ( uc $self->{'_previous'} eq 'CHARLABELS' ) {
1360	0	0				if ( @{ $self->{'_charlabels'} } ) {
	0
1361	0					my $logcharlabels = join( ' ', @{ $self->{'_charlabels'} } );
	0
1362	0					$self->_logger->info("charlabels: $logcharlabels");
1363						}
1364						}
1365
1366						# finalize state labels
1367						elsif ( uc $self->{'_previous'} eq 'STATELABELS' ) {
1368	0	0				if ( @{ $self->{'_statelabels'} } ) {
	0
1369	0					my $logstatelabels = join( ' ', @{ $self->{'_statelabels'} } );
	0
1370	0					$self->_logger->info("statelabels: $logstatelabels");
1371						}
1372						}
1373						}
1374
1375						# podinherit_insert_token
1376
1377						=head1 SEE ALSO
1378
1379						There is a mailing list at L
1380						for any user or developer questions and discussions.
1381
1382						=over
1383
1384						=item L
1385
1386						The nexus parser is called by the L object. Look there for
1387						examples of file parsing and manipulation.
1388
1389						=item L
1390
1391						Also see the manual: L and L.
1392
1393						=back
1394
1395						=head1 CITATION
1396
1397						If you use Bio::Phylo in published research, please cite it:
1398
1399						B, B, B, B
1400						and B, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
1401						I B<12>:63.
1402						L
1403
1404						=cut
1405
1406						1;