File Coverage

Bio/AlignIO/phylip.pm

Criterion	Covered	Total	%
statement	169	202	83.6
branch	62	98	63.2
condition	15	33	45.4
subroutine	16	17	94.1
pod	10	10	100.0
total	272	360	75.5

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::AlignIO::phylip
3							#
4							# Copyright Heikki Lehvaslaiho
5							#
6
7							=head1 NAME
8
9							Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
10
11							=head1 SYNOPSIS
12
13							# Do not use this module directly. Use it via the Bio::AlignIO class.
14
15							use Bio::AlignIO;
16							use Bio::SimpleAlign;
17							#you can set the name length to something other than the default 10
18							#if you use a version of phylip (hacked) that accepts ids > 10
19							my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
20							-fh => \*STDOUT,
21							-idlength=>30);
22							# convert data from one format to another
23							my $gcgstream = Bio::AlignIO->new(-format => 'msf',
24							-file => 't/data/cysprot1a.msf');
25
26							while( my $aln = $gcgstream->next_aln ) {
27							$phylipstream->write_aln($aln);
28							}
29
30							# do it again with phylip sequential format format
31							$phylipstream->interleaved(0);
32							# can also initialize the object like this
33							$phylipstream = Bio::AlignIO->new(-interleaved => 0,
34							-format => 'phylip',
35							-fh => \*STDOUT,
36							-idlength=>10);
37							$gcgstream = Bio::AlignIO->new(-format => 'msf',
38							-file => 't/data/cysprot1a.msf');
39
40							while( my $aln = $gcgstream->next_aln ) {
41							$phylipstream->write_aln($aln);
42							}
43
44							=head1 DESCRIPTION
45
46							This object can transform Bio::SimpleAlign objects to and from PHYLIP
47							format. By default it works with the interleaved format. By specifying
48							the flag -interleaved =E 0 in the initialization the module can
49							read or write data in sequential format.
50
51							Long IDs up to 50 characters are supported by flag -longid =E
52							1. ID strings can be surrounded by single quoted. They are mandatory
53							only if the IDs contain spaces.
54
55							=head1 FEEDBACK
56
57							=head2 Support
58
59							Please direct usage questions or support issues to the mailing list:
60
61							I
62
63							rather than to the module maintainer directly. Many experienced and
64							reponsive experts will be able look at the problem and quickly
65							address it. Please include a thorough description of the problem
66							with code and data examples if at all possible.
67
68							=head2 Reporting Bugs
69
70							Report bugs to the Bioperl bug tracking system to help us keep track
71							the bugs and their resolution. Bug reports can be submitted via the
72							web:
73
74							https://github.com/bioperl/bioperl-live/issues
75
76							=head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
77
78							Email: heikki at ebi.ac.uk
79							Email: jason at bioperl.org
80
81							=head1 APPENDIX
82
83							The rest of the documentation details each of the object
84							methods. Internal methods are usually preceded with a _
85
86							=cut
87
88							# Let the code begin...
89
90							package Bio::AlignIO::phylip;
91	4			4		481	use vars qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN);
	4					7
	4					200
92	4			4		14	use strict;
	4					4
	4					70
93
94	4			4		504	use Bio::SimpleAlign;
	4					6
	4					77
95	4			4		1764	use POSIX; # for the rounding call
	4					17746
	4					17
96
97	4			4		7854	use base qw(Bio::AlignIO);
	4					8
	4					590
98
99							BEGIN {
100	4			4		8	$DEFAULTIDLENGTH = 10;
101	4					5	$DEFAULTLINELEN = 60;
102	4					5445	$DEFAULTTAGLEN = 10;
103							}
104
105							=head2 new
106
107							Title : new
108							Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
109							-file => '>file',
110							-idlength => 10,
111							-idlinebreak => 1);
112							Function: Initialize a new L reader or writer
113							Returns : L object
114							Args : [specific for writing of phylip format files]
115							-idlength => integer - length of the id (will pad w/
116							spaces if needed)
117							-interleaved => boolean - whether interleaved
118							or sequential format required
119							-line_length => integer of how long a sequence lines should be
120							-idlinebreak => insert a line break after the sequence id
121							so that sequence starts on the next line
122							-flag_SI => whether or not write a "S" or "I" just after
123							the num.seq. and line len., in the first line
124							-tag_length => integer of how long the tags have to be in
125							each line between the space separator. set it
126							to 0 to have 1 tag only.
127							-wrap_sequential => boolean for whether or not sequential
128							format should be broken up or a single line
129							default is false (single line)
130							-longid => boolean for allowing arbitrary long IDs (default is false)
131
132							=cut
133
134							sub _initialize {
135	14			14		28	my($self,@args) = @_;
136	14					38	$self->SUPER::_initialize(@args);
137
138	14					56	my ($interleave,$linelen,$idlinebreak,
139							$idlength, $flag_SI, $tag_length,$ws, $longid) =
140							$self->_rearrange([qw(INTERLEAVED
141							LINE_LENGTH
142							IDLINEBREAK
143							IDLENGTH
144							FLAG_SI
145							TAG_LENGTH
146							WRAP_SEQUENTIAL
147							LONGID)],@args);
148	14	50				51	$self->interleaved($interleave ? 1 : 0) if defined $interleave;
		100
149	14		33			49	$self->idlength($idlength \|\| $DEFAULTIDLENGTH);
150	14	50				25	$self->id_linebreak(1) if( $idlinebreak );
151	14	50	33			31	$self->line_length($linelen) if defined $linelen && $linelen > 0;
152	14	50				24	$self->flag_SI(1) if ( $flag_SI );
153	14	50	33			55	$self->tag_length($tag_length) if ( $tag_length \|\| $DEFAULTTAGLEN );
154	14	50				48	$self->wrap_sequential($ws ? 1 : 0);
155	14	100				37	$self->longid($longid ? 1 : 0);
156	14					25	1;
157							}
158
159							=head2 next_aln
160
161							Title : next_aln
162							Usage : $aln = $stream->next_aln()
163							Function: returns the next alignment in the stream.
164							Throws an exception if trying to read in PHYLIP
165							sequential format.
166							Returns : L object
167							Args :
168
169							=cut
170
171							sub next_aln {
172	10			10	1	33	my $self = shift;
173	10					10	my $entry;
174	10					9	my ($seqcount, $residuecount, %hash, $name,$str,
175							@names,$seqname,$start,$end,$count,$seq);
176
177	10					64	my $aln = Bio::SimpleAlign->new(-source => 'phylip');
178
179							# First, parse up through the header.
180							# If we see a non-blank line that isn't the seqcount and residuecount line
181							# then bail out of next_aln (return)
182	10					44	while ($entry = $self->_readline) {
183	10	50				61	if ($entry =~ /^\s?$/) {
		50
184	0					0	next;
185							} elsif ($entry =~ /\s*(\d+)\s+(\d+)/) {
186	10					27	($seqcount, $residuecount) = ($1, $2);
187	10					15	last;
188							} else {
189	0					0	$self->warn ("Failed to parse PHYLIP: Did not see a sequence count and residue count.");
190	0					0	return;
191							}
192							}
193	10	50	33			78	return unless $seqcount and $residuecount;
194
195							# First alignment section. We expect to see a name and (part of) a sequence.
196	10					21	my $idlen = $self->idlength;
197	10					15	$count = 0;
198
199	10					20	while ($entry = $self->_readline) {
200	74	100				158	if ($entry =~ /^\s?$/) { # eat the newlines
201	4					7	next;
202							}
203
204							# Names can be in a few different formats:
205							# 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
206							# 2. they can be hacked with a long ID, as passed in with the flag -longid.
207							# 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
208	70	100				86	if ($self->longid()) { # 2 or 3
209	12	100				32	if ($entry =~ /^'(.+)'\s+(.+)$/) { # 3. name has single quotes.
210	1					3	$name = $1;
211	1					2	$str = $2;
212							} else { # 2. name does not have single quotes, so should not have spaces.
213							# therefore, the first part of the line is the name and the rest is the seq.
214							# make sure that the line does not lead with extra spaces.
215	11					15	$entry =~ s/^\s+//;
216	11					30	($name, $str) = split (/\s+/,$entry, 2);
217							}
218							} else { # 1. traditional phylip.
219	58					111	$entry =~ /^(.{1,10})\s(.+)$/;
220	58					72	$name = $1;
221	58					60	$str = $2;
222	58					95	$name =~ s/\s+$//; # eat any trailing spaces
223	58					73	$name =~ s/\s+/_/g;
224							}
225	70					119	push @names, $name;
226							#clean sequence of spaces:
227	70					1101	$str =~ s/\s+//g;
228
229							# are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
230	70	100				84	if (($self->interleaved) == 0) {
231	4					8	while (length($str) < $residuecount) {
232	20					30	$entry = $self->_readline;
233	20					25	$str .= $entry;
234	20					97	$str =~ s/\s+//g;
235	20	50				91	if ($entry =~ /^\s*$/) { # we ran into a newline before we got a complete sequence: bail!
236	0					0	$self->warn("Failed to parse PHYLIP: Sequence $name was shorter than expected: " . length($str) . " instead of $residuecount.");
237	0					0	last;
238							}
239							}
240							}
241	70					92	$hash{$count} = $str;
242
243	70					52	$count++;
244							# if we've read as many seqs as we're supposed to, move on.
245	70	100				129	if ($count == $seqcount) {
246	10					12	last;
247							}
248							}
249
250							# if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
251	10	100				17	if ($self->interleaved) {
252	9					36	while (length($hash{$seqcount-1}) < $residuecount) {
253	106					68	$count = 0;
254	106					145	while ($entry = $self->_readline) {
255	427	100				695	if ($entry =~ /^\s*$/) { # eat newlines
256	106	50				132	if ($count != 0) { # there was a newline at an unexpected place!
257	0					0	$self->warn("Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount.");
258	0					0	return;
259							}
260	106					157	next;
261							} else { # start taking in chunks
262	321					1782	$entry =~ s/\s//g;
263	321					349	$hash{$count} .= $entry;
264	321					214	$count++;
265							}
266	321	100				531	if ($count >= $seqcount) { # we've read all of the sequences for this chunk, so move on.
267	106					167	last;
268							}
269							}
270							}
271							}
272	10	50				24	if ((scalar @names) != $seqcount) {
273	0					0	$self->warn("Failed to parse PHYLIP: Did not see the correct number of seqs: saw " . scalar(@names) . ", expected $seqcount.");
274	0					0	return;
275							}
276	10					24	for ($count=0; $count<$seqcount; $count++) {
277	70					73	$str = $hash{$count};
278	70					67	my $seqname = $names[$count];
279	70	50				93	if (length($str) != $residuecount) {
280	0					0	$self->warn("Failed to parse PHYLIP: Sequence $seqname was the wrong length: " . length($str) . " instead of $residuecount.");
281							}
282	70					160	$seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
283							'-display_id' => $seqname);
284	70					144	$aln->add_seq($seq);
285							}
286	10					41	return $aln;
287							}
288
289							=head2 write_aln
290
291							Title : write_aln
292							Usage : $stream->write_aln(@aln)
293							Function: writes the $aln object into the stream in phylip format
294							Returns : 1 for success and 0 for error
295							Args : L object
296
297							=cut
298
299							sub write_aln {
300	4			4	1	12	my ($self,@aln) = @_;
301	4					4	my $count = 0;
302	4					6	my $wrapped = 0;
303	4					7	my $maxname;
304	4					9	my $width = $self->line_length();
305	4					4	my ($length,$date,$name,$seq,$miss,$pad,
306							%hash,@arr,$tempcount,$index,$idlength,$flag_SI,$line_length, $tag_length);
307
308	4					8	foreach my $aln (@aln) {
309	4	50	33			25	if( ! $aln \|\| ! $aln->isa('Bio::Align::AlignI') ) {
310	0					0	$self->warn("Must provide a Bio::Align::AlignI object when calling write_aln");
311	0					0	next;
312							}
313	4	50				17	$self->throw("All sequences in the alignment must be the same length")
314							unless $aln->is_flush(1) ;
315
316	4					11	$flag_SI = $self->flag_SI();
317	4					15	$aln->set_displayname_flat(); # plain
318	4					12	$length = $aln->length();
319	4	50				12	if ($flag_SI) {
320	0	0				0	if ($self->interleaved() ) {
321	0					0	$self->_print (sprintf(" %s %s I\n", $aln->num_sequences, $aln->length));
322							} else {
323	0					0	$self->_print (sprintf(" %s %s S\n", $aln->num_sequences, $aln->length));
324							}
325							} else {
326	4					14	$self->_print (sprintf(" %s %s\n", $aln->num_sequences, $aln->length));
327							}
328
329	4					11	$idlength = $self->idlength();
330	4					8	$line_length = $self->line_length();
331	4					7	$tag_length = $self->tag_length();
332	4					9	foreach $seq ( $aln->each_seq() ) {
333	16					29	$name = $aln->displayname($seq->get_nse);
334	16	50				24	if ($self->longid) {
335	0	0				0	$self->warn("The length of the name is over 50 chars long [$name]")
336							if length($name) > 50;
337	0					0	$name = "'$name' "
338							} else {
339	16	100				26	$name = substr($name, 0, $idlength) if length($name) > $idlength;
340	16					39	$name = sprintf("%-".$idlength."s",$name);
341	16	50				22	if( $self->interleaved() ) {
		0
342	16					21	$name .= ' ' ;
343							} elsif( $self->id_linebreak) {
344	0					0	$name .= "\n";
345							}
346							}
347							#phylip needs dashes not dots
348	16					26	my $seq = $seq->seq();
349	16					25	$seq =~ s/\./-/g;
350	16					21	$hash{$name} = $seq;
351	16					21	push(@arr,$name);
352							}
353
354	4	50				11	if( $self->interleaved() ) {
355	4					5	my $numtags;
356	4	50				9	if ($tag_length <= $line_length) {
357	4					38	$numtags = floor($line_length/$tag_length);
358	4					12	$line_length = $tag_length*$numtags;
359							} else {
360	0					0	$numtags = 1;
361							}
362	4					9	while( $count < $length ) {
363
364							# there is another block to go!
365	11					15	foreach $name ( @arr ) {
366	58					38	my $dispname = $name;
367	58	100				76	$dispname = '' if $wrapped;
368	58					115	$self->_print (sprintf("%".($idlength+3)."s",$dispname));
369	58					41	$tempcount = $count;
370	58					44	$index = 0;
371	58	100				105	$self->debug("residue count: $count\n") if ($count%100000 == 0);
372	58		100			153	while( ($tempcount + $tag_length < $length) &&
373							($index < $numtags) ) {
374	286					651	$self->_print (sprintf("%s ",substr($hash{$name},
375							$tempcount,
376							$tag_length)));
377	286					224	$tempcount += $tag_length;
378	286					698	$index++;
379							}
380							# last
381	58	100				70	if( $index < $numtags) {
382							# space to print!
383	16					41	$self->_print (sprintf("%s",substr($hash{$name},
384							$tempcount)));
385	16					18	$tempcount += $tag_length;
386							}
387	58					68	$self->_print ("\n");
388							}
389	11					20	$self->_print ("\n");
390	11					12	$count = $tempcount;
391	11					19	$wrapped = 1;
392							}
393							} else {
394	0					0	foreach $name ( @arr ) {
395	0					0	my $dispname = $name;
396	0					0	my $line = sprintf("%s%s\n",$dispname,$hash{$name});
397	0	0				0	if( $self->wrap_sequential ) {
398	0					0	$line =~ s/(.{1,$width})/$1\n/g;
399							}
400	0					0	$self->_print ($line);
401							}
402							}
403							}
404	4	50	33			11	$self->flush if $self->_flush_on_write && defined $self->_fh;
405	4					20	return 1;
406							}
407
408							=head2 interleaved
409
410							Title : interleaved
411							Usage : my $interleaved = $obj->interleaved
412							Function: Get/Set Interleaved status
413							Returns : boolean
414							Args : boolean
415
416
417							=cut
418
419							sub interleaved {
420	101			101	1	82	my ($self,$value) = @_;
421	101	100				135	if( defined $value ) {
422	1	50				2	if ($value) {$self->{'_interleaved'} = 1 }
	0					0
423	1					1	else {$self->{'_interleaved'} = 0 }
424							}
425	101	100				251	return 1 unless defined $self->{'_interleaved'};
426	6					12	return $self->{'_interleaved'};
427							}
428
429							=head2 flag_SI
430
431							Title : flag_SI
432							Usage : my $flag = $obj->flag_SI
433							Function: Get/Set if the Sequential/Interleaved flag has to be shown
434							after the number of sequences and sequence length
435							Example :
436							Returns : boolean
437							Args : boolean
438
439
440							=cut
441
442							sub flag_SI{
443	4			4	1	6	my ($self,$value) = @_;
444	4					7	my $previous = $self->{'_flag_SI'};
445	4	50				20	if( defined $value ) {
446	0					0	$self->{'_flag_SI'} = $value;
447							}
448	4					5	return $previous;
449							}
450
451							=head2 idlength
452
453							Title : idlength
454							Usage : my $idlength = $obj->idlength
455							Function: Get/Set value of id length
456							Returns : string
457							Args : string
458
459
460							=cut
461
462							sub idlength {
463	28			28	1	26	my($self,$value) = @_;
464	28	100				48	if (defined $value){
465	14					19	$self->{'_idlength'} = $value;
466							}
467	28					36	return $self->{'_idlength'};
468							}
469
470							=head2 line_length
471
472							Title : line_length
473							Usage : $obj->line_length($newval)
474							Function:
475							Returns : value of line_length
476							Args : newvalue (optional)
477
478
479							=cut
480
481							sub line_length{
482	8			8	1	9	my ($self,$value) = @_;
483	8	50				15	if( defined $value) {
484	0					0	$self->{'_line_length'} = $value;
485							}
486	8		33			38	return $self->{'_line_length'} \|\| $DEFAULTLINELEN;
487
488							}
489
490							=head2 tag_length
491
492							Title : tag_length
493							Usage : $obj->tag_length($newval)
494							Function:
495							Example : my $tag_length = $obj->tag_length
496							Returns : value of the length for each space-separated tag in a line
497							Args : newvalue (optional) - set to zero to have one tag per line
498
499
500							=cut
501
502							sub tag_length{
503	18			18	1	19	my ($self,$value) = @_;
504	18	50				32	if( defined $value) {
505	0					0	$self->{'_tag_length'} = $value;
506							}
507	18		66			39	return $self->{'_tag_length'} \|\| $DEFAULTTAGLEN;
508							}
509
510
511							=head2 id_linebreak
512
513							Title : id_linebreak
514							Usage : $obj->id_linebreak($newval)
515							Function:
516							Returns : value of id_linebreak
517							Args : newvalue (optional)
518
519
520							=cut
521
522							sub id_linebreak{
523	0			0	1	0	my ($self,$value) = @_;
524	0	0				0	if( defined $value) {
525	0					0	$self->{'_id_linebreak'} = $value;
526							}
527	0		0			0	return $self->{'_id_linebreak'} \|\| 0;
528							}
529
530
531							=head2 wrap_sequential
532
533							Title : wrap_sequential
534							Usage : $obj->wrap_sequential($newval)
535							Function:
536							Returns : value of wrap_sequential
537							Args : newvalue (optional)
538
539
540							=cut
541
542							sub wrap_sequential{
543	14			14	1	14	my ($self,$value) = @_;
544	14	50				23	if( defined $value) {
545	14					21	$self->{'_wrap_sequential'} = $value;
546							}
547	14		50			26	return $self->{'_wrap_sequential'} \|\| 0;
548							}
549
550							=head2 longid
551
552							Title : longid
553							Usage : $obj->longid($newval)
554							Function:
555							Returns : value of longid
556							Args : newvalue (optional)
557
558
559							=cut
560
561							sub longid{
562	100			100	1	85	my ($self,$value) = @_;
563	100	100				127	if( defined $value) {
564	14					25	$self->{'_longid'} = $value;
565							}
566	100		100			249	return $self->{'_longid'} \|\| 0;
567							}
568
569							1;