| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# BioPerl module for Bio::AlignIO::nexus |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Copyright Heikki Lehvaslaiho |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Bio::AlignIO::nexus - NEXUS format sequence input/output stream |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Do not use this module directly. Use it via the L class. |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use Bio::AlignIO; |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $in = Bio::AlignIO->new(-format => 'nexus', |
|
18
|
|
|
|
|
|
|
-file => 'aln.nexus'); |
|
19
|
|
|
|
|
|
|
while( my $aln = $in->next_aln ) { |
|
20
|
|
|
|
|
|
|
# do something with the alignment |
|
21
|
|
|
|
|
|
|
} |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
This object can transform L objects to and from NEXUS |
|
26
|
|
|
|
|
|
|
data blocks. See method documentation for supported NEXUS features. |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Will Fisher has written an excellent standalone NEXUS format parser in |
|
31
|
|
|
|
|
|
|
Perl, readnexus. A number of tricks were adapted from it. |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
=head2 Support |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
I |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
|
42
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
|
43
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
|
44
|
|
|
|
|
|
|
with code and data examples if at all possible. |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
49
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
|
50
|
|
|
|
|
|
|
web: |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 AUTHORS - Heikki Lehvaslaiho |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Email: heikki-at-bioperl-dot-org |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=head1 APPENDIX |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
|
61
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=cut |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Let the code begin... |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
package Bio::AlignIO::nexus; |
|
68
|
3
|
|
|
3
|
|
493
|
use vars qw(%valid_type); |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
129
|
|
|
69
|
3
|
|
|
3
|
|
10
|
use strict; |
|
|
3
|
|
|
|
|
2
|
|
|
|
3
|
|
|
|
|
51
|
|
|
70
|
3
|
|
|
3
|
|
9
|
no strict "refs"; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
63
|
|
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
|
|
73
|
3
|
|
|
3
|
|
8
|
use base qw(Bio::AlignIO); |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
447
|
|
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
BEGIN { |
|
76
|
3
|
|
|
3
|
|
5
|
%valid_type = map {$_, 1} qw( dna rna protein standard ); |
|
|
12
|
|
|
|
|
5301
|
|
|
77
|
|
|
|
|
|
|
# standard throws error: inherited from Bio::PrimarySeq |
|
78
|
|
|
|
|
|
|
} |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head2 new |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Title : new |
|
83
|
|
|
|
|
|
|
Usage : $alignio = Bio::AlignIO->new(-format => 'nexus', -file => 'filename'); |
|
84
|
|
|
|
|
|
|
Function: returns a new Bio::AlignIO object to handle clustalw files |
|
85
|
|
|
|
|
|
|
Returns : Bio::AlignIO::clustalw object |
|
86
|
|
|
|
|
|
|
Args : -verbose => verbosity setting (-1,0,1,2) |
|
87
|
|
|
|
|
|
|
-file => name of file to read in or with ">" - writeout |
|
88
|
|
|
|
|
|
|
-fh => alternative to -file param - provide a filehandle |
|
89
|
|
|
|
|
|
|
to read from/write to |
|
90
|
|
|
|
|
|
|
-format => type of Alignment Format to process or produce |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Customization of nexus flavor output |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
-show_symbols => print the symbols="ATGC" in the data definition |
|
95
|
|
|
|
|
|
|
(MrBayes does not like this) |
|
96
|
|
|
|
|
|
|
boolean [default is 1] |
|
97
|
|
|
|
|
|
|
-show_endblock => print an 'endblock;' at the end of the data |
|
98
|
|
|
|
|
|
|
(MyBayes does not like this) |
|
99
|
|
|
|
|
|
|
boolean [default is 1] |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=cut |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
sub _initialize { |
|
104
|
25
|
|
|
25
|
|
43
|
my ($self, @args) = @_; |
|
105
|
25
|
|
|
|
|
63
|
$self->SUPER::_initialize(@args); |
|
106
|
25
|
|
|
|
|
92
|
my ($show_symbols, $endblock) = |
|
107
|
|
|
|
|
|
|
$self->_rearrange([qw(SHOW_SYMBOLS SHOW_ENDBLOCK)], @args); |
|
108
|
25
|
|
|
|
|
51
|
my @names = qw(symbols endblock); |
|
109
|
25
|
|
|
|
|
43
|
for my $v ( $show_symbols, $endblock ) { |
|
110
|
50
|
50
|
|
|
|
83
|
$v = 1 unless defined $v; # default value is 1 |
|
111
|
50
|
|
|
|
|
49
|
my $n = shift @names; |
|
112
|
50
|
|
|
|
|
82
|
$self->flag($n, $v); |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head2 next_aln |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
Title : next_aln |
|
120
|
|
|
|
|
|
|
Usage : $aln = $stream->next_aln() |
|
121
|
|
|
|
|
|
|
Function: Returns the next alignment in the stream. |
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
Supports the following NEXUS format features: |
|
124
|
|
|
|
|
|
|
- The file has to start with '#NEXUS' |
|
125
|
|
|
|
|
|
|
- Reads in the name of the alignment from a comment |
|
126
|
|
|
|
|
|
|
(anything after 'TITLE: ') . |
|
127
|
|
|
|
|
|
|
- Sequence names can be given in a taxa block, too. |
|
128
|
|
|
|
|
|
|
- If matchchar notation is used, converts |
|
129
|
|
|
|
|
|
|
them back to sequence characters. |
|
130
|
|
|
|
|
|
|
- Does character conversions specified in the |
|
131
|
|
|
|
|
|
|
NEXUS equate command. |
|
132
|
|
|
|
|
|
|
- Sequence names of type 'Homo sapiens' and |
|
133
|
|
|
|
|
|
|
Homo_sapiens are treated identically. |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Returns : L object |
|
136
|
|
|
|
|
|
|
Args : |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=cut |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
sub next_aln { |
|
143
|
22
|
|
|
22
|
1
|
472
|
my $self = shift; |
|
144
|
22
|
|
|
|
|
25
|
my $entry; |
|
145
|
22
|
|
|
|
|
28
|
my ($aln_name, $seqcount, $residuecount, %hash, $alphabet, |
|
146
|
|
|
|
|
|
|
$match, $gap, $missing, $equate, $interleave, |
|
147
|
|
|
|
|
|
|
$name,$str,@names,$seqname,$start,$end,$count,$seq); |
|
148
|
22
|
|
|
|
|
34
|
local $Bio::LocatableSeq::OTHER_SYMBOLS = '\*\?\.'; |
|
149
|
22
|
|
|
|
|
21
|
local $Bio::LocatableSeq::GAP_SYMBOLS = '\-'; |
|
150
|
22
|
|
|
|
|
130
|
my $aln = Bio::SimpleAlign->new(-source => 'nexus'); |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# file starts with '#NEXUS' but we allow white space only lines before it |
|
153
|
22
|
|
|
|
|
68
|
$entry = $self->_readline; |
|
154
|
22
|
|
33
|
|
|
121
|
$entry = $self->_readline while defined $entry && $entry =~ /^\s+$/; |
|
155
|
|
|
|
|
|
|
|
|
156
|
22
|
50
|
|
|
|
34
|
return unless $entry; |
|
157
|
22
|
50
|
33
|
|
|
132
|
$self->throw("Not a valid interleaved NEXUS file! [#NEXUS] not starting the file\n$entry") |
|
158
|
|
|
|
|
|
|
unless ($entry && $entry =~ /^#NEXUS/i); |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# skip anything before either the taxa or data block |
|
161
|
|
|
|
|
|
|
# but read in the optional title in a comment |
|
162
|
22
|
|
|
|
|
48
|
while (defined($entry = $self->_readline)) { |
|
163
|
64
|
|
|
|
|
82
|
local ($_) = $entry; |
|
164
|
64
|
100
|
|
|
|
122
|
/\[TITLE. *([^\]]+)]\s+/i and $aln_name = $1; |
|
165
|
64
|
100
|
100
|
|
|
293
|
last if /^begin +data/i || /^begin +taxa/i; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
22
|
100
|
50
|
|
|
68
|
$aln_name =~ s/\s/_/g and $aln->id($aln_name) if $aln_name; |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# data and taxa blocks |
|
170
|
22
|
|
|
|
|
18
|
my $incomment; |
|
171
|
22
|
|
|
|
|
48
|
while (defined ($entry = $self->_readline)) { |
|
172
|
402
|
|
|
|
|
433
|
local ($_) = $entry; |
|
173
|
402
|
100
|
|
|
|
1290
|
next if s/\[[^\]]+\]//g; # remove comments |
|
174
|
220
|
100
|
|
|
|
541
|
if( s/\[[^\]]+$// ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
175
|
6
|
|
|
|
|
7
|
$incomment = 1; |
|
176
|
|
|
|
|
|
|
# skip line if it is now empty or contains only whitespace |
|
177
|
6
|
50
|
|
|
|
27
|
next if /^\s*$/; |
|
178
|
|
|
|
|
|
|
} elsif($incomment) { |
|
179
|
39
|
100
|
|
|
|
58
|
if( s/^[^\]]*\]// ) { |
|
180
|
6
|
|
|
|
|
8
|
$incomment = 0; |
|
181
|
|
|
|
|
|
|
} else { |
|
182
|
33
|
|
|
|
|
50
|
next; |
|
183
|
|
|
|
|
|
|
} |
|
184
|
|
|
|
|
|
|
} elsif( /taxlabels/i ) { |
|
185
|
|
|
|
|
|
|
# doesn't deal with taxlabels adequately and can mess things up! |
|
186
|
|
|
|
|
|
|
# @names = $self->_read_taxlabels; |
|
187
|
|
|
|
|
|
|
} else { |
|
188
|
|
|
|
|
|
|
|
|
189
|
162
|
100
|
|
|
|
299
|
/ntax\s*=\s*(\d+)/i and $seqcount = $1; |
|
190
|
162
|
100
|
|
|
|
285
|
/nchar\s*=\s*(\d+)/i and $residuecount = $1; |
|
191
|
162
|
100
|
|
|
|
221
|
/matchchar\s*=\s*(.)/i and $match = $1; |
|
192
|
162
|
100
|
|
|
|
249
|
/gap\s*=\s*(.)/i and $gap = $1; |
|
193
|
162
|
100
|
|
|
|
255
|
/missing\s*=\s*(.)/i and $missing = $1; |
|
194
|
162
|
50
|
|
|
|
223
|
/equate\s*=\s*\"([^\"]+)/i and $equate = $1; # "e.g. equate="T=C G=A"; |
|
195
|
162
|
100
|
|
|
|
235
|
/datatype\s*=\s*(\w+)/i and $alphabet = lc $1; |
|
196
|
162
|
100
|
|
|
|
209
|
/interleave/i and $interleave = 1 ; |
|
197
|
162
|
100
|
|
|
|
402
|
last if /matrix/io; |
|
198
|
|
|
|
|
|
|
} |
|
199
|
|
|
|
|
|
|
} |
|
200
|
22
|
50
|
|
|
|
44
|
$self->throw("Not a valid NEXUS sequence file. Datatype not specified.") |
|
201
|
|
|
|
|
|
|
unless $alphabet; |
|
202
|
|
|
|
|
|
|
$self->throw("Not a valid NEXUS sequence file. Datatype should not be [$alphabet]") |
|
203
|
22
|
50
|
|
|
|
61
|
unless $valid_type{$alphabet}; |
|
204
|
22
|
50
|
33
|
|
|
96
|
$self->throw("\"$gap\" is not a valid gap character. For compatability, gap char can not be one of: ()[]{}/\,;:=*'`\"<>^") |
|
205
|
|
|
|
|
|
|
if $gap && $gap =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/; |
|
206
|
22
|
50
|
66
|
|
|
91
|
$self->throw("\"$missing\" is not a valid missing character. For compatability, missing char can not be one of: ()[]{}/\,;:=*'`\"<>^") |
|
207
|
|
|
|
|
|
|
if $missing && $missing =~ /[\(\)\[\]\{\}\/\\\,\;\:\=\*\'\`\<\>\^]/; |
|
208
|
|
|
|
|
|
|
|
|
209
|
22
|
|
|
|
|
74
|
$aln->gap_char($gap); |
|
210
|
22
|
|
|
|
|
51
|
$aln->missing_char($missing); |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# |
|
213
|
|
|
|
|
|
|
# if data is not right after the matrix line |
|
214
|
|
|
|
|
|
|
# read the empty lines out |
|
215
|
|
|
|
|
|
|
# |
|
216
|
22
|
|
|
|
|
45
|
while ($entry = $self->_readline) { |
|
217
|
25
|
100
|
|
|
|
81
|
unless ($entry =~ /^\s+$/) { |
|
218
|
22
|
|
|
|
|
58
|
$self->_pushback($entry); |
|
219
|
22
|
|
|
|
|
23
|
last; |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
} |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# |
|
224
|
|
|
|
|
|
|
# matrix command |
|
225
|
|
|
|
|
|
|
# |
|
226
|
|
|
|
|
|
|
# first alignment section |
|
227
|
22
|
50
|
|
|
|
46
|
if (@names == 0) { # taxa block did not exist |
|
228
|
22
|
|
|
|
|
32
|
while ($entry = $self->_readline) { |
|
229
|
467
|
|
|
|
|
487
|
local ($_) = $entry; |
|
230
|
467
|
100
|
|
|
|
1173
|
if( s/\[[^\]]+\]//g ) { #] remove comments |
|
231
|
205
|
100
|
|
|
|
371
|
next if /^\s*$/; |
|
232
|
|
|
|
|
|
|
# skip line if it is now empty or contains only whitespace |
|
233
|
|
|
|
|
|
|
} |
|
234
|
459
|
100
|
100
|
|
|
1005
|
if ($interleave && defined$count && ($count <= $seqcount)) { |
|
|
|
|
66
|
|
|
|
|
|
235
|
78
|
100
|
|
|
|
159
|
/^\s+$/ and last; |
|
236
|
|
|
|
|
|
|
} else { |
|
237
|
381
|
100
|
|
|
|
671
|
/^\s+$/ and next; |
|
238
|
|
|
|
|
|
|
} |
|
239
|
448
|
100
|
|
|
|
719
|
/^\s*;/ and last; # stop if colon at end of matrix is on it's own line |
|
240
|
|
|
|
|
|
|
#/^\s*;\s*$/ and last; |
|
241
|
442
|
50
|
|
|
|
1262
|
if ( /^\s*([\"\'](.+?)[\"\']|(\S+))\s+(.*)\s*$/ ) { |
|
242
|
|
|
|
|
|
|
# get single and double quoted names, or all the first |
|
243
|
|
|
|
|
|
|
# nonwhite word as the name, and remained is seq |
|
244
|
|
|
|
|
|
|
#if (/^\s*('([^']*?)'|([^']\S*))\s+(.*)$/) { #' |
|
245
|
442
|
|
66
|
|
|
1008
|
$name = ($2 || $3); |
|
246
|
442
|
100
|
|
|
|
732
|
if ($4) { |
|
247
|
|
|
|
|
|
|
# seq is on same line as name |
|
248
|
|
|
|
|
|
|
# this is the usual NEXUS format |
|
249
|
432
|
|
|
|
|
425
|
$str = $4; |
|
250
|
|
|
|
|
|
|
} else { |
|
251
|
|
|
|
|
|
|
# otherwise get seq from following lines. No comments allowed |
|
252
|
|
|
|
|
|
|
# a less common matrix format, usually used for very long seqs |
|
253
|
10
|
|
|
|
|
12
|
$str=''; |
|
254
|
10
|
|
|
|
|
22
|
while (local ($_) = $self->_readline) { |
|
255
|
1860
|
|
|
|
|
1682
|
my $str_tmp = $_; |
|
256
|
1860
|
|
|
|
|
3664
|
$str_tmp =~ s/[\s;]//g; |
|
257
|
1860
|
|
|
|
|
1508
|
$str .= $str_tmp; |
|
258
|
1860
|
100
|
|
|
|
3807
|
last if length$str == $residuecount; |
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
} |
|
261
|
442
|
|
|
|
|
322
|
$name =~ s/ /_/g; |
|
262
|
442
|
|
|
|
|
419
|
push @names, $name; |
|
263
|
|
|
|
|
|
|
|
|
264
|
442
|
|
|
|
|
4022
|
$str =~ s/[\s;]//g; |
|
265
|
442
|
|
|
|
|
345
|
$count = @names; |
|
266
|
442
|
|
|
|
|
705
|
$hash{$count} = $str; |
|
267
|
|
|
|
|
|
|
} |
|
268
|
442
|
50
|
|
|
|
572
|
$self->throw("Not a valid interleaved NEXUS file! seqcount [$count] > predeclared [$seqcount] in the first section") if $count > $seqcount; |
|
269
|
442
|
100
|
|
|
|
1196
|
/;/ and last; # stop if colon at end of matrix is on the same line as the last seq |
|
270
|
|
|
|
|
|
|
} |
|
271
|
|
|
|
|
|
|
} |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
# interleaved sections |
|
274
|
22
|
|
|
|
|
27
|
$count = 0; |
|
275
|
22
|
100
|
|
|
|
41
|
if ( $interleave ) { # only read next section if file is interleaved |
|
276
|
7
|
|
|
|
|
14
|
while( $entry = $self->_readline) { |
|
277
|
1003
|
|
|
|
|
1023
|
local ($_) = $entry; |
|
278
|
1003
|
100
|
|
|
|
2700
|
if( s/\[[^\]]+\]//g ) { #] remove comments |
|
279
|
826
|
100
|
|
|
|
1417
|
next if /^\s*$/; # skip line if it is now empty or contains only whitespace |
|
280
|
|
|
|
|
|
|
} |
|
281
|
945
|
100
|
|
|
|
1330
|
/^\s*;/ and last; # stop if colon at end of matrix is on it's own line |
|
282
|
939
|
100
|
|
|
|
1388
|
$count = 0, next if $entry =~ /^\s*$/; |
|
283
|
872
|
50
|
|
|
|
1913
|
if (/^\s*(\'([^\']*?)\'|([^\']\S*))\s+(.*)$/) { |
|
284
|
872
|
|
|
|
|
1092
|
$str = $4; |
|
285
|
872
|
|
|
|
|
2558
|
$str =~ s/[\s;]//g; |
|
286
|
872
|
|
|
|
|
628
|
$count++; |
|
287
|
872
|
|
|
|
|
1205
|
$hash{$count} .= $str; |
|
288
|
|
|
|
|
|
|
}; |
|
289
|
872
|
50
|
|
|
|
1071
|
$self->throw("Not a valid interleaved NEXUS file! |
|
290
|
|
|
|
|
|
|
seqcount [$count] > predeclared [$seqcount] ") if $count > $seqcount; |
|
291
|
872
|
100
|
|
|
|
2036
|
/;/ and last; # stop if colon at end of matrix is on the same line as the last seq |
|
292
|
|
|
|
|
|
|
} |
|
293
|
|
|
|
|
|
|
} |
|
294
|
|
|
|
|
|
|
|
|
295
|
22
|
50
|
|
|
|
47
|
return if @names < 1; |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# sequence creation |
|
298
|
22
|
|
|
|
|
22
|
$count = 0; |
|
299
|
22
|
|
|
|
|
37
|
foreach $name ( @names ) { |
|
300
|
442
|
|
|
|
|
365
|
$count++; |
|
301
|
442
|
50
|
|
|
|
806
|
if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { |
|
302
|
0
|
|
|
|
|
0
|
($seqname,$start,$end) = ($1,$2,$3); |
|
303
|
|
|
|
|
|
|
} else { |
|
304
|
442
|
|
|
|
|
998
|
($seqname,$start,$str) = ($name,1,$hash{$count}); |
|
305
|
442
|
|
|
|
|
7897
|
$str =~ s/[$Bio::LocatableSeq::GAP_SYMBOLS]//g; |
|
306
|
442
|
|
|
|
|
521
|
$end = length($str); |
|
307
|
|
|
|
|
|
|
} |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# consistency test |
|
310
|
|
|
|
|
|
|
$self->throw("Length of sequence [$seqname] is not [$residuecount]; got".CORE::length($hash{$count})) |
|
311
|
442
|
50
|
|
|
|
800
|
unless CORE::length($hash{$count}) == $residuecount; |
|
312
|
|
|
|
|
|
|
|
|
313
|
442
|
|
|
|
|
1072
|
$seq = Bio::LocatableSeq->new('-seq' => $hash{$count}, |
|
314
|
|
|
|
|
|
|
'-display_id' => $seqname, |
|
315
|
|
|
|
|
|
|
'-start' => $start, |
|
316
|
|
|
|
|
|
|
'-end' => $end, |
|
317
|
|
|
|
|
|
|
'-alphabet' => $alphabet |
|
318
|
|
|
|
|
|
|
); |
|
319
|
442
|
|
|
|
|
861
|
$aln->add_seq($seq); |
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# if matchchar is used |
|
323
|
22
|
100
|
|
|
|
57
|
$aln->unmatch($match) if $match; |
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
# if equate ( e.g. equate="T=C G=A") is used |
|
326
|
22
|
50
|
|
|
|
38
|
if ($equate) { |
|
327
|
0
|
|
|
|
|
0
|
$aln->map_chars($1, $2) while $equate =~ /(\S)=(\S)/g; |
|
328
|
|
|
|
|
|
|
} |
|
329
|
|
|
|
|
|
|
|
|
330
|
22
|
|
100
|
|
|
105
|
while (defined $entry && |
|
331
|
|
|
|
|
|
|
$entry !~ /endblock/i) { |
|
332
|
453
|
|
|
|
|
615
|
$entry = $self->_readline; |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
22
|
50
|
|
|
|
73
|
return $aln if $aln->num_sequences; |
|
336
|
0
|
|
|
|
|
0
|
return; |
|
337
|
|
|
|
|
|
|
} |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
sub _read_taxlabels { |
|
340
|
0
|
|
|
0
|
|
0
|
my ($self) = @_; |
|
341
|
0
|
|
|
|
|
0
|
my ($name, @names); |
|
342
|
0
|
|
|
|
|
0
|
while (my $entry = $self->_readline) { |
|
343
|
0
|
0
|
|
|
|
0
|
last if $entry =~ m/^\s*(END)?;/i; |
|
344
|
0
|
0
|
|
|
|
0
|
if( $entry =~ m/\s*(\S+)\s+/ ) { |
|
345
|
0
|
|
|
|
|
0
|
($name) = ($1); |
|
346
|
0
|
|
|
|
|
0
|
$name =~ s/\[[^\[]+\]//g; |
|
347
|
0
|
|
|
|
|
0
|
$name =~ s/\W/_/g; |
|
348
|
0
|
|
|
|
|
0
|
push @names, $name; |
|
349
|
|
|
|
|
|
|
} |
|
350
|
|
|
|
|
|
|
} |
|
351
|
0
|
|
|
|
|
0
|
return @names; |
|
352
|
|
|
|
|
|
|
} |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=head2 write_aln |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Title : write_aln |
|
357
|
|
|
|
|
|
|
Usage : $stream->write_aln(@aln) |
|
358
|
|
|
|
|
|
|
Function: Writes the $aln object into the stream in interleaved NEXUS |
|
359
|
|
|
|
|
|
|
format. Everything is written into a data block. |
|
360
|
|
|
|
|
|
|
SimpleAlign methods match_char, missing_char and gap_char must be set |
|
361
|
|
|
|
|
|
|
if you want to see them in the output. |
|
362
|
|
|
|
|
|
|
Returns : 1 for success and 0 for error |
|
363
|
|
|
|
|
|
|
Args : L object |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=cut |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub write_aln { |
|
368
|
2
|
|
|
2
|
1
|
7
|
my ($self,@aln) = @_; |
|
369
|
2
|
|
|
|
|
3
|
my $count = 0; |
|
370
|
2
|
|
|
|
|
2
|
my $wrapped = 0; |
|
371
|
2
|
|
|
|
|
3
|
my $maxname; |
|
372
|
2
|
|
|
|
|
3
|
my ($length,$date,$name,$seq,$miss,$pad,%hash,@arr,$tempcount,$index ); |
|
373
|
2
|
|
|
|
|
5
|
my ($match, $missing, $gap,$symbols) = ('', '', '',''); |
|
374
|
|
|
|
|
|
|
|
|
375
|
2
|
|
|
|
|
5
|
foreach my $aln (@aln) { |
|
376
|
2
|
50
|
33
|
|
|
14
|
if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { |
|
377
|
0
|
|
|
|
|
0
|
$self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); |
|
378
|
0
|
|
|
|
|
0
|
next; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
2
|
50
|
|
|
|
4
|
$self->throw("All sequences in the alignment must be the same length") |
|
381
|
|
|
|
|
|
|
unless $aln->is_flush($self->verbose); |
|
382
|
|
|
|
|
|
|
|
|
383
|
2
|
|
|
|
|
29
|
$length = $aln->length(); |
|
384
|
|
|
|
|
|
|
|
|
385
|
2
|
|
|
|
|
7
|
$self->_print (sprintf("#NEXUS\n[TITLE: %s]\n\nbegin data;\ndimensions ntax=%s nchar=%s;\n", |
|
386
|
|
|
|
|
|
|
$aln->id, $aln->num_sequences, $length)); |
|
387
|
2
|
50
|
|
|
|
8
|
$match = "match=". $aln->match_char if $aln->match_char; |
|
388
|
2
|
100
|
|
|
|
6
|
$missing = "missing=". $aln->missing_char if $aln->missing_char; |
|
389
|
2
|
50
|
|
|
|
6
|
$gap = "gap=". $aln->gap_char if $aln->gap_char; |
|
390
|
|
|
|
|
|
|
|
|
391
|
2
|
50
|
33
|
|
|
6
|
$symbols = 'symbols="'.join('',$aln->symbol_chars). '"' |
|
392
|
|
|
|
|
|
|
if( $self->flag('symbols') && $aln->symbol_chars); |
|
393
|
2
|
|
|
|
|
7
|
$self->_print |
|
394
|
|
|
|
|
|
|
(sprintf("format interleave datatype=%s %s %s %s %s;\n\nmatrix\n", |
|
395
|
|
|
|
|
|
|
$aln->get_seq_by_pos(1)->alphabet, $match, |
|
396
|
|
|
|
|
|
|
$missing, $gap, $symbols)); |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
# account for single quotes round names |
|
399
|
2
|
|
|
|
|
23
|
my $indent = $aln->maxdisplayname_length+2; |
|
400
|
|
|
|
|
|
|
|
|
401
|
2
|
|
|
|
|
8
|
$aln->set_displayname_flat(); |
|
402
|
2
|
|
|
|
|
6
|
foreach $seq ( $aln->each_seq() ) { |
|
403
|
10
|
|
|
|
|
14
|
my $nmid = $aln->displayname($seq->get_nse()); |
|
404
|
10
|
100
|
|
|
|
22
|
if( $nmid =~ /[^\w\d\.]/ ) { |
|
405
|
|
|
|
|
|
|
# put name in single quotes incase it contains any of |
|
406
|
|
|
|
|
|
|
# the following chars: ()[]{}/\,;:=*'"`+-<> that are not |
|
407
|
|
|
|
|
|
|
# allowed in PAUP* and possible other software |
|
408
|
|
|
|
|
|
|
|
|
409
|
1
|
|
|
|
|
3
|
$name = sprintf("%-${indent}s", "\'" . $nmid . "\'"); |
|
410
|
|
|
|
|
|
|
} else { |
|
411
|
9
|
|
|
|
|
22
|
$name = sprintf("%-${indent}s", $nmid); |
|
412
|
|
|
|
|
|
|
} |
|
413
|
10
|
|
|
|
|
19
|
$hash{$name} = $seq->seq; |
|
414
|
10
|
|
|
|
|
14
|
push(@arr,$name); |
|
415
|
|
|
|
|
|
|
} |
|
416
|
|
|
|
|
|
|
|
|
417
|
2
|
|
|
|
|
8
|
while( $count < $length ) { |
|
418
|
|
|
|
|
|
|
# there is another block to go! |
|
419
|
10
|
|
|
|
|
10
|
foreach $name ( @arr ) { |
|
420
|
58
|
|
|
|
|
38
|
my $dispname = $name; |
|
421
|
|
|
|
|
|
|
# $dispname = '' if $wrapped; |
|
422
|
58
|
|
|
|
|
106
|
$self->_print (sprintf("%${indent}s ",$dispname)); |
|
423
|
58
|
|
|
|
|
42
|
$tempcount = $count; |
|
424
|
58
|
|
|
|
|
31
|
$index = 0; |
|
425
|
58
|
|
100
|
|
|
154
|
while( ($tempcount + 10 < $length) && ($index < 5) ) { |
|
426
|
268
|
|
|
|
|
607
|
$self->_print (sprintf("%s ",substr($hash{$name},$tempcount,10))); |
|
427
|
268
|
|
|
|
|
204
|
$tempcount += 10; |
|
428
|
268
|
|
|
|
|
649
|
$index++; |
|
429
|
|
|
|
|
|
|
} |
|
430
|
|
|
|
|
|
|
# last |
|
431
|
58
|
100
|
|
|
|
66
|
if( $index < 5) { |
|
432
|
|
|
|
|
|
|
# space to print! |
|
433
|
10
|
|
|
|
|
48
|
$self->_print (sprintf("%s ",substr($hash{$name},$tempcount))); |
|
434
|
10
|
|
|
|
|
9
|
$tempcount += 10; |
|
435
|
|
|
|
|
|
|
} |
|
436
|
58
|
|
|
|
|
64
|
$self->_print ("\n"); |
|
437
|
|
|
|
|
|
|
} |
|
438
|
10
|
|
|
|
|
12
|
$self->_print ("\n\n"); |
|
439
|
10
|
|
|
|
|
8
|
$count = $tempcount; |
|
440
|
10
|
|
|
|
|
14
|
$wrapped = 1; |
|
441
|
|
|
|
|
|
|
} |
|
442
|
2
|
50
|
|
|
|
5
|
if( $self->flag('endblock') ) { |
|
443
|
2
|
|
|
|
|
4
|
$self->_print (";\n\nendblock;\n"); |
|
444
|
|
|
|
|
|
|
} else { |
|
445
|
0
|
|
|
|
|
0
|
$self->_print (";\n\nend;\n"); |
|
446
|
|
|
|
|
|
|
} |
|
447
|
|
|
|
|
|
|
} |
|
448
|
2
|
50
|
33
|
|
|
7
|
$self->flush if $self->_flush_on_write && defined $self->_fh; |
|
449
|
2
|
|
|
|
|
12
|
return 1; |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=head2 flag |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Title : flag |
|
455
|
|
|
|
|
|
|
Usage : $obj->flag($name,$value) |
|
456
|
|
|
|
|
|
|
Function: Get/Set a flag value |
|
457
|
|
|
|
|
|
|
Returns : value of flag (a scalar) |
|
458
|
|
|
|
|
|
|
Args : on set, new value (a scalar or undef, optional) |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
=cut |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
sub flag{ |
|
464
|
54
|
|
|
54
|
1
|
56
|
my ($self,$name,$val) = @_; |
|
465
|
54
|
100
|
|
|
|
192
|
return $self->{'flag'}->{$name} = $val if defined $val; |
|
466
|
4
|
|
|
|
|
17
|
return $self->{'flag'}->{$name}; |
|
467
|
|
|
|
|
|
|
} |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
1; |