line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::MUST::Core::Ali; |
2
|
|
|
|
|
|
|
# ABSTRACT: Multiple sequence alignment |
3
|
|
|
|
|
|
|
# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be> |
4
|
|
|
|
|
|
|
# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com> |
5
|
|
|
|
|
|
|
$Bio::MUST::Core::Ali::VERSION = '0.212530'; |
6
|
17
|
|
|
17
|
|
13523
|
use Moose; |
|
17
|
|
|
|
|
45
|
|
|
17
|
|
|
|
|
138
|
|
7
|
17
|
|
|
17
|
|
92305
|
use namespace::autoclean; |
|
17
|
|
|
|
|
49
|
|
|
17
|
|
|
|
|
254
|
|
8
|
|
|
|
|
|
|
|
9
|
17
|
|
|
17
|
|
1696
|
use autodie; |
|
17
|
|
|
|
|
48
|
|
|
17
|
|
|
|
|
146
|
|
10
|
17
|
|
|
17
|
|
90732
|
use feature qw(say); |
|
17
|
|
|
|
|
45
|
|
|
17
|
|
|
|
|
1626
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# use Smart::Comments; |
13
|
|
|
|
|
|
|
|
14
|
17
|
|
|
17
|
|
135
|
use Carp; |
|
17
|
|
|
|
|
58
|
|
|
17
|
|
|
|
|
1370
|
|
15
|
17
|
|
|
17
|
|
144
|
use File::Temp; |
|
17
|
|
|
|
|
40
|
|
|
17
|
|
|
|
|
1573
|
|
16
|
17
|
|
|
17
|
|
129
|
use List::AllUtils qw(uniq indexes sum0); |
|
17
|
|
|
|
|
37
|
|
|
17
|
|
|
|
|
1064
|
|
17
|
17
|
|
|
17
|
|
120
|
use Path::Class qw(file); |
|
17
|
|
|
|
|
52
|
|
|
17
|
|
|
|
|
1261
|
|
18
|
17
|
|
|
17
|
|
120
|
use POSIX qw(ceil floor); |
|
17
|
|
|
|
|
38
|
|
|
17
|
|
|
|
|
144
|
|
19
|
17
|
|
|
17
|
|
12536
|
use Statistics::Descriptive; |
|
17
|
|
|
|
|
388150
|
|
|
17
|
|
|
|
|
693
|
|
20
|
17
|
|
|
17
|
|
10001
|
use Tie::IxHash; |
|
17
|
|
|
|
|
43254
|
|
|
17
|
|
|
|
|
756
|
|
21
|
|
|
|
|
|
|
|
22
|
17
|
|
|
17
|
|
167
|
use Bio::MUST::Core::Types; |
|
17
|
|
|
|
|
51
|
|
|
17
|
|
|
|
|
519
|
|
23
|
17
|
|
|
17
|
|
100
|
use Bio::MUST::Core::Constants qw(:gaps :files); |
|
17
|
|
|
|
|
40
|
|
|
17
|
|
|
|
|
4109
|
|
24
|
17
|
|
|
17
|
|
145
|
use aliased 'Bio::MUST::Core::Seq'; |
|
17
|
|
|
|
|
43
|
|
|
17
|
|
|
|
|
184
|
|
25
|
17
|
|
|
17
|
|
3858
|
use aliased 'Bio::MUST::Core::SeqMask'; |
|
17
|
|
|
|
|
44
|
|
|
17
|
|
|
|
|
76
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# TODO: add information about methods available in Ali-like objects |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# ATTRIBUTES |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
has 'seqs' => ( |
33
|
|
|
|
|
|
|
traits => ['Array'], |
34
|
|
|
|
|
|
|
is => 'ro', |
35
|
|
|
|
|
|
|
isa => 'ArrayRef[Bio::MUST::Core::Seq]', |
36
|
|
|
|
|
|
|
default => sub { [] }, |
37
|
|
|
|
|
|
|
writer => '_set_seqs', |
38
|
|
|
|
|
|
|
handles => { |
39
|
|
|
|
|
|
|
add_seq => 'push', |
40
|
|
|
|
|
|
|
get_seq => 'get', |
41
|
|
|
|
|
|
|
set_seq => 'set', |
42
|
|
|
|
|
|
|
delete_seq => 'delete', |
43
|
|
|
|
|
|
|
insert_seq => 'insert', |
44
|
|
|
|
|
|
|
count_seqs => 'count', |
45
|
|
|
|
|
|
|
all_seqs => 'elements', |
46
|
|
|
|
|
|
|
first_seq => 'first', |
47
|
|
|
|
|
|
|
filter_seqs => 'grep', |
48
|
|
|
|
|
|
|
}, |
49
|
|
|
|
|
|
|
); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
has 'file' => ( |
53
|
|
|
|
|
|
|
is => 'ro', |
54
|
|
|
|
|
|
|
isa => 'Bio::MUST::Core::Types::File', |
55
|
|
|
|
|
|
|
default => 'untitled.ali', |
56
|
|
|
|
|
|
|
coerce => 1, |
57
|
|
|
|
|
|
|
handles => { |
58
|
|
|
|
|
|
|
filename => 'stringify', |
59
|
|
|
|
|
|
|
}, |
60
|
|
|
|
|
|
|
); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
has 'guessing' => ( |
64
|
|
|
|
|
|
|
traits => ['Bool'], |
65
|
|
|
|
|
|
|
is => 'ro', |
66
|
|
|
|
|
|
|
isa => 'Bool', |
67
|
|
|
|
|
|
|
default => 1, |
68
|
|
|
|
|
|
|
handles => { |
69
|
|
|
|
|
|
|
dont_guess => 'unset', |
70
|
|
|
|
|
|
|
guess => 'set', |
71
|
|
|
|
|
|
|
}, |
72
|
|
|
|
|
|
|
); |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
with 'Bio::MUST::Core::Roles::Commentable', |
76
|
|
|
|
|
|
|
'Bio::MUST::Core::Roles::Listable'; |
77
|
|
|
|
|
|
|
with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths) |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# CONSTRUCTORS |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub clone { |
84
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
return $self->new( |
87
|
|
|
|
|
|
|
comments => [ $self->all_comments ], |
88
|
1
|
|
|
|
|
51
|
seqs => [ map { $_->clone } $self->all_seqs ], |
|
250
|
|
|
|
|
789
|
|
89
|
|
|
|
|
|
|
file => file( $self->filename ), |
90
|
|
|
|
|
|
|
guessing => $self->guessing, |
91
|
|
|
|
|
|
|
); |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# ACCESSORS |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub get_seq_with_id { |
98
|
4
|
|
|
4
|
1
|
2613
|
my $self = shift; |
99
|
4
|
|
|
|
|
10
|
my $id = shift; |
100
|
|
|
|
|
|
|
|
101
|
4
|
|
|
11
|
|
209
|
my $seq = $self->first_seq( sub { $_->full_id eq $id } ); |
|
11
|
|
|
|
|
46
|
|
102
|
4
|
100
|
|
|
|
53
|
carp "[BMC] Warning: cannot find seq with id: $id; returning undef!" |
103
|
|
|
|
|
|
|
unless $seq; |
104
|
|
|
|
|
|
|
|
105
|
4
|
|
|
|
|
450
|
return $seq; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub all_new_seqs { |
110
|
6
|
|
|
6
|
1
|
31
|
return shift->filter_seqs( sub { $_->is_new } ); |
|
2
|
|
|
2
|
|
97
|
|
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub all_but_new_seqs { |
115
|
3
|
|
|
3
|
1
|
11
|
return shift->filter_seqs( sub { not $_->is_new } ); |
|
1
|
|
|
1
|
|
47
|
|
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub all_seq_ids { |
120
|
105
|
|
|
105
|
1
|
3689
|
my $self = shift; |
121
|
105
|
|
|
|
|
4403
|
return map { $_->seq_id } $self->all_seqs; |
|
1510
|
|
|
|
|
37004
|
|
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# PROPERTIES |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub has_uniq_ids { |
128
|
64
|
|
|
64
|
1
|
1591
|
my $self = shift; |
129
|
|
|
|
|
|
|
|
130
|
64
|
|
|
|
|
307
|
my @ids = map { $_->full_id } $self->all_seq_ids; |
|
941
|
|
|
|
|
23577
|
|
131
|
64
|
100
|
|
|
|
2431
|
return 1 if $self->count_seqs == uniq @ids; |
132
|
7
|
|
|
|
|
234
|
return 0; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub is_protein { |
137
|
77
|
|
|
77
|
1
|
2649
|
my $self = shift; |
138
|
77
|
100
|
|
96
|
|
3574
|
return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs; |
|
96
|
|
|
|
|
555
|
|
139
|
5
|
|
|
|
|
37
|
return 0; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub is_aligned { |
144
|
310
|
|
|
310
|
1
|
681
|
my $self = shift; |
145
|
310
|
100
|
|
|
|
11726
|
return 0 if not $self->guessing; |
146
|
307
|
100
|
|
8588
|
|
11654
|
return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs; |
|
8588
|
|
|
|
|
18926
|
|
147
|
116
|
|
|
|
|
1062
|
return 0; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
sub width { |
152
|
189
|
|
|
189
|
1
|
495
|
my $self = shift; |
153
|
189
|
100
|
|
|
|
871
|
$self->uniformize if $self->is_aligned; # pad seqs for robustness |
154
|
189
|
|
|
|
|
1092
|
return $self->_max_seq_len; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub _max_seq_len { |
158
|
365
|
|
|
365
|
|
746
|
my $self = shift; |
159
|
365
|
|
|
|
|
12495
|
my @lengths = map { $_->seq_len } $self->all_seqs; |
|
12841
|
|
|
|
|
402763
|
|
160
|
365
|
|
50
|
|
|
5637
|
return (List::AllUtils::max @lengths) // 0; # to avoid warnings |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
sub seq_len_stats { |
165
|
1
|
|
|
1
|
1
|
10
|
my $self = shift; |
166
|
|
|
|
|
|
|
|
167
|
1
|
|
|
|
|
41
|
my @lengths = map { $_->nomiss_seq_len } $self->all_seqs; |
|
10
|
|
|
|
|
27
|
|
168
|
1
|
|
|
|
|
15
|
my $stat = Statistics::Descriptive::Full->new; |
169
|
1
|
|
|
|
|
138
|
$stat->add_data( \@lengths ); |
170
|
1
|
|
|
|
|
163
|
my @quantiles = map { $stat->quantile($_) } 0..4; |
|
5
|
|
|
|
|
266
|
|
171
|
|
|
|
|
|
|
|
172
|
1
|
|
|
|
|
48
|
return @quantiles; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub perc_miss { |
177
|
2
|
|
|
2
|
1
|
7
|
my $self = shift; |
178
|
|
|
|
|
|
|
|
179
|
2
|
|
|
|
|
93
|
my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs; |
|
255
|
|
|
|
|
592
|
|
180
|
2
|
50
|
|
|
|
22
|
return 0.0 unless $n; |
181
|
|
|
|
|
|
|
|
182
|
2
|
|
|
|
|
13
|
my $total = $self->width * $self->count_seqs; |
183
|
2
|
|
|
|
|
28
|
return 100.0 * ($total - $n) / $total; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# MUTATORS |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub uc_seqs { |
190
|
1
|
|
|
1
|
1
|
37
|
my $self = shift; |
191
|
|
|
|
|
|
|
|
192
|
1
|
|
|
|
|
50
|
$_->uc for $self->all_seqs; |
193
|
1
|
|
|
|
|
4
|
return $self; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub recode_seqs { ## no critic (RequireArgUnpacking) |
198
|
1
|
|
|
1
|
1
|
17
|
my $self = shift; |
199
|
|
|
|
|
|
|
|
200
|
1
|
|
|
|
|
42
|
$_->recode(@_) for $self->all_seqs; |
201
|
1
|
|
|
|
|
4
|
return $self; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
sub degap_seqs { |
206
|
1
|
|
|
1
|
1
|
12
|
my $self = shift; |
207
|
|
|
|
|
|
|
|
208
|
1
|
|
|
|
|
46
|
$_->degap for $self->all_seqs; |
209
|
1
|
|
|
|
|
5
|
return $self; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub spacify_seqs { |
214
|
176
|
|
|
176
|
1
|
381
|
my $self = shift; |
215
|
|
|
|
|
|
|
|
216
|
176
|
|
|
|
|
5576
|
$_->spacify for $self->all_seqs; |
217
|
176
|
|
|
|
|
571
|
return $self; |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub gapify_seqs { ## no critic (RequireArgUnpacking) |
222
|
2
|
|
|
2
|
1
|
16
|
my $self = shift; |
223
|
2
|
|
|
|
|
80
|
$_->gapify(@_) for $self->all_seqs; |
224
|
2
|
|
|
|
|
6
|
return $self; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub trim_seqs { |
229
|
176
|
|
|
176
|
1
|
378
|
my $self = shift; |
230
|
|
|
|
|
|
|
|
231
|
176
|
|
|
|
|
5574
|
$_->trim for $self->all_seqs; |
232
|
176
|
|
|
|
|
615
|
return $self; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
sub pad_seqs { |
237
|
176
|
|
|
176
|
1
|
373
|
my $self = shift; |
238
|
|
|
|
|
|
|
|
239
|
176
|
|
|
|
|
633
|
my $bound = $self->_max_seq_len; |
240
|
176
|
|
|
|
|
5518
|
$_->pad_to($bound) for $self->all_seqs; |
241
|
176
|
|
|
|
|
598
|
return $self; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
sub uniformize { |
246
|
175
|
|
|
175
|
1
|
400
|
my $self = shift; |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# TODO: profile code (triggers?) to avoid useless multiple calls |
249
|
|
|
|
|
|
|
|
250
|
175
|
|
|
|
|
663
|
$self->spacify_seqs; |
251
|
175
|
|
|
|
|
749
|
$self->trim_seqs; |
252
|
175
|
|
|
|
|
792
|
$self->pad_seqs; |
253
|
|
|
|
|
|
|
|
254
|
175
|
|
|
|
|
395
|
return $self; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub clear_new_tags { |
259
|
1
|
|
|
1
|
1
|
3
|
my $self = shift; |
260
|
|
|
|
|
|
|
|
261
|
1
|
|
|
|
|
47
|
$_->clear_new_tag for $self->all_seqs; |
262
|
1
|
|
|
|
|
6
|
return $self; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub shorten_ids { ## no critic (RequireArgUnpacking) |
267
|
12
|
|
|
12
|
1
|
66
|
return shift->_change_ids_(1, @_); |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub restore_ids { ## no critic (RequireArgUnpacking) |
272
|
10
|
|
|
10
|
1
|
42
|
return shift->_change_ids_(0, @_); |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub _change_ids_ { |
276
|
22
|
|
|
22
|
|
47
|
my $self = shift; |
277
|
22
|
|
|
|
|
42
|
my $abbr = shift; |
278
|
22
|
|
|
|
|
39
|
my $mapper = shift; |
279
|
|
|
|
|
|
|
|
280
|
22
|
|
|
|
|
861
|
for my $seq ($self->all_seqs) { |
281
|
133
|
100
|
|
|
|
669
|
my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id ) |
282
|
|
|
|
|
|
|
: $mapper->long_id_for( $seq->full_id ); |
283
|
133
|
50
|
|
|
|
4166
|
$seq->set_seq_id($new_id) if $new_id; |
284
|
|
|
|
|
|
|
} # Note: leave id alone if not found |
285
|
|
|
|
|
|
|
|
286
|
22
|
|
|
|
|
109
|
return $self; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
sub apply_list { |
291
|
1
|
|
|
1
|
1
|
11
|
my $self = shift; |
292
|
1
|
|
|
|
|
4
|
my $list = shift; |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# loop through seqs from bottom to top |
295
|
|
|
|
|
|
|
# ... and delete seq from Ali if not listed in list |
296
|
1
|
|
|
|
|
49
|
my $seq_n = $self->count_seqs; |
297
|
|
|
|
|
|
|
|
298
|
1
|
|
|
|
|
7
|
for (my $i = $seq_n-1; $i >= 0; $i--) { |
299
|
250
|
100
|
|
|
|
8210
|
$self->delete_seq($i) |
300
|
|
|
|
|
|
|
unless $list->is_listed( $self->get_seq($i)->full_id ); |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
1
|
|
|
|
|
5
|
return $self; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub _premask_check { |
308
|
40
|
|
|
40
|
|
564
|
my $self = shift; |
309
|
40
|
|
|
|
|
76
|
my $mask = shift; |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
# warn of unaligned Ali |
312
|
40
|
50
|
|
|
|
155
|
carp '[BMC] Note: Ali does not look aligned!' |
313
|
|
|
|
|
|
|
. ' This might not be an issue if seqs are ultra-conserved!' |
314
|
|
|
|
|
|
|
unless $self->is_aligned; |
315
|
40
|
|
|
|
|
243
|
$self->uniformize; |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# warn of empty SeqMask |
318
|
|
|
|
|
|
|
carp '[BMC] Note: applying this mask will result in a zero-width Ali!' |
319
|
40
|
50
|
|
1845
|
|
1636
|
unless List::AllUtils::any { $_ } $mask->all_states; |
|
1845
|
|
|
|
|
2274
|
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
# check that SeqMask is compatible with Ali |
322
|
|
|
|
|
|
|
# potential bugs could come from constant sites etc |
323
|
40
|
|
|
|
|
13304
|
my $a_width = $self->width; |
324
|
40
|
|
|
|
|
1493
|
my $m_width = $mask->mask_len; |
325
|
40
|
100
|
|
|
|
181
|
carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!" |
326
|
|
|
|
|
|
|
. ' This might not be an issue if the mask results from blocks.' |
327
|
|
|
|
|
|
|
unless $a_width == $m_width; |
328
|
|
|
|
|
|
|
|
329
|
40
|
|
|
|
|
1014
|
return; # return values of before subs are ignored |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
before 'apply_mask' => \&_premask_check; |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
sub apply_mask { |
335
|
|
|
|
|
|
|
my $self = shift; |
336
|
|
|
|
|
|
|
my $mask = shift; |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# select sites for each seq using a precomputed array slice |
339
|
|
|
|
|
|
|
my @indexes = indexes { $_ } $mask->all_states; |
340
|
|
|
|
|
|
|
$_->_set_seq( join q{}, ( $_->all_states )[@indexes] ) |
341
|
|
|
|
|
|
|
for $self->all_seqs; |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
return $self; |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
sub idealize { ## no critic (RequireArgUnpacking) |
348
|
13
|
|
|
13
|
1
|
121
|
my $self = shift; |
349
|
13
|
|
|
|
|
224
|
return $self->apply_mask( SeqMask->ideal_mask($self, @_) ); |
350
|
|
|
|
|
|
|
} |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# MISC METHODS |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
sub gapmiss_regex { |
356
|
73
|
100
|
|
73
|
1
|
1200
|
return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub map_coords { |
361
|
1
|
|
|
1
|
1
|
12
|
my $self = shift; |
362
|
1
|
|
|
|
|
3
|
my $id = shift; |
363
|
1
|
|
|
|
|
2
|
my $coords_in = shift; |
364
|
|
|
|
|
|
|
|
365
|
1
|
|
|
|
|
6
|
my $seq = $self->get_seq_with_id($id); |
366
|
1
|
|
|
|
|
5
|
my @states = $seq->all_states; |
367
|
|
|
|
|
|
|
|
368
|
1
|
|
|
|
|
3
|
my $count = 0; |
369
|
1
|
|
|
|
|
2
|
my @index; |
370
|
1
|
|
|
|
|
3
|
for my $state (@states) { |
371
|
130
|
100
|
|
|
|
331
|
$count++ if $state !~ $GAP; |
372
|
130
|
|
|
|
|
215
|
push @index, $count; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
1
|
|
|
|
|
2
|
my @coords_out = map { $index[$_-1] } @{$coords_in}; |
|
6
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
3
|
|
376
|
|
|
|
|
|
|
|
377
|
1
|
|
|
|
|
12
|
return \@coords_out; |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# ALIASES |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub height { |
384
|
0
|
|
|
0
|
1
|
0
|
return shift->count_seqs; |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# I/O METHODS |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub load { |
391
|
69
|
|
|
69
|
1
|
31703
|
my $class = shift; |
392
|
69
|
|
|
|
|
221
|
my $infile = shift; |
393
|
|
|
|
|
|
|
|
394
|
69
|
|
|
|
|
423
|
open my $in, '<', $infile; |
395
|
|
|
|
|
|
|
|
396
|
69
|
|
|
|
|
33338
|
my $ali = $class->new( file => $infile ); |
397
|
69
|
|
|
|
|
411
|
my $seq_id; |
398
|
|
|
|
|
|
|
my $seq; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
LINE: |
401
|
69
|
|
|
|
|
14756
|
while (my $line = <$in>) { |
402
|
4091
|
|
|
|
|
7944
|
chomp $line; |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# skip empty lines and process comments |
405
|
4091
|
100
|
100
|
|
|
25606
|
next LINE if $line =~ $EMPTY_LINE |
406
|
|
|
|
|
|
|
|| $ali->is_comment($line); |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
# at each '>' char... |
409
|
3944
|
|
|
|
|
13635
|
my ($defline) = $line =~ $DEF_LINE; |
410
|
3944
|
100
|
|
|
|
8400
|
if ($defline) { |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# add current seq to ali (if any) |
413
|
1662
|
100
|
|
|
|
3482
|
if ($seq) { |
414
|
1593
|
|
|
|
|
48351
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
415
|
1593
|
|
|
|
|
54597
|
$ali->add_seq($new_seq); |
416
|
1593
|
|
|
|
|
2987
|
$seq = q{}; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
1662
|
|
|
|
|
2726
|
$seq_id = $defline; |
420
|
1662
|
|
|
|
|
26369
|
next LINE; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
# elongate current seq (seqs can be broken on several lines) |
424
|
2282
|
|
|
|
|
11246
|
$seq .= $line; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# add last seq to ali (if any) |
428
|
69
|
50
|
|
|
|
328
|
if ($seq) { |
429
|
69
|
|
|
|
|
2504
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
430
|
69
|
|
|
|
|
2392
|
$ali->add_seq($new_seq); |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
69
|
|
|
|
|
2021
|
return $ali; |
434
|
|
|
|
|
|
|
} |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
before qr{\Astore}xms => sub { |
438
|
|
|
|
|
|
|
my $self = shift; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# perform pre-storage duties |
441
|
|
|
|
|
|
|
carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids; |
442
|
|
|
|
|
|
|
$self->uniformize if $self->is_aligned; |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
return; |
445
|
|
|
|
|
|
|
}; |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub store { ## no critic (RequireArgUnpacking) |
449
|
|
|
|
|
|
|
my $self = shift; |
450
|
|
|
|
|
|
|
my $outfile = shift; |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# automatically redirect to store_fasta for non-Ali outfile names |
453
|
|
|
|
|
|
|
return $self->store_fasta($outfile, @_) # note the currying |
454
|
|
|
|
|
|
|
unless $outfile =~ $ALI_SUFFIX; |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
open my $out, '>', $outfile; |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
print {$out} $self->header; |
459
|
|
|
|
|
|
|
for my $seq ($self->all_seqs) { |
460
|
|
|
|
|
|
|
say {$out} '>' . $seq->full_id; |
461
|
|
|
|
|
|
|
say {$out} $seq->seq; |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
return; |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
sub store_fasta { |
469
|
|
|
|
|
|
|
my $self = shift; |
470
|
|
|
|
|
|
|
my $outfile = shift; |
471
|
|
|
|
|
|
|
my $args = shift // {}; # HashRef (should not be empty...) |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
my $degap = $args->{degap} // 0; |
474
|
|
|
|
|
|
|
my $clean = $args->{clean} // 0; |
475
|
|
|
|
|
|
|
my $gap = $args->{gapify} // 'X'; |
476
|
|
|
|
|
|
|
my $chunk = $args->{chunk} // 60; |
477
|
|
|
|
|
|
|
my $nowrap = $chunk < 0 ? 1 : 0; |
478
|
|
|
|
|
|
|
my $is_aligned = $self->is_aligned; |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
open my $out, '>', $outfile; |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
for my $seq ($self->all_seqs) { |
483
|
|
|
|
|
|
|
say {$out} '>' . $seq->foreign_id; |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# optionally clean and/or degap seq |
486
|
|
|
|
|
|
|
$seq = $seq->clone if $clean || $degap; # clone seq only if needed |
487
|
|
|
|
|
|
|
$seq->gapify($gap) if $clean; |
488
|
|
|
|
|
|
|
$seq->degap if $degap; |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
my $width = $seq->seq_len; |
491
|
|
|
|
|
|
|
$chunk = $width if $nowrap; # optionally disable wrap |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
my $str = $seq->wrapped_str($chunk); |
494
|
|
|
|
|
|
|
$str =~ s{$GAP}{-}xmsg if $is_aligned; # restore '-' when aligned |
495
|
|
|
|
|
|
|
print {$out} $str; |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
return; |
499
|
|
|
|
|
|
|
} |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub temp_fasta { |
503
|
9
|
|
|
9
|
1
|
23
|
my $self = shift; |
504
|
9
|
|
50
|
|
|
26
|
my $args = shift // {}; # HashRef (should not be empty...) |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# abbreviate ids (possibly using a custom prefix) |
507
|
9
|
|
|
|
|
22
|
my $prefix = $args->{id_prefix}; |
508
|
9
|
|
|
|
|
18
|
delete $args->{id_prefix}; # not necessarily required |
509
|
9
|
|
|
|
|
47
|
my $mapper = $self->std_mapper($prefix); |
510
|
9
|
|
|
|
|
54
|
$self->shorten_ids($mapper); |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
# write temporary .fasta file using standard abbr_ids |
513
|
|
|
|
|
|
|
# ...and restore long_ids afterwards |
514
|
9
|
|
|
|
|
88
|
my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta'); |
515
|
9
|
|
|
|
|
5089
|
$self->store_fasta($out->filename, $args); |
516
|
9
|
|
|
|
|
74
|
$self->restore_ids($mapper); |
517
|
|
|
|
|
|
|
### filename: $out->filename |
518
|
|
|
|
|
|
|
|
519
|
9
|
100
|
|
|
|
56
|
return wantarray ? ($out->filename, $mapper) : $out->filename; |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
# TODO: add an option to remove trailing '_' in ids? |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
sub load_phylip { |
526
|
105
|
|
|
105
|
1
|
8962
|
my $class = shift; |
527
|
105
|
|
|
|
|
297
|
my $infile = shift; |
528
|
|
|
|
|
|
|
|
529
|
105
|
|
|
|
|
838
|
open my $in, '<', $infile; |
530
|
|
|
|
|
|
|
|
531
|
105
|
|
|
|
|
50325
|
my $ali = $class->new( file => $infile ); |
532
|
105
|
|
|
|
|
282
|
my $seq_n; |
533
|
|
|
|
|
|
|
my $site_n; |
534
|
105
|
|
|
|
|
330
|
my $n = 0; |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
LINE: |
537
|
105
|
|
|
|
|
39303
|
while (my $line = <$in>) { |
538
|
8557
|
|
|
|
|
21245
|
chomp $line; |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# skip empty lines |
541
|
8557
|
100
|
|
|
|
51994
|
next LINE if $line =~ $EMPTY_LINE; |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# get matrix dimensions |
544
|
8524
|
100
|
|
|
|
37214
|
if ($line =~ $DIM_LINE) { |
545
|
105
|
|
|
|
|
695
|
($seq_n, $site_n) = ($1, $2); |
546
|
105
|
|
|
|
|
2209
|
next LINE; |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
# process regular line (identifier is optional) |
550
|
8419
|
|
|
|
|
49650
|
my ($seq_id, $seq) = $line =~ $PHY_LINE; |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# should never happen... |
553
|
8419
|
50
|
|
|
|
22049
|
croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!" |
554
|
|
|
|
|
|
|
unless $seq; |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
# delete optional spaces |
557
|
8419
|
|
|
|
|
20555
|
$seq =~ tr/ //d; |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# process first seq block |
560
|
8419
|
100
|
|
|
|
313196
|
if ($ali->count_seqs < $seq_n) { |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# seq_id are mandatory in first block |
563
|
8089
|
50
|
|
|
|
18313
|
croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!" |
564
|
|
|
|
|
|
|
unless $seq_id; |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# store first seq chunk along with seq_id |
567
|
8089
|
|
|
|
|
225725
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
568
|
8089
|
|
|
|
|
279190
|
$ali->add_seq($new_seq); |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# process remaining seq blocks |
572
|
|
|
|
|
|
|
else { |
573
|
330
|
|
|
|
|
11285
|
my $curr_seq = $ali->get_seq($n); |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
# elongate current with partial seq chunk mistaken as seq_id |
576
|
330
|
100
|
100
|
|
|
1087
|
if ($seq_id && $seq_id ne $curr_seq->full_id) { |
577
|
110
|
|
|
|
|
3664
|
$curr_seq->append_seq($seq_id); |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
# elongate current seq with new seq chunk |
581
|
330
|
|
|
|
|
10759
|
$curr_seq->append_seq($seq); |
582
|
|
|
|
|
|
|
} |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# prepare reading of next seq or next block |
585
|
8419
|
100
|
|
|
|
102891
|
$n = 0 if ++$n == $seq_n; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
105
|
|
|
|
|
919
|
my $width = $ali->width; |
589
|
105
|
50
|
|
|
|
696
|
croak "[BMC] Error: unexpected site number in PHYLIP file: $width;" |
590
|
|
|
|
|
|
|
. ' aborting!' if $width != $site_n; |
591
|
|
|
|
|
|
|
|
592
|
105
|
|
|
|
|
4069
|
return $ali; |
593
|
|
|
|
|
|
|
} |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html |
596
|
|
|
|
|
|
|
# The information for each species follows, starting with a ten-character |
597
|
|
|
|
|
|
|
# species name (which can include blanks and some punctuation marks), and |
598
|
|
|
|
|
|
|
# continuing with the characters for that species. The name should be on the |
599
|
|
|
|
|
|
|
# same line as the first character of the data for that species. (...) The |
600
|
|
|
|
|
|
|
# name should be ten characters in length, filled out to the full ten |
601
|
|
|
|
|
|
|
# characters by blanks if shorter. Any printable ASCII/ISO character is |
602
|
|
|
|
|
|
|
# allowed in the name, except for parentheses ("(" and ")"), square brackets |
603
|
|
|
|
|
|
|
# ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that |
604
|
|
|
|
|
|
|
# in these sequences we have a blank every ten sites to make them easier to |
605
|
|
|
|
|
|
|
# read: any such blanks are allowed. The blank line which separates the two |
606
|
|
|
|
|
|
|
# groups of lines (the ones containing sites 1-20 and ones containing sites |
607
|
|
|
|
|
|
|
# 21-39) may or may not be present. |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command |
610
|
|
|
|
|
|
|
# The input sequence file is a standard PHYLIP file of aligned DNA or |
611
|
|
|
|
|
|
|
# amino-acids sequences. (...) The maximum number of characters in species |
612
|
|
|
|
|
|
|
# name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed |
613
|
|
|
|
|
|
|
# within sequence names because the Newick tree format makes special use of |
614
|
|
|
|
|
|
|
# these symbols. However, blanks (one or more) MUST appear at the end of each |
615
|
|
|
|
|
|
|
# species name. |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf |
618
|
|
|
|
|
|
|
# A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-", |
619
|
|
|
|
|
|
|
# dots ".", underscores "_", or some of "/", "?", "*", "+". No space is |
620
|
|
|
|
|
|
|
# allowed inside the names. The first name character must be a letter or an |
621
|
|
|
|
|
|
|
# underscore. A sequence fragment may start behind position 10 after a |
622
|
|
|
|
|
|
|
# sequence name, or anywhere in a line without a name. |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
# TREE-PUZZLE [data/globin.a] |
625
|
|
|
|
|
|
|
# 7 128 |
626
|
|
|
|
|
|
|
# HBB_HUMAN HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN |
627
|
|
|
|
|
|
|
# HBB_HORSE QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN |
628
|
|
|
|
|
|
|
# HBA_HUMAN VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS |
629
|
|
|
|
|
|
|
# HBA_HORSE VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS |
630
|
|
|
|
|
|
|
# MYG_PHYCA VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS |
631
|
|
|
|
|
|
|
# GLB5_PETMA PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS |
632
|
|
|
|
|
|
|
# LGB2_LUPLU ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf |
635
|
|
|
|
|
|
|
# The input alignment format of RAxML is relaxed interleaved or sequential |
636
|
|
|
|
|
|
|
# PHYLIP. "Relaxed" means that sequence names can be of variable length |
637
|
|
|
|
|
|
|
# between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names |
638
|
|
|
|
|
|
|
# taxon names that contain any form of whitespace character, like blanks, |
639
|
|
|
|
|
|
|
# tabulators, and carriage returns, as well as one of the following prohibited |
640
|
|
|
|
|
|
|
# characters: :,();[]. |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf |
643
|
|
|
|
|
|
|
# Taxon names may contain more than 10 characters. Avoid special characters |
644
|
|
|
|
|
|
|
# such as ';' or ')', which will create problems when parsing trees. The best |
645
|
|
|
|
|
|
|
# is to only use letters, digits and '_'. Sequences can be interrupted by |
646
|
|
|
|
|
|
|
# space and tab, but not by return characters. Be sure that the lengths of the |
647
|
|
|
|
|
|
|
# sequences are all the same, and identical to the lengths indicated in the |
648
|
|
|
|
|
|
|
# header. Sequences can be interleaved, in which case the taxon names may or |
649
|
|
|
|
|
|
|
# may not be repeated in each block. |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
sub store_phylip { |
653
|
|
|
|
|
|
|
my $self = shift; |
654
|
|
|
|
|
|
|
my $outfile = shift; |
655
|
|
|
|
|
|
|
my $args = shift // {}; # HashRef (should not be empty...) |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
my $short = $args->{short} // 1; |
658
|
|
|
|
|
|
|
my $clean = $args->{clean} // 0; |
659
|
|
|
|
|
|
|
my $chunk = $args->{chunk} // 60; |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
open my $out, '>', $outfile; |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
# print data matrix dimensions |
664
|
|
|
|
|
|
|
my $height = $self->count_seqs; |
665
|
|
|
|
|
|
|
my $width = $self->width; |
666
|
|
|
|
|
|
|
say {$out} $height . q{ } . $width; |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
# setup id format (this will also affect block-like structure) |
669
|
|
|
|
|
|
|
my $format = $short ? "%-10.10s %s\n" : "%s %s\n"; |
670
|
|
|
|
|
|
|
my $method = $short ? 'full_id' : 'foreign_id'; |
671
|
|
|
|
|
|
|
my $sep = $short ? q{ } : q{}; |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# optionally disable wrapping |
674
|
|
|
|
|
|
|
$chunk = $width if $chunk < 0; |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
# output Ali in sequential or interleaved format |
677
|
|
|
|
|
|
|
for (my $site = 0; $site < $width; $site += $chunk) { |
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
# leave empty line between chunks (but not after last chunk) |
680
|
|
|
|
|
|
|
print {$out} "\n" if $site; |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
for my $seq ($self->all_seqs) { |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
# print 10-chars ids only once |
685
|
|
|
|
|
|
|
# Note: full_ids are only truncated (use IdMapper for more) |
686
|
|
|
|
|
|
|
my $id = $site == 0 ? $seq->$method : q{}; |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
# optionally clean seq |
689
|
|
|
|
|
|
|
$seq = $seq->clone->gapify('X') if $clean; |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# print seq chunks in 10-state blocks |
692
|
|
|
|
|
|
|
# Note: We insert a space in the 11th column to ensure that more |
693
|
|
|
|
|
|
|
# software packages can read the written files. PHYLIP itself will |
694
|
|
|
|
|
|
|
# ignore this spurious space in the 'sequences'. |
695
|
|
|
|
|
|
|
( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg; |
696
|
|
|
|
|
|
|
printf {$out} $format, $id, join $sep, |
697
|
|
|
|
|
|
|
map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1; |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
return; |
702
|
|
|
|
|
|
|
} |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
sub load_stockholm { |
706
|
2
|
|
|
2
|
1
|
31
|
my $class = shift; |
707
|
2
|
|
|
|
|
4
|
my $infile = shift; |
708
|
|
|
|
|
|
|
|
709
|
2
|
|
|
|
|
9
|
open my $in, '<', $infile; |
710
|
|
|
|
|
|
|
|
711
|
2
|
|
|
|
|
430
|
my $ali = $class->new( file => $infile ); |
712
|
2
|
|
|
|
|
55
|
tie my %seq_for, 'Tie::IxHash'; |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
LINE: |
715
|
2
|
|
|
|
|
1111
|
while (my $line = <$in>) { |
716
|
44
|
|
|
|
|
288
|
chomp $line; |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
# process GF comments |
719
|
44
|
100
|
|
|
|
105
|
next LINE if $ali->is_comment($line, $STK_COMMENT); |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
# skip empty lines and other comment lines |
722
|
27
|
100
|
100
|
|
|
214
|
next LINE if $line =~ $EMPTY_LINE |
723
|
|
|
|
|
|
|
|| $line =~ $COMMENT_LINE; |
724
|
11
|
100
|
|
|
|
38
|
last LINE if $line =~ $END_LINE; |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# parse stockholm seq |
727
|
9
|
|
|
|
|
42
|
my ($seq_id, $seq) = $line =~ $STK_SEQ; |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# replace dots and gaps by stars |
730
|
9
|
|
|
|
|
27
|
$seq =~ s/[\.\-]/*/xmsg; |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
# elongate seq |
733
|
9
|
|
|
|
|
45
|
$seq_for{$seq_id} .= $seq; |
734
|
|
|
|
|
|
|
} |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
# populate Ali object |
737
|
2
|
|
|
|
|
11
|
while (my ($seq_id, $seq) = each %seq_for) { |
738
|
9
|
|
|
|
|
409
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
739
|
9
|
|
|
|
|
307
|
$ali->add_seq($new_seq); |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
|
742
|
2
|
|
|
|
|
72
|
return $ali; |
743
|
|
|
|
|
|
|
} |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
sub instant_store { |
747
|
1
|
|
|
1
|
1
|
5
|
my $class = shift; |
748
|
1
|
|
|
|
|
3
|
my $outfile = shift; |
749
|
1
|
|
50
|
|
|
4
|
my $args = shift // {}; # HashRef (should not be empty...) |
750
|
|
|
|
|
|
|
|
751
|
1
|
|
|
|
|
4
|
my $infile = $args->{infile}; |
752
|
1
|
50
|
|
|
|
5
|
croak '[BMC] Error: no infile specified for instant_store; aborting!' |
753
|
|
|
|
|
|
|
unless $infile; |
754
|
|
|
|
|
|
|
|
755
|
1
|
|
|
|
|
8
|
my $coderef = $args->{coderef}; |
756
|
1
|
50
|
|
|
|
4
|
croak '[BMC] Error: no coderef specified for instant_store; aborting!' |
757
|
|
|
|
|
|
|
unless $coderef; |
758
|
|
|
|
|
|
|
|
759
|
1
|
|
|
|
|
5
|
open my $in, '<', $infile; |
760
|
1
|
|
|
|
|
270
|
open my $out, '>', $outfile; |
761
|
|
|
|
|
|
|
|
762
|
1
|
|
|
|
|
273
|
my $seq_id; |
763
|
|
|
|
|
|
|
my $seq; |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
LINE: |
766
|
1
|
|
|
|
|
611
|
while (my $line = <$in>) { |
767
|
92
|
|
|
|
|
134
|
chomp $line; |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
# skip empty lines and process comments |
770
|
92
|
100
|
66
|
|
|
458
|
next LINE if $line =~ $EMPTY_LINE |
771
|
|
|
|
|
|
|
|| $line =~ $COMMENT_LINE; |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
# at each '>' char... |
774
|
90
|
|
|
|
|
213
|
my ($defline) = $line =~ $DEF_LINE; |
775
|
90
|
100
|
|
|
|
159
|
if ($defline) { |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
# process current seq (if any) |
778
|
3
|
100
|
|
|
|
8
|
if ($seq) { |
779
|
2
|
|
|
|
|
74
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
780
|
2
|
|
|
|
|
5
|
print {$out} $coderef->($new_seq); |
|
2
|
|
|
|
|
9
|
|
781
|
2
|
|
|
|
|
74
|
$seq = q{}; |
782
|
|
|
|
|
|
|
} |
783
|
|
|
|
|
|
|
|
784
|
3
|
|
|
|
|
6
|
$seq_id = $defline; |
785
|
3
|
|
|
|
|
18
|
next LINE; |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# elongate current seq (seqs can be broken on several lines) |
789
|
87
|
|
|
|
|
242
|
$seq .= $line; |
790
|
|
|
|
|
|
|
} |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
# process last seq (if any) |
793
|
1
|
50
|
|
|
|
6
|
if ($seq) { |
794
|
1
|
|
|
|
|
38
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
795
|
1
|
|
|
|
|
3
|
print {$out} $coderef->($new_seq); |
|
1
|
|
|
|
|
5
|
|
796
|
|
|
|
|
|
|
} |
797
|
|
|
|
|
|
|
|
798
|
1
|
|
|
|
|
65
|
return; |
799
|
|
|
|
|
|
|
} |
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
sub instant_count { |
803
|
1
|
|
|
1
|
1
|
449
|
my $class = shift; |
804
|
1
|
|
|
|
|
2
|
my $infile = shift; |
805
|
|
|
|
|
|
|
|
806
|
1
|
|
|
|
|
2
|
my $seq_n = 0; |
807
|
|
|
|
|
|
|
|
808
|
1
|
|
|
|
|
7
|
open my $in, '<', $infile; |
809
|
1
|
100
|
|
|
|
2865
|
while (<$in>) { $seq_n++ if m/^>/xms } |
|
502
|
|
|
|
|
2586
|
|
810
|
|
|
|
|
|
|
|
811
|
1
|
|
|
|
|
23
|
return $seq_n; |
812
|
|
|
|
|
|
|
} |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
815
|
|
|
|
|
|
|
1; |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
__END__ |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=pod |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
=head1 NAME |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
Bio::MUST::Core::Ali - Multiple sequence alignment |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
=head1 VERSION |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
version 0.212530 |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
=head1 SYNOPSIS |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
#!/usr/bin/env perl |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
use Modern::Perl '2011'; |
834
|
|
|
|
|
|
|
# same as: |
835
|
|
|
|
|
|
|
# use strict; |
836
|
|
|
|
|
|
|
# use warnings; |
837
|
|
|
|
|
|
|
# use feature qw(say); |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
use Bio::MUST::Core; |
840
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# read Ali form disk |
843
|
|
|
|
|
|
|
my $ali = Ali->load('example.ali'); |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
# get some properties |
846
|
|
|
|
|
|
|
say 'height: ' . $ali->height; # number of seqs |
847
|
|
|
|
|
|
|
say 'width: ' . $ali->width; # number of sites |
848
|
|
|
|
|
|
|
say '% miss: ' . $ali->perc_miss; # fraction of missing chars (%) |
849
|
|
|
|
|
|
|
say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl'; |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
# turn seqs to uppercase |
852
|
|
|
|
|
|
|
$ali->uc_seqs; |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
# filter out seqs with no species associated |
855
|
|
|
|
|
|
|
my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } ); |
856
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdList'; |
857
|
|
|
|
|
|
|
my $list = IdList->new( ids => \@seqs ); |
858
|
|
|
|
|
|
|
$ali->apply_list($list); |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
# alternatively: |
861
|
|
|
|
|
|
|
# $ali = Ali->new( seqs => \@seqs ); |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
# filter out gap-rich sites |
864
|
|
|
|
|
|
|
$ali->idealize(0.2); # min 20% non-gaps per site |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
# filter out non-informative sites |
867
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::SeqMask'; |
868
|
|
|
|
|
|
|
my $mask = SeqMask->parsimony_mask($ali); |
869
|
|
|
|
|
|
|
$ali->apply_mask($mask); |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
# write down reduced Ali to disk |
872
|
|
|
|
|
|
|
$ali->store('example-uc-genus-sp-20.ali'); |
873
|
|
|
|
|
|
|
$ali->store_fasta('example-uc-genus-sp-20.fasta'); |
874
|
|
|
|
|
|
|
|
875
|
|
|
|
|
|
|
# see below for additional methods |
876
|
|
|
|
|
|
|
# ... |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=head1 DESCRIPTION |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
This module implements the multiple sequence alignment (MSA) class and its |
881
|
|
|
|
|
|
|
methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects. |
882
|
|
|
|
|
|
|
Consequently, sequence ids do not absolutely need to be unique for it to |
883
|
|
|
|
|
|
|
work (though id uniqueness helps a lot for sequence access and filtering). |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
An Ali knows whether it contains nucleotide or protein sequences, whether |
886
|
|
|
|
|
|
|
those are aligned or not, as well as its Seq count and exact width. All |
887
|
|
|
|
|
|
|
these properties are introspected on the fly, which means that, while they |
888
|
|
|
|
|
|
|
can be expensive to compute, they are always accurate. |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
This is important because an Ali provides methods for inserting, deleting |
891
|
|
|
|
|
|
|
and editing its Seq objects. Further, it does its best to maintain a semantic |
892
|
|
|
|
|
|
|
distinction between true gap states (encoded as '*') and missing regions of |
893
|
|
|
|
|
|
|
undetermined length (encoded as pure whitespace, for example at the end of a |
894
|
|
|
|
|
|
|
short sequence). |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
It also has methods for mapping its sequence ids (for example, before export |
897
|
|
|
|
|
|
|
to the PHYLIP format), as well as methods to selectively retain sequences |
898
|
|
|
|
|
|
|
and sites based on L<Bio::MUST::Core::IdList> and |
899
|
|
|
|
|
|
|
L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method |
900
|
|
|
|
|
|
|
discards shared gaps and optionally removes gap-rich sites only due to a |
901
|
|
|
|
|
|
|
tiny number of sequences. |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles |
904
|
|
|
|
|
|
|
meaningful whitespace and allows comment lines in the header) or be |
905
|
|
|
|
|
|
|
imported/exported from/to several popular MSA formats, such as plain FASTA, |
906
|
|
|
|
|
|
|
STOCKHOLM and PHYLIP. |
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
=head2 seqs |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
ArrayRef of L<Bio::MUST::Core::Seq> objects (optional) |
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
Most of the accessor methods described below are implemented by delegation |
915
|
|
|
|
|
|
|
to this public attribute using L<Moose::Meta::Attribute::Native/Moose native |
916
|
|
|
|
|
|
|
delegation>. Their documentation thus heavily borrows from the corresponding |
917
|
|
|
|
|
|
|
help pages. |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=head2 file |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
L<Path::Class::File> object (optional) |
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
This optional attribute is initialized by class methods that C<load> an Ali |
924
|
|
|
|
|
|
|
from disk. It is meant to improve the introspection capabilities of the Ali. |
925
|
|
|
|
|
|
|
For now, this attribute is not used by the C<store> methods, though it might |
926
|
|
|
|
|
|
|
provide them with a default value in the future. |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
=head2 guessing |
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
Boolean (optional) |
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
By default, an Ali object tries to guess whether it is aligned or not by |
933
|
|
|
|
|
|
|
looking for gap-like characters in any of its Seq objects (see |
934
|
|
|
|
|
|
|
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence). |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
When this smart behavior causes issues, one can disable it by unsetting this |
937
|
|
|
|
|
|
|
boolean attribute (see C<dont_guess> and C<guess> accessor methods). |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
=head2 comments |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
ArrayRef of strings (optional) |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
An Ali object is commentable, which means that it supports all the methods |
944
|
|
|
|
|
|
|
pertaining to comment lines described in |
945
|
|
|
|
|
|
|
L<Bio::MUST::Core::Roles::Commentable> (such as C<header>). |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
=head1 CONSTRUCTORS |
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
=head2 new |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
Default constructor (class method) returning a new Ali. |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
954
|
|
|
|
|
|
|
my $ali1 = Ali->new(); |
955
|
|
|
|
|
|
|
my @seqs = $ali->all_seqs; |
956
|
|
|
|
|
|
|
my $ali2 = Ali->new( seqs => \@seqs ); |
957
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>, |
959
|
|
|
|
|
|
|
C<file>, C<guessing> and C<comments>. |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
=head2 clone |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
Creates a deep copy (a clone) of the Ali. Returns the copy. |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
966
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
967
|
|
|
|
|
|
|
my $ali_copy = $ali->clone; |
968
|
|
|
|
|
|
|
# you can now mess with $ali_copy without affecting $ali |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
This method does not accept any arguments. |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
=head1 ACCESSORS |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
=head2 add_seq |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
Adds one (or more) new sequence(s) at the end of the Ali. Returns the new |
977
|
|
|
|
|
|
|
number of sequences of the Ali. |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Seq'; |
980
|
|
|
|
|
|
|
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); |
981
|
|
|
|
|
|
|
$ali->add_seq($new_seq); |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
This method accepts any number of arguments. |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=head2 get_seq |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
Returns a sequence of the Ali by its index. You can also use negative index |
988
|
|
|
|
|
|
|
numbers, just as with Perl's core array handling. If the specified sequence |
989
|
|
|
|
|
|
|
does not exist, this method will return C<undef>. |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
my $seq = $ali->get_seq($index); |
992
|
|
|
|
|
|
|
croak "Seq $index not found in Ali!" unless defined $seq; |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
This method accepts just one argument (and not an array slice). |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
=head2 get_seq_with_id |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
Returns a sequence of the Ali by its id. If the specified id is not unique, |
999
|
|
|
|
|
|
|
only the first matching sequence will be returned, whereas if no sequence |
1000
|
|
|
|
|
|
|
exists for the specified id, this method will return C<undef>. |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
my $id = 'Pyrus malus_3750@658052655'; |
1003
|
|
|
|
|
|
|
my $seq = $ali->get_seq_with_id($id); |
1004
|
|
|
|
|
|
|
croak "Seq $id not found in Ali!" unless defined $seq; |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
This method accepts just one argument. |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
=head2 set_seq |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
Given an index and a sequence, sets the specified Ali element to the |
1011
|
|
|
|
|
|
|
sequence. This method returns the new sequence at C<$index>. |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Seq'; |
1014
|
|
|
|
|
|
|
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); |
1015
|
|
|
|
|
|
|
$ali->set_seq($index, $new_seq); |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
This method requires two arguments. |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
=head2 delete_seq |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
Removes the sequence at the given index from the Ali. This method returns |
1022
|
|
|
|
|
|
|
the deleted sequence. If the specified sequence does not exist, it will |
1023
|
|
|
|
|
|
|
return C<undef> instead. |
1024
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
$ali->delete_seq($index); |
1026
|
|
|
|
|
|
|
|
1027
|
|
|
|
|
|
|
This method requires one argument. |
1028
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
=head2 insert_seq |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
Inserts a new sequence into the Ali at the given index. This method returns |
1032
|
|
|
|
|
|
|
the new sequence at C<$index>. |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Seq'; |
1035
|
|
|
|
|
|
|
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); |
1036
|
|
|
|
|
|
|
$ali->insert_seq($index, $new_seq); |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
This method requires two arguments. |
1039
|
|
|
|
|
|
|
|
1040
|
|
|
|
|
|
|
=head2 all_seqs |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
Returns all the sequences of the Ali (not an array reference). |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
my @seqs = $ali->all_seqs; |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
This method does not accept any arguments. |
1047
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
=head2 first_seq |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
Returns the first sequence of the Ali matching a given criterion, just like |
1051
|
|
|
|
|
|
|
L<List::Util>'s C<first> function. This method requires a subroutine |
1052
|
|
|
|
|
|
|
implementing the matching logic. |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
# emulate get_seq_with_id method |
1055
|
|
|
|
|
|
|
my $id2find = 'seq13'; |
1056
|
|
|
|
|
|
|
my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } ); |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
This method requires a single argument. |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
=head2 filter_seqs |
1061
|
|
|
|
|
|
|
|
1062
|
|
|
|
|
|
|
Returns every sequence of the Ali matching a given criterion, just like |
1063
|
|
|
|
|
|
|
Perl's core C<grep> function. This method requires a subroutine implementing |
1064
|
|
|
|
|
|
|
the matching logic. |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
# keep only long sequences (ignoring gaps and missing states) |
1067
|
|
|
|
|
|
|
my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } ); |
1068
|
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
This method requires a single argument. |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
=head2 all_new_seqs |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
Returns all the sequences of the Ali tagged as #NEW# (not an array reference). |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
my @new_seqs = $ali->all_new_seqs; |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
This method does not accept any arguments. |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
=head2 all_but_new_seqs |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
Returns all the sequences of the Ali except those tagged as #NEW# (not an array |
1082
|
|
|
|
|
|
|
reference). |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
my @preexisting_seqs = $ali->all_but_new_seqs; |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
This method does not accept any arguments. |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
=head2 all_seq_ids |
1089
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali |
1091
|
|
|
|
|
|
|
(not an array reference). This is only a convenience method. |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
use Test::Deeply; |
1094
|
|
|
|
|
|
|
my @ids1 = $ali->all_seq_ids; |
1095
|
|
|
|
|
|
|
my @ids2 = map { $_->seq_id } $ali->all_seqs; |
1096
|
|
|
|
|
|
|
is_deeply \@ids1, \@ids2, 'should be true'; |
1097
|
|
|
|
|
|
|
my @orgs = map { $_->org } @ids1; |
1098
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
This method does not accept any arguments. |
1100
|
|
|
|
|
|
|
|
1101
|
|
|
|
|
|
|
=head2 filename |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
Returns the stringified filename of the Ali. |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
This method does not accept any arguments. |
1106
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
=head2 guess |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
Turn on the smart detection of gaps (see C<guessing> attribute above). |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
This method does not accept any arguments. |
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=head2 dont_guess |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
Turn off the smart detection of gaps (see C<guessing> attribute above). |
1116
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1118
|
|
|
|
|
|
|
my $ali = Ali->load('ensembl.fasta'); |
1119
|
|
|
|
|
|
|
$ali->dont_guess; |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
This method does not accept any arguments. |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
=head1 PROPERTIES |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
=head2 has_uniq_ids |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
Returns true if all the sequence ids are unique. |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids; |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
This method does not accept any arguments. |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
=head2 is_protein |
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
Returns true if any sequence of the Ali looks like a protein. See |
1136
|
|
|
|
|
|
|
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence. |
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
say 'Your file includes nucleotide sequences' unless $ali->is_protein; |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
This method does not accept any arguments. |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
=head2 is_aligned |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
Returns true if any sequence of the Ali appears to be aligned. See |
1145
|
|
|
|
|
|
|
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence. |
1146
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
If the boolean attribute guessing is not set, always returns false. |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
carp 'Warning: file does not look aligned!' unless $ali->is_aligned; |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
This method does not accept any arguments. |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
=head2 count_seqs |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
Returns the number of sequences of the Ali. The alias method C<height> is |
1156
|
|
|
|
|
|
|
provided for convenience. |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
my $height = $ali->count_seqs; |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
This method does not accept any arguments. |
1161
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
=head2 width |
1163
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
Returns the width of the Ali (in characters). If the Ali is not aligned, |
1165
|
|
|
|
|
|
|
returns the length of the longest sequence instead. |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
To avoid potential bugs due to caching, this method dynamically computes the |
1168
|
|
|
|
|
|
|
Ali width at each call. Moreover, the Ali is always uniformized (see below) |
1169
|
|
|
|
|
|
|
beforehands to ensure accurate width value. Therefore, this method is |
1170
|
|
|
|
|
|
|
expensive and should not be called repeatedly (e.g., in a loop condition). |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
# you'd better looping through sites like this... |
1173
|
|
|
|
|
|
|
my $width = $ali->width; |
1174
|
|
|
|
|
|
|
for my $site (0..$width-1) { |
1175
|
|
|
|
|
|
|
... |
1176
|
|
|
|
|
|
|
} |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
This method does not accept any arguments. |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
=head2 seq_len_stats |
1181
|
|
|
|
|
|
|
|
1182
|
|
|
|
|
|
|
Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps). |
1183
|
|
|
|
|
|
|
The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max). |
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
This method does not accept any arguments. |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
=head2 perc_miss |
1188
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
Returns the percentage of missing (and gap-like) character states in the Ali. |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
As this method internally calls C<Ali::width>, the remarks above also apply. |
1192
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
my $miss_level = $ali->perc_miss; |
1194
|
|
|
|
|
|
|
|
1195
|
|
|
|
|
|
|
This method does not accept any arguments. |
1196
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
=head1 MUTATORS |
1198
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
=head2 uc_seqs |
1200
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
Turn all the sequences of the Ali to uppercase and returns it. |
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
This method does not accept any arguments. |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
=head2 recode_seqs |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
Recode all the sequences of the Ali and returns it. |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1210
|
|
|
|
|
|
|
my $ali = Ali->load('biased.ali'); |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
# set up RY recoding for suppressing codon bias |
1213
|
|
|
|
|
|
|
my %base_for = ( |
1214
|
|
|
|
|
|
|
A => 'A', G => 'A', # purines |
1215
|
|
|
|
|
|
|
C => 'C', T => 'C', # pyrimidines |
1216
|
|
|
|
|
|
|
); |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
my $ali_rec = $ali->recode_seqs( \%base_for ); |
1219
|
|
|
|
|
|
|
$ali_rec->store('biased_ry.ali'); |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
This method requires one argument. |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
=head2 degap_seqs |
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
Remove the gaps in all the sequences of the Ali and returns it. |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
This method does not accept any arguments. |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
=head2 spacify_seqs |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
Spacifies all the sequences of the Ali and returns it. See the corresponding |
1232
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1233
|
|
|
|
|
|
|
operation. |
1234
|
|
|
|
|
|
|
|
1235
|
|
|
|
|
|
|
This method does not accept any arguments. |
1236
|
|
|
|
|
|
|
|
1237
|
|
|
|
|
|
|
=head2 gapify_seqs |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
Gapifies all the sequences of the Ali and returns it. See the corresponding |
1240
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1241
|
|
|
|
|
|
|
operation. |
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
This method accepts an optional argument. |
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
=head2 trim_seqs |
1246
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
Trims all the sequences of the Ali and returns it. See the corresponding |
1248
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1249
|
|
|
|
|
|
|
operation. |
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
This method does not accept any arguments. |
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
=head2 pad_seqs |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
Pads all the sequences of the Ali and returns it. See the corresponding |
1256
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1257
|
|
|
|
|
|
|
operation. |
1258
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
This method does not accept any arguments. |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
=head2 uniformize |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
Performs the three gap-cleaning operations in turn on all the sequences of |
1264
|
|
|
|
|
|
|
the Ali and returns it, which ensures that it is semantically clean and |
1265
|
|
|
|
|
|
|
rectangular. |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
This is only a convenience method called internally by the Ali object before |
1268
|
|
|
|
|
|
|
selected methods (such as storage-like methods). However, it might prove |
1269
|
|
|
|
|
|
|
useful in some circumstances, hence it is not defined as private. |
1270
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1272
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1273
|
|
|
|
|
|
|
$ali->add_seq(...); |
1274
|
|
|
|
|
|
|
# more editing of the Ali sequences |
1275
|
|
|
|
|
|
|
$ali->uniformize; |
1276
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
This method does not accept any arguments. |
1278
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
=head2 clear_new_tags |
1280
|
|
|
|
|
|
|
|
1281
|
|
|
|
|
|
|
Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it. |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1284
|
|
|
|
|
|
|
my $ali = Ali->load('input-42.ali'); |
1285
|
|
|
|
|
|
|
$ali->clear_new_tags; |
1286
|
|
|
|
|
|
|
my @new_seqs = $ali->all_new_seqs; |
1287
|
|
|
|
|
|
|
# array should be empty |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
This method does not accept any arguments. |
1290
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
=head2 shorten_ids |
1292
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
Replaces all the sequence ids of the Ali by their abbreviated forms as |
1294
|
|
|
|
|
|
|
specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali. |
1295
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
Note that this method will work only if the sequence ids have not been |
1297
|
|
|
|
|
|
|
already shortened or modified in any way since the creation of the IdMapper. |
1298
|
|
|
|
|
|
|
Long ids without abbreviated forms in the IdMapper are left untouched. |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1301
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdMapper'; |
1302
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1303
|
|
|
|
|
|
|
my $mapper = IdMapper->std_mapper($ali, 'lcl|seq'); |
1304
|
|
|
|
|
|
|
$ali->shorten_ids($mapper); |
1305
|
|
|
|
|
|
|
$ali->store_fasta('input.4blast.fasta'); |
1306
|
|
|
|
|
|
|
# makeblastdb |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
# Note: the temp_fasta method does exactly that |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
This method requires one argument. |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
=head2 restore_ids |
1313
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
Replaces all the sequence ids of the Ali by their long forms as specified by |
1315
|
|
|
|
|
|
|
the passed L<Bio::MUST::Core::IdMapper> and returns the Ali. |
1316
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
Note that this method will work only if the sequence ids have been previously |
1318
|
|
|
|
|
|
|
abbreviated (see above) and have not been modified in any way since then. |
1319
|
|
|
|
|
|
|
Again, abbreviated ids without long forms in the IdMapper are left untouched. |
1320
|
|
|
|
|
|
|
|
1321
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdMapper'; |
1322
|
|
|
|
|
|
|
my $mapper = IdMapper->gi_mapper($ali); |
1323
|
|
|
|
|
|
|
$ali->shorten_ids($mapper); |
1324
|
|
|
|
|
|
|
... |
1325
|
|
|
|
|
|
|
$ali->restore_ids($mapper); |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
This method requires one argument. |
1328
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
=head2 apply_list |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
Selectively retains or discards sequences from the Ali based on the content |
1332
|
|
|
|
|
|
|
of the passed L<Bio::MUST::Core::IdList> object and returns the Ali. |
1333
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdList'; |
1335
|
|
|
|
|
|
|
my $list = IdList->load('interesting_seqs.idl'); |
1336
|
|
|
|
|
|
|
$ali->apply_list($list); # discard non-interesting seqs |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
This method requires one argument. |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
=head2 apply_mask |
1341
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
Selectively retains or discards sites from the Ali based on the content of |
1343
|
|
|
|
|
|
|
the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali. |
1344
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::SeqMask'; |
1346
|
|
|
|
|
|
|
my $variable_sites = SeqMask->variable_mask($ali); |
1347
|
|
|
|
|
|
|
$ali->apply_mask($variable_sites); # discard constant sites |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
This method requires one argument. |
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
=head2 idealize |
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
Computes and applies an ideal sequence mask to the Ali and returns it. This |
1354
|
|
|
|
|
|
|
is only a convenience method. |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
When invoked without arguments, it will discard the gaps that are |
1357
|
|
|
|
|
|
|
universally shared by all the sequences. Otherwise, the provided argument |
1358
|
|
|
|
|
|
|
corresponds to the threshold of the C<ideal_mask> method described in |
1359
|
|
|
|
|
|
|
L<Bio::MUST::Core::SeqMask>. |
1360
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdList'; |
1362
|
|
|
|
|
|
|
my $fast_seqs = IdList->load('fast_evolving_seqs.idl'); |
1363
|
|
|
|
|
|
|
my $seqs2keep = $fast_seqs->negative_list($ali); |
1364
|
|
|
|
|
|
|
$ali->apply_list($seqs2keep); # discard fast-evolving seqs |
1365
|
|
|
|
|
|
|
$ali->idealize; # discard newly shared gaps caused by fast seqs |
1366
|
|
|
|
|
|
|
|
1367
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1368
|
|
|
|
|
|
|
my $ali = Ali->load('hmm_based.ali'); |
1369
|
|
|
|
|
|
|
$ali->idealize(0.05); # discard insertions due to <5% of the seqs |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
This method accepts an optional argument. |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
=head1 MISC METHODS |
1374
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
=head2 gapmiss_regex |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
Returns a regular expression matching gaps and ambiguous or missing states. |
1378
|
|
|
|
|
|
|
The exact regex returned depends on the type of sequences in the Ali (nucl. or |
1379
|
|
|
|
|
|
|
proteins). |
1380
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
my $regex = $ali->gapmiss_regex; |
1382
|
|
|
|
|
|
|
my $first_seq = $ali->get_seq(0)->seq; |
1383
|
|
|
|
|
|
|
my $gapmiss_n = $first_seq =~ m/($regex)/xmsg; |
1384
|
|
|
|
|
|
|
say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites"; |
1385
|
|
|
|
|
|
|
|
1386
|
|
|
|
|
|
|
This method does not accept any arguments. |
1387
|
|
|
|
|
|
|
|
1388
|
|
|
|
|
|
|
=head2 map_coords |
1389
|
|
|
|
|
|
|
|
1390
|
|
|
|
|
|
|
Converts a set of site positions from Ali coordinates to coordinates of the |
1391
|
|
|
|
|
|
|
specified sequence (thereby ignoring positions due to gaps). Returns the |
1392
|
|
|
|
|
|
|
converted sites in sequence coordinates as an array refrence. |
1393
|
|
|
|
|
|
|
|
1394
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1395
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1396
|
|
|
|
|
|
|
my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125'; |
1397
|
|
|
|
|
|
|
my $ali_coords = [ 4, 25, 73, 89, 104, 116 ]; |
1398
|
|
|
|
|
|
|
my $seq_coords = $ali->map_coords($id, $ali_coords); |
1399
|
|
|
|
|
|
|
# $seq_coords is [ 3, 23, 59, 71, 71, 74 ] |
1400
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
This method requires two arguments: the id of a sequence and an array |
1402
|
|
|
|
|
|
|
reference of input sites in Ali coordinates. |
1403
|
|
|
|
|
|
|
|
1404
|
|
|
|
|
|
|
=head1 I/O METHODS |
1405
|
|
|
|
|
|
|
|
1406
|
|
|
|
|
|
|
=head2 load |
1407
|
|
|
|
|
|
|
|
1408
|
|
|
|
|
|
|
Class method (constructor) returning a new Ali read from disk. This method |
1409
|
|
|
|
|
|
|
will transparently import plain FASTA files in addition to the MUST |
1410
|
|
|
|
|
|
|
pseudo-FASTA format (ALI files). |
1411
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
use Test::Deeply; |
1413
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1414
|
|
|
|
|
|
|
my $ali1 = Ali->load('example.ali'); |
1415
|
|
|
|
|
|
|
my $ali2 = Ali->load('example.fasta'); |
1416
|
|
|
|
|
|
|
my @seqs1 = $ali1->all_seqs; |
1417
|
|
|
|
|
|
|
my @seqs2 = $ali2->all_seqs; |
1418
|
|
|
|
|
|
|
is_deeply, \@seqs1, \@seqs2, 'should be true'; |
1419
|
|
|
|
|
|
|
|
1420
|
|
|
|
|
|
|
This method requires one argument. |
1421
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
=head2 store |
1423
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files). |
1425
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
Note that the ALI format is only used when the suffix of the outfile name is |
1427
|
|
|
|
|
|
|
'.ali'. In all other cases (including lack of suffix), this method |
1428
|
|
|
|
|
|
|
automatically forwards the call to C<store_fasta>. |
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
$ali->store('output.ali'); |
1431
|
|
|
|
|
|
|
# output.ali is written in ALI format |
1432
|
|
|
|
|
|
|
$ali->store('output.fasta'); |
1433
|
|
|
|
|
|
|
# output.fasta is written in FASTA format |
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
This method requires one argument (but see C<store_fasta> in case of automatic |
1436
|
|
|
|
|
|
|
forwarding of the method call). |
1437
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
=head2 store_fasta |
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
Writes the Ali to disk in the plain FASTA format. |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
For compatibility purposes, this method automatically fetches sequence ids |
1443
|
|
|
|
|
|
|
using the C<foreign_id> method instead of the native C<full_id> method, both |
1444
|
|
|
|
|
|
|
described in L<Bio::MUST::Core::SeqId>. |
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
$ali->store_fasta( 'output.fasta' ); |
1447
|
|
|
|
|
|
|
$ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} ); |
1448
|
|
|
|
|
|
|
|
1449
|
|
|
|
|
|
|
This method requires one argument and accepts a second optional argument |
1450
|
|
|
|
|
|
|
controlling the output format. It is a hash reference that may contain one |
1451
|
|
|
|
|
|
|
or more of the following keys: |
1452
|
|
|
|
|
|
|
|
1453
|
|
|
|
|
|
|
- clean: replace all ambiguous and missing states by C<X> (default: false) |
1454
|
|
|
|
|
|
|
- degap: boolean value controlling degapping (default: false) |
1455
|
|
|
|
|
|
|
- chunk: line width (default is 60 chars; negative values means no wrap) |
1456
|
|
|
|
|
|
|
|
1457
|
|
|
|
|
|
|
Finally, it is possible to fine-tune the behavior of the C<clean> option by |
1458
|
|
|
|
|
|
|
providing another character than C<X> through the C<gapify> key. This can be |
1459
|
|
|
|
|
|
|
useful to replace all ambiguous and missing states by gaps, as shown below: |
1460
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
$ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } ); |
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
=head2 temp_fasta |
1464
|
|
|
|
|
|
|
|
1465
|
|
|
|
|
|
|
Writes a temporary copy of the Ali to disk in the plain FASTA format using |
1466
|
|
|
|
|
|
|
numeric sequence ids and returns the name of the temporary file. This is |
1467
|
|
|
|
|
|
|
only a convenience method. |
1468
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
In list context, returns the IdMapper object along with temporary filename. |
1470
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
my $infile = $ali->temp_fasta( { degap => 1 } ); |
1472
|
|
|
|
|
|
|
my $output = `script.sh $infile`; |
1473
|
|
|
|
|
|
|
... |
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
This method accepts the same optional argument hash as C<store_fasta>. |
1476
|
|
|
|
|
|
|
However, an additional option (C<id_prefix>) is available to control the way |
1477
|
|
|
|
|
|
|
abbreviated sequence ids are prefixed by the C<std_mapper> method (see |
1478
|
|
|
|
|
|
|
L<Bio::MUST::Core::Listable>). |
1479
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } ); |
1481
|
|
|
|
|
|
|
my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } ); |
1482
|
|
|
|
|
|
|
|
1483
|
|
|
|
|
|
|
=head2 load_phylip |
1484
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
Class method (constructor) returning a new Ali read from a file in the |
1486
|
|
|
|
|
|
|
PHYLIP format. Both sequential and interleaved formats are supported. |
1487
|
|
|
|
|
|
|
|
1488
|
|
|
|
|
|
|
The only constraint is that sequence ids MUST NOT contain whitespace and be |
1489
|
|
|
|
|
|
|
followed by at least one whitespace character. This means that some old-school |
1490
|
|
|
|
|
|
|
PHYLIP files not following this convention will not be processed correctly. |
1491
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
When using the interleaved format, sequence ids may or may not be repeated in |
1493
|
|
|
|
|
|
|
each block. |
1494
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1496
|
|
|
|
|
|
|
my $ali = Ali->load_phylip('phylip.phy'); |
1497
|
|
|
|
|
|
|
say $ali->count_seqs; |
1498
|
|
|
|
|
|
|
say $ali->width; |
1499
|
|
|
|
|
|
|
|
1500
|
|
|
|
|
|
|
# outputs: |
1501
|
|
|
|
|
|
|
# 10 |
1502
|
|
|
|
|
|
|
# 709 |
1503
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
This method requires one argument. |
1505
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
=head2 store_phylip |
1507
|
|
|
|
|
|
|
|
1508
|
|
|
|
|
|
|
Writes the Ali to disk in the interleaved (or sequential) PHYLIP format. |
1509
|
|
|
|
|
|
|
|
1510
|
|
|
|
|
|
|
To ensure maximal flexibility, this method fetches sequence ids using the |
1511
|
|
|
|
|
|
|
native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but |
1512
|
|
|
|
|
|
|
truncates them to 10 characters, as expected by the original PHYLIP software |
1513
|
|
|
|
|
|
|
package. No other tinkering is carried out on the ids. Thus, if the ids |
1514
|
|
|
|
|
|
|
contain whitespace or are not unique in their 10 first characters, it is |
1515
|
|
|
|
|
|
|
advised to first map them using one of the constructors in |
1516
|
|
|
|
|
|
|
L<Bio::MUST::Core::IdMapper>. |
1517
|
|
|
|
|
|
|
|
1518
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1519
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdMapper'; |
1520
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1521
|
|
|
|
|
|
|
my $mapper = IdMapper->std_mapper($ali); |
1522
|
|
|
|
|
|
|
$ali->shorten_ids($mapper); |
1523
|
|
|
|
|
|
|
$ali->store_phylip( 'input.phy', { chunk => 50 } ); |
1524
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
This method requires one argument and accepts a second optional argument |
1526
|
|
|
|
|
|
|
controlling the output format. It is a hash reference that may contain one |
1527
|
|
|
|
|
|
|
or more of the following keys: |
1528
|
|
|
|
|
|
|
|
1529
|
|
|
|
|
|
|
- short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes) |
1530
|
|
|
|
|
|
|
- clean: replace all ambiguous and missing states by 'X' (default: false) |
1531
|
|
|
|
|
|
|
- chunk: line width (default: 60 chars; negative values means no wrap) |
1532
|
|
|
|
|
|
|
|
1533
|
|
|
|
|
|
|
To store the Ali in PHYLIP sequential format, specify a negative chunk (-1). |
1534
|
|
|
|
|
|
|
|
1535
|
|
|
|
|
|
|
=head2 load_stockholm |
1536
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
Class method (constructor) returning a new Ali read from a file in the |
1538
|
|
|
|
|
|
|
STOCKHOLM format. =GF comments are retained (see above) but not the other |
1539
|
|
|
|
|
|
|
comment classes (=GS, =GR and =GC). |
1540
|
|
|
|
|
|
|
|
1541
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1542
|
|
|
|
|
|
|
my $ali = Ali->load('upsk.stockholm'); |
1543
|
|
|
|
|
|
|
say $ali->header; |
1544
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
# outputs: |
1546
|
|
|
|
|
|
|
# ID UPSK |
1547
|
|
|
|
|
|
|
# SE Predicted; Infernal |
1548
|
|
|
|
|
|
|
# SS Published; PMID 9223489 |
1549
|
|
|
|
|
|
|
# RN [1] |
1550
|
|
|
|
|
|
|
# RM 9223489 |
1551
|
|
|
|
|
|
|
# RT The role of the pseudoknot at the 3' end of turnip yellow mosaic |
1552
|
|
|
|
|
|
|
# RT virus RNA in minus-strand synthesis by the viral RNA-dependent RNA |
1553
|
|
|
|
|
|
|
# RT polymerase. |
1554
|
|
|
|
|
|
|
# RA Deiman BA, Kortlever RM, Pleij CW; |
1555
|
|
|
|
|
|
|
# RL J Virol 1997;71:5990-5996. |
1556
|
|
|
|
|
|
|
|
1557
|
|
|
|
|
|
|
This method requires one argument. |
1558
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
=head2 instant_store |
1560
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
Class method intended to transform a large sequence file read from disk |
1562
|
|
|
|
|
|
|
without loading it in memory. This method will transparently process plain |
1563
|
|
|
|
|
|
|
FASTA files in addition to the MUST pseudo-FASTA format (ALI files). |
1564
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
my $chunk = 200; |
1566
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
my $split = sub { |
1568
|
|
|
|
|
|
|
my $seq = shift; |
1569
|
|
|
|
|
|
|
my $base_id = ( split /\s+/xms, $seq->full_id )[0]; |
1570
|
|
|
|
|
|
|
my $max_pos = $seq->seq_len - $chunk; |
1571
|
|
|
|
|
|
|
my $n = 0; |
1572
|
|
|
|
|
|
|
my $out_str; |
1573
|
|
|
|
|
|
|
for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) { |
1574
|
|
|
|
|
|
|
$out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos, |
1575
|
|
|
|
|
|
|
$pos + $chunk <= $max_pos ? $chunk : 2 * $chunk |
1576
|
|
|
|
|
|
|
) . "\n"; |
1577
|
|
|
|
|
|
|
} |
1578
|
|
|
|
|
|
|
return $out_str; |
1579
|
|
|
|
|
|
|
}; |
1580
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1582
|
|
|
|
|
|
|
|
1583
|
|
|
|
|
|
|
Ali->instant_store( |
1584
|
|
|
|
|
|
|
'outfile.fasta', { infile => 'infile.fasta', coderef => $split } |
1585
|
|
|
|
|
|
|
); |
1586
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
This method requires two arguments. The sercond is a hash reference that must |
1588
|
|
|
|
|
|
|
contain the following keys: |
1589
|
|
|
|
|
|
|
- infile: input sequence file |
1590
|
|
|
|
|
|
|
- coderef: subroutine implementing the transforming logic |
1591
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
=head2 instant_count |
1593
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
Class method returning the number of seqs in any sequence file read from disk |
1595
|
|
|
|
|
|
|
without loading it in memory. This method will transparently process plain |
1596
|
|
|
|
|
|
|
FASTA files in addition to the MUST pseudo-FASTA format (ALI files). |
1597
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1599
|
|
|
|
|
|
|
my $seq_n = Ali->instant_count('input.ali'); |
1600
|
|
|
|
|
|
|
say $seq_n; |
1601
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
=head1 ALIASES |
1603
|
|
|
|
|
|
|
|
1604
|
|
|
|
|
|
|
=head2 height |
1605
|
|
|
|
|
|
|
|
1606
|
|
|
|
|
|
|
Alias for C<count_seqs> method. For API consistency. |
1607
|
|
|
|
|
|
|
|
1608
|
|
|
|
|
|
|
=head1 AUTHOR |
1609
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
Denis BAURAIN <denis.baurain@uliege.be> |
1611
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
1613
|
|
|
|
|
|
|
|
1614
|
|
|
|
|
|
|
=for stopwords Catherine COLSON Arnaud DI FRANCO |
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
=over 4 |
1617
|
|
|
|
|
|
|
|
1618
|
|
|
|
|
|
|
=item * |
1619
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
Catherine COLSON <ccolson@doct.uliege.be> |
1621
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
=item * |
1623
|
|
|
|
|
|
|
|
1624
|
|
|
|
|
|
|
Arnaud DI FRANCO <arnaud.difranco@gmail.com> |
1625
|
|
|
|
|
|
|
|
1626
|
|
|
|
|
|
|
=back |
1627
|
|
|
|
|
|
|
|
1628
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
1629
|
|
|
|
|
|
|
|
1630
|
|
|
|
|
|
|
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. |
1631
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
1633
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
1634
|
|
|
|
|
|
|
|
1635
|
|
|
|
|
|
|
=cut |