line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::MUST::Core::Ali; |
2
|
|
|
|
|
|
|
# ABSTRACT: Multiple sequence alignment |
3
|
|
|
|
|
|
|
# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be> |
4
|
|
|
|
|
|
|
# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com> |
5
|
|
|
|
|
|
|
$Bio::MUST::Core::Ali::VERSION = '0.212670'; |
6
|
17
|
|
|
17
|
|
13671
|
use Moose; |
|
17
|
|
|
|
|
47
|
|
|
17
|
|
|
|
|
129
|
|
7
|
17
|
|
|
17
|
|
92080
|
use namespace::autoclean; |
|
17
|
|
|
|
|
44
|
|
|
17
|
|
|
|
|
212
|
|
8
|
|
|
|
|
|
|
|
9
|
17
|
|
|
17
|
|
1661
|
use autodie; |
|
17
|
|
|
|
|
66
|
|
|
17
|
|
|
|
|
154
|
|
10
|
17
|
|
|
17
|
|
91106
|
use feature qw(say); |
|
17
|
|
|
|
|
64
|
|
|
17
|
|
|
|
|
1478
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# use Smart::Comments; |
13
|
|
|
|
|
|
|
|
14
|
17
|
|
|
17
|
|
136
|
use Carp; |
|
17
|
|
|
|
|
40
|
|
|
17
|
|
|
|
|
1233
|
|
15
|
17
|
|
|
17
|
|
132
|
use File::Temp; |
|
17
|
|
|
|
|
39
|
|
|
17
|
|
|
|
|
1437
|
|
16
|
17
|
|
|
17
|
|
139
|
use List::AllUtils qw(uniq indexes sum0); |
|
17
|
|
|
|
|
47
|
|
|
17
|
|
|
|
|
982
|
|
17
|
17
|
|
|
17
|
|
118
|
use Path::Class qw(file); |
|
17
|
|
|
|
|
53
|
|
|
17
|
|
|
|
|
837
|
|
18
|
17
|
|
|
17
|
|
112
|
use POSIX qw(ceil floor); |
|
17
|
|
|
|
|
46
|
|
|
17
|
|
|
|
|
129
|
|
19
|
17
|
|
|
17
|
|
11520
|
use Statistics::Descriptive; |
|
17
|
|
|
|
|
373152
|
|
|
17
|
|
|
|
|
701
|
|
20
|
17
|
|
|
17
|
|
8900
|
use Tie::IxHash; |
|
17
|
|
|
|
|
41782
|
|
|
17
|
|
|
|
|
645
|
|
21
|
|
|
|
|
|
|
|
22
|
17
|
|
|
17
|
|
149
|
use Bio::MUST::Core::Types; |
|
17
|
|
|
|
|
44
|
|
|
17
|
|
|
|
|
500
|
|
23
|
17
|
|
|
17
|
|
102
|
use Bio::MUST::Core::Constants qw(:gaps :files); |
|
17
|
|
|
|
|
44
|
|
|
17
|
|
|
|
|
3977
|
|
24
|
17
|
|
|
17
|
|
139
|
use aliased 'Bio::MUST::Core::Seq'; |
|
17
|
|
|
|
|
40
|
|
|
17
|
|
|
|
|
167
|
|
25
|
17
|
|
|
17
|
|
3904
|
use aliased 'Bio::MUST::Core::SeqMask'; |
|
17
|
|
|
|
|
48
|
|
|
17
|
|
|
|
|
71
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# TODO: add information about methods available in Ali-like objects |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# ATTRIBUTES |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
has 'seqs' => ( |
33
|
|
|
|
|
|
|
traits => ['Array'], |
34
|
|
|
|
|
|
|
is => 'ro', |
35
|
|
|
|
|
|
|
isa => 'ArrayRef[Bio::MUST::Core::Seq]', |
36
|
|
|
|
|
|
|
default => sub { [] }, |
37
|
|
|
|
|
|
|
writer => '_set_seqs', |
38
|
|
|
|
|
|
|
handles => { |
39
|
|
|
|
|
|
|
add_seq => 'push', |
40
|
|
|
|
|
|
|
get_seq => 'get', |
41
|
|
|
|
|
|
|
set_seq => 'set', |
42
|
|
|
|
|
|
|
delete_seq => 'delete', |
43
|
|
|
|
|
|
|
insert_seq => 'insert', |
44
|
|
|
|
|
|
|
count_seqs => 'count', |
45
|
|
|
|
|
|
|
all_seqs => 'elements', |
46
|
|
|
|
|
|
|
first_seq => 'first', |
47
|
|
|
|
|
|
|
filter_seqs => 'grep', |
48
|
|
|
|
|
|
|
}, |
49
|
|
|
|
|
|
|
); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
has 'file' => ( |
53
|
|
|
|
|
|
|
is => 'ro', |
54
|
|
|
|
|
|
|
isa => 'Bio::MUST::Core::Types::File', |
55
|
|
|
|
|
|
|
default => 'untitled.ali', |
56
|
|
|
|
|
|
|
coerce => 1, |
57
|
|
|
|
|
|
|
handles => { |
58
|
|
|
|
|
|
|
filename => 'stringify', |
59
|
|
|
|
|
|
|
}, |
60
|
|
|
|
|
|
|
); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
has 'guessing' => ( |
64
|
|
|
|
|
|
|
traits => ['Bool'], |
65
|
|
|
|
|
|
|
is => 'ro', |
66
|
|
|
|
|
|
|
isa => 'Bool', |
67
|
|
|
|
|
|
|
default => 1, |
68
|
|
|
|
|
|
|
handles => { |
69
|
|
|
|
|
|
|
dont_guess => 'unset', |
70
|
|
|
|
|
|
|
guess => 'set', |
71
|
|
|
|
|
|
|
}, |
72
|
|
|
|
|
|
|
); |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
with 'Bio::MUST::Core::Roles::Commentable', |
76
|
|
|
|
|
|
|
'Bio::MUST::Core::Roles::Listable'; |
77
|
|
|
|
|
|
|
with 'Bio::MUST::Core::Roles::Aliable'; ## no critic (ProhibitMultipleWiths) |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# CONSTRUCTORS |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
sub clone { |
84
|
1
|
|
|
1
|
1
|
4
|
my $self = shift; |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
return $self->new( |
87
|
|
|
|
|
|
|
comments => [ $self->all_comments ], |
88
|
1
|
|
|
|
|
48
|
seqs => [ map { $_->clone } $self->all_seqs ], |
|
250
|
|
|
|
|
712
|
|
89
|
|
|
|
|
|
|
file => file( $self->filename ), |
90
|
|
|
|
|
|
|
guessing => $self->guessing, |
91
|
|
|
|
|
|
|
); |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# ACCESSORS |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub get_seq_with_id { |
98
|
4
|
|
|
4
|
1
|
1985
|
my $self = shift; |
99
|
4
|
|
|
|
|
17
|
my $id = shift; |
100
|
|
|
|
|
|
|
|
101
|
4
|
|
|
10
|
|
200
|
my $seq = $self->first_seq( sub { $_->full_id eq $id } ); |
|
10
|
|
|
|
|
44
|
|
102
|
4
|
100
|
|
|
|
44
|
carp "[BMC] Warning: cannot find seq with id: $id; returning undef!" |
103
|
|
|
|
|
|
|
unless $seq; |
104
|
|
|
|
|
|
|
|
105
|
4
|
|
|
|
|
464
|
return $seq; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub all_new_seqs { |
110
|
6
|
|
|
6
|
1
|
20
|
return shift->filter_seqs( sub { $_->is_new } ); |
|
2
|
|
|
2
|
|
106
|
|
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub all_but_new_seqs { |
115
|
3
|
|
|
3
|
1
|
10
|
return shift->filter_seqs( sub { not $_->is_new } ); |
|
1
|
|
|
1
|
|
82
|
|
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub all_seq_ids { |
120
|
105
|
|
|
105
|
1
|
3943
|
my $self = shift; |
121
|
105
|
|
|
|
|
4420
|
return map { $_->seq_id } $self->all_seqs; |
|
1510
|
|
|
|
|
37655
|
|
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# PROPERTIES |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub has_uniq_ids { |
128
|
64
|
|
|
64
|
1
|
1562
|
my $self = shift; |
129
|
|
|
|
|
|
|
|
130
|
64
|
|
|
|
|
211
|
my @ids = map { $_->full_id } $self->all_seq_ids; |
|
941
|
|
|
|
|
24065
|
|
131
|
64
|
100
|
|
|
|
2415
|
return 1 if $self->count_seqs == uniq @ids; |
132
|
7
|
|
|
|
|
234
|
return 0; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
sub is_protein { |
137
|
77
|
|
|
77
|
1
|
2635
|
my $self = shift; |
138
|
77
|
100
|
|
96
|
|
3277
|
return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs; |
|
96
|
|
|
|
|
534
|
|
139
|
5
|
|
|
|
|
31
|
return 0; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub is_aligned { |
144
|
310
|
|
|
310
|
1
|
587
|
my $self = shift; |
145
|
310
|
100
|
|
|
|
10966
|
return 0 if not $self->guessing; |
146
|
307
|
100
|
|
8588
|
|
11599
|
return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs; |
|
8588
|
|
|
|
|
16726
|
|
147
|
116
|
|
|
|
|
802
|
return 0; |
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
sub width { |
152
|
189
|
|
|
189
|
1
|
433
|
my $self = shift; |
153
|
189
|
100
|
|
|
|
617
|
$self->uniformize if $self->is_aligned; # pad seqs for robustness |
154
|
189
|
|
|
|
|
945
|
return $self->_max_seq_len; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub _max_seq_len { |
158
|
365
|
|
|
365
|
|
678
|
my $self = shift; |
159
|
365
|
|
|
|
|
12299
|
my @lengths = map { $_->seq_len } $self->all_seqs; |
|
12841
|
|
|
|
|
384806
|
|
160
|
365
|
|
50
|
|
|
5034
|
return (List::AllUtils::max @lengths) // 0; # to avoid warnings |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
sub seq_len_stats { |
165
|
1
|
|
|
1
|
1
|
13
|
my $self = shift; |
166
|
|
|
|
|
|
|
|
167
|
1
|
|
|
|
|
44
|
my @lengths = map { $_->nomiss_seq_len } $self->all_seqs; |
|
10
|
|
|
|
|
27
|
|
168
|
1
|
|
|
|
|
16
|
my $stat = Statistics::Descriptive::Full->new; |
169
|
1
|
|
|
|
|
139
|
$stat->add_data( \@lengths ); |
170
|
1
|
|
|
|
|
259
|
my @quantiles = map { $stat->quantile($_) } 0..4; |
|
5
|
|
|
|
|
253
|
|
171
|
|
|
|
|
|
|
|
172
|
1
|
|
|
|
|
46
|
return @quantiles; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
sub perc_miss { |
177
|
2
|
|
|
2
|
1
|
6
|
my $self = shift; |
178
|
|
|
|
|
|
|
|
179
|
2
|
|
|
|
|
85
|
my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs; |
|
255
|
|
|
|
|
540
|
|
180
|
2
|
50
|
|
|
|
19
|
return 0.0 unless $n; |
181
|
|
|
|
|
|
|
|
182
|
2
|
|
|
|
|
11
|
my $total = $self->width * $self->count_seqs; |
183
|
2
|
|
|
|
|
19
|
return 100.0 * ($total - $n) / $total; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# MUTATORS |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub uc_seqs { |
190
|
1
|
|
|
1
|
1
|
12
|
my $self = shift; |
191
|
|
|
|
|
|
|
|
192
|
1
|
|
|
|
|
44
|
$_->uc for $self->all_seqs; |
193
|
1
|
|
|
|
|
4
|
return $self; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub recode_seqs { ## no critic (RequireArgUnpacking) |
198
|
1
|
|
|
1
|
1
|
20
|
my $self = shift; |
199
|
|
|
|
|
|
|
|
200
|
1
|
|
|
|
|
44
|
$_->recode(@_) for $self->all_seqs; |
201
|
1
|
|
|
|
|
7
|
return $self; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
sub degap_seqs { |
206
|
1
|
|
|
1
|
1
|
8
|
my $self = shift; |
207
|
|
|
|
|
|
|
|
208
|
1
|
|
|
|
|
39
|
$_->degap for $self->all_seqs; |
209
|
1
|
|
|
|
|
4
|
return $self; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub spacify_seqs { |
214
|
176
|
|
|
176
|
1
|
364
|
my $self = shift; |
215
|
|
|
|
|
|
|
|
216
|
176
|
|
|
|
|
6082
|
$_->spacify for $self->all_seqs; |
217
|
176
|
|
|
|
|
489
|
return $self; |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub gapify_seqs { ## no critic (RequireArgUnpacking) |
222
|
2
|
|
|
2
|
1
|
19
|
my $self = shift; |
223
|
2
|
|
|
|
|
78
|
$_->gapify(@_) for $self->all_seqs; |
224
|
2
|
|
|
|
|
8
|
return $self; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
sub trim_seqs { |
229
|
176
|
|
|
176
|
1
|
337
|
my $self = shift; |
230
|
|
|
|
|
|
|
|
231
|
176
|
|
|
|
|
6208
|
$_->trim for $self->all_seqs; |
232
|
176
|
|
|
|
|
691
|
return $self; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
sub pad_seqs { |
237
|
176
|
|
|
176
|
1
|
412
|
my $self = shift; |
238
|
|
|
|
|
|
|
|
239
|
176
|
|
|
|
|
538
|
my $bound = $self->_max_seq_len; |
240
|
176
|
|
|
|
|
5923
|
$_->pad_to($bound) for $self->all_seqs; |
241
|
176
|
|
|
|
|
550
|
return $self; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
sub uniformize { |
246
|
175
|
|
|
175
|
1
|
371
|
my $self = shift; |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# TODO: profile code (triggers?) to avoid useless multiple calls |
249
|
|
|
|
|
|
|
|
250
|
175
|
|
|
|
|
624
|
$self->spacify_seqs; |
251
|
175
|
|
|
|
|
795
|
$self->trim_seqs; |
252
|
175
|
|
|
|
|
699
|
$self->pad_seqs; |
253
|
|
|
|
|
|
|
|
254
|
175
|
|
|
|
|
343
|
return $self; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub clear_new_tags { |
259
|
1
|
|
|
1
|
1
|
4
|
my $self = shift; |
260
|
|
|
|
|
|
|
|
261
|
1
|
|
|
|
|
42
|
$_->clear_new_tag for $self->all_seqs; |
262
|
1
|
|
|
|
|
4
|
return $self; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub shorten_ids { ## no critic (RequireArgUnpacking) |
267
|
12
|
|
|
12
|
1
|
65
|
return shift->_change_ids_(1, @_); |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub restore_ids { ## no critic (RequireArgUnpacking) |
272
|
10
|
|
|
10
|
1
|
44
|
return shift->_change_ids_(0, @_); |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
sub _change_ids_ { |
276
|
22
|
|
|
22
|
|
59
|
my $self = shift; |
277
|
22
|
|
|
|
|
47
|
my $abbr = shift; |
278
|
22
|
|
|
|
|
45
|
my $mapper = shift; |
279
|
|
|
|
|
|
|
|
280
|
22
|
|
|
|
|
868
|
for my $seq ($self->all_seqs) { |
281
|
133
|
100
|
|
|
|
702
|
my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id ) |
282
|
|
|
|
|
|
|
: $mapper->long_id_for( $seq->full_id ); |
283
|
133
|
50
|
|
|
|
4236
|
$seq->set_seq_id($new_id) if $new_id; |
284
|
|
|
|
|
|
|
} # Note: leave id alone if not found |
285
|
|
|
|
|
|
|
|
286
|
22
|
|
|
|
|
115
|
return $self; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
sub apply_list { |
291
|
1
|
|
|
1
|
1
|
8
|
my $self = shift; |
292
|
1
|
|
|
|
|
4
|
my $list = shift; |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# loop through seqs from bottom to top |
295
|
|
|
|
|
|
|
# ... and delete seq from Ali if not listed in list |
296
|
1
|
|
|
|
|
42
|
my $seq_n = $self->count_seqs; |
297
|
|
|
|
|
|
|
|
298
|
1
|
|
|
|
|
8
|
for (my $i = $seq_n-1; $i >= 0; $i--) { |
299
|
250
|
100
|
|
|
|
8067
|
$self->delete_seq($i) |
300
|
|
|
|
|
|
|
unless $list->is_listed( $self->get_seq($i)->full_id ); |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
1
|
|
|
|
|
8
|
return $self; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub _premask_check { |
308
|
40
|
|
|
40
|
|
476
|
my $self = shift; |
309
|
40
|
|
|
|
|
95
|
my $mask = shift; |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
# warn of unaligned Ali |
312
|
40
|
50
|
|
|
|
160
|
carp '[BMC] Note: Ali does not look aligned!' |
313
|
|
|
|
|
|
|
. ' This might not be an issue if seqs are ultra-conserved!' |
314
|
|
|
|
|
|
|
unless $self->is_aligned; |
315
|
40
|
|
|
|
|
254
|
$self->uniformize; |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# warn of empty SeqMask |
318
|
|
|
|
|
|
|
carp '[BMC] Note: applying this mask will result in a zero-width Ali!' |
319
|
40
|
50
|
|
1845
|
|
1782
|
unless List::AllUtils::any { $_ } $mask->all_states; |
|
1845
|
|
|
|
|
2302
|
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
# check that SeqMask is compatible with Ali |
322
|
|
|
|
|
|
|
# potential bugs could come from constant sites etc |
323
|
40
|
|
|
|
|
14410
|
my $a_width = $self->width; |
324
|
40
|
|
|
|
|
1594
|
my $m_width = $mask->mask_len; |
325
|
40
|
100
|
|
|
|
182
|
carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!" |
326
|
|
|
|
|
|
|
. ' This might not be an issue if the mask results from blocks.' |
327
|
|
|
|
|
|
|
unless $a_width == $m_width; |
328
|
|
|
|
|
|
|
|
329
|
40
|
|
|
|
|
919
|
return; # return values of before subs are ignored |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
before 'apply_mask' => \&_premask_check; |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
sub apply_mask { |
335
|
|
|
|
|
|
|
my $self = shift; |
336
|
|
|
|
|
|
|
my $mask = shift; |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
# select sites for each seq using a precomputed array slice |
339
|
|
|
|
|
|
|
my @indexes = indexes { $_ } $mask->all_states; |
340
|
|
|
|
|
|
|
$_->_set_seq( join q{}, ( $_->all_states )[@indexes] ) |
341
|
|
|
|
|
|
|
for $self->all_seqs; |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
return $self; |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
sub idealize { ## no critic (RequireArgUnpacking) |
348
|
13
|
|
|
13
|
1
|
107
|
my $self = shift; |
349
|
13
|
|
|
|
|
154
|
return $self->apply_mask( SeqMask->ideal_mask($self, @_) ); |
350
|
|
|
|
|
|
|
} |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# MISC METHODS |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
sub gapmiss_regex { |
356
|
73
|
100
|
|
73
|
1
|
1152
|
return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS; |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
sub map_coords { |
361
|
1
|
|
|
1
|
1
|
14
|
my $self = shift; |
362
|
1
|
|
|
|
|
4
|
my $id = shift; |
363
|
1
|
|
|
|
|
3
|
my $coords_in = shift; |
364
|
|
|
|
|
|
|
|
365
|
1
|
|
|
|
|
6
|
my $seq = $self->get_seq_with_id($id); |
366
|
1
|
|
|
|
|
7
|
my @states = $seq->all_states; |
367
|
|
|
|
|
|
|
|
368
|
1
|
|
|
|
|
3
|
my $count = 0; |
369
|
1
|
|
|
|
|
2
|
my @index; |
370
|
1
|
|
|
|
|
4
|
for my $state (@states) { |
371
|
130
|
100
|
|
|
|
351
|
$count++ if $state !~ $GAP; |
372
|
130
|
|
|
|
|
222
|
push @index, $count; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
1
|
|
|
|
|
2
|
my @coords_out = map { $index[$_-1] } @{$coords_in}; |
|
6
|
|
|
|
|
14
|
|
|
1
|
|
|
|
|
3
|
|
376
|
|
|
|
|
|
|
|
377
|
1
|
|
|
|
|
14
|
return \@coords_out; |
378
|
|
|
|
|
|
|
} |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# ALIASES |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub height { |
384
|
0
|
|
|
0
|
1
|
0
|
return shift->count_seqs; |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# I/O METHODS |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub load { |
391
|
69
|
|
|
69
|
1
|
26466
|
my $class = shift; |
392
|
69
|
|
|
|
|
162
|
my $infile = shift; |
393
|
|
|
|
|
|
|
|
394
|
69
|
|
|
|
|
397
|
open my $in, '<', $infile; |
395
|
|
|
|
|
|
|
|
396
|
69
|
|
|
|
|
33740
|
my $ali = $class->new( file => $infile ); |
397
|
69
|
|
|
|
|
451
|
my $seq_id; |
398
|
|
|
|
|
|
|
my $seq; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
LINE: |
401
|
69
|
|
|
|
|
15067
|
while (my $line = <$in>) { |
402
|
4091
|
|
|
|
|
7888
|
chomp $line; |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# skip empty lines and process comments |
405
|
4091
|
100
|
100
|
|
|
26130
|
next LINE if $line =~ $EMPTY_LINE |
406
|
|
|
|
|
|
|
|| $ali->is_comment($line); |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
# at each '>' char... |
409
|
3944
|
|
|
|
|
14367
|
my ($defline) = $line =~ $DEF_LINE; |
410
|
3944
|
100
|
|
|
|
8598
|
if ($defline) { |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
# add current seq to ali (if any) |
413
|
1662
|
100
|
|
|
|
3365
|
if ($seq) { |
414
|
1593
|
|
|
|
|
48380
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
415
|
1593
|
|
|
|
|
54646
|
$ali->add_seq($new_seq); |
416
|
1593
|
|
|
|
|
2989
|
$seq = q{}; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
1662
|
|
|
|
|
2768
|
$seq_id = $defline; |
420
|
1662
|
|
|
|
|
24372
|
next LINE; |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
# elongate current seq (seqs can be broken on several lines) |
424
|
2282
|
|
|
|
|
10687
|
$seq .= $line; |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# add last seq to ali (if any) |
428
|
69
|
50
|
|
|
|
283
|
if ($seq) { |
429
|
69
|
|
|
|
|
2410
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
430
|
69
|
|
|
|
|
2428
|
$ali->add_seq($new_seq); |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
69
|
|
|
|
|
1797
|
return $ali; |
434
|
|
|
|
|
|
|
} |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
before qr{\Astore}xms => sub { |
438
|
|
|
|
|
|
|
my $self = shift; |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# perform pre-storage duties |
441
|
|
|
|
|
|
|
carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids; |
442
|
|
|
|
|
|
|
$self->uniformize if $self->is_aligned; |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
return; |
445
|
|
|
|
|
|
|
}; |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub store { ## no critic (RequireArgUnpacking) |
449
|
|
|
|
|
|
|
my $self = shift; |
450
|
|
|
|
|
|
|
my $outfile = shift; |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# automatically redirect to store_fasta for non-Ali outfile names |
453
|
|
|
|
|
|
|
return $self->store_fasta($outfile, @_) # note the currying |
454
|
|
|
|
|
|
|
unless $outfile =~ $ALI_SUFFIX; |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
open my $out, '>', $outfile; |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
print {$out} $self->header; |
459
|
|
|
|
|
|
|
for my $seq ($self->all_seqs) { |
460
|
|
|
|
|
|
|
say {$out} '>' . $seq->full_id; |
461
|
|
|
|
|
|
|
say {$out} $seq->seq; |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
return; |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
sub store_fasta { |
469
|
|
|
|
|
|
|
my $self = shift; |
470
|
|
|
|
|
|
|
my $outfile = shift; |
471
|
|
|
|
|
|
|
my $args = shift // {}; # HashRef (should not be empty...) |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
my $degap = $args->{degap} // 0; |
474
|
|
|
|
|
|
|
my $clean = $args->{clean} // 0; |
475
|
|
|
|
|
|
|
my $gap = $args->{gapify} // 'X'; |
476
|
|
|
|
|
|
|
my $chunk = $args->{chunk} // 60; |
477
|
|
|
|
|
|
|
my $nowrap = $chunk < 0 ? 1 : 0; |
478
|
|
|
|
|
|
|
my $is_aligned = $self->is_aligned; |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
open my $out, '>', $outfile; |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
for my $seq ($self->all_seqs) { |
483
|
|
|
|
|
|
|
say {$out} '>' . $seq->foreign_id; |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# optionally clean and/or degap seq |
486
|
|
|
|
|
|
|
$seq = $seq->clone if $clean || $degap; # clone seq only if needed |
487
|
|
|
|
|
|
|
$seq->gapify($gap) if $clean; |
488
|
|
|
|
|
|
|
$seq->degap if $degap; |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
my $width = $seq->seq_len; |
491
|
|
|
|
|
|
|
$chunk = $width if $nowrap; # optionally disable wrap |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
my $str = $seq->wrapped_str($chunk); |
494
|
|
|
|
|
|
|
$str =~ s{$GAP}{-}xmsg if $is_aligned; # restore '-' when aligned |
495
|
|
|
|
|
|
|
print {$out} $str; |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
return; |
499
|
|
|
|
|
|
|
} |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub temp_fasta { |
503
|
9
|
|
|
9
|
1
|
25
|
my $self = shift; |
504
|
9
|
|
50
|
|
|
31
|
my $args = shift // {}; # HashRef (should not be empty...) |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# abbreviate ids (possibly using a custom prefix) |
507
|
9
|
|
|
|
|
23
|
my $prefix = $args->{id_prefix}; |
508
|
9
|
|
|
|
|
22
|
delete $args->{id_prefix}; # not necessarily required |
509
|
9
|
|
|
|
|
54
|
my $mapper = $self->std_mapper($prefix); |
510
|
9
|
|
|
|
|
51
|
$self->shorten_ids($mapper); |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
# write temporary .fasta file using standard abbr_ids |
513
|
|
|
|
|
|
|
# ...and restore long_ids afterwards |
514
|
9
|
|
|
|
|
94
|
my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta'); |
515
|
9
|
|
|
|
|
5348
|
$self->store_fasta($out->filename, $args); |
516
|
9
|
|
|
|
|
80
|
$self->restore_ids($mapper); |
517
|
|
|
|
|
|
|
### filename: $out->filename |
518
|
|
|
|
|
|
|
|
519
|
9
|
100
|
|
|
|
77
|
return wantarray ? ($out->filename, $mapper) : $out->filename; |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
# TODO: add an option to remove trailing '_' in ids? |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
sub load_phylip { |
526
|
105
|
|
|
105
|
1
|
7455
|
my $class = shift; |
527
|
105
|
|
|
|
|
241
|
my $infile = shift; |
528
|
|
|
|
|
|
|
|
529
|
105
|
|
|
|
|
603
|
open my $in, '<', $infile; |
530
|
|
|
|
|
|
|
|
531
|
105
|
|
|
|
|
37878
|
my $ali = $class->new( file => $infile ); |
532
|
105
|
|
|
|
|
254
|
my $seq_n; |
533
|
|
|
|
|
|
|
my $site_n; |
534
|
105
|
|
|
|
|
196
|
my $n = 0; |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
LINE: |
537
|
105
|
|
|
|
|
76005
|
while (my $line = <$in>) { |
538
|
8557
|
|
|
|
|
18371
|
chomp $line; |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# skip empty lines |
541
|
8557
|
100
|
|
|
|
44127
|
next LINE if $line =~ $EMPTY_LINE; |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# get matrix dimensions |
544
|
8524
|
100
|
|
|
|
32752
|
if ($line =~ $DIM_LINE) { |
545
|
105
|
|
|
|
|
548
|
($seq_n, $site_n) = ($1, $2); |
546
|
105
|
|
|
|
|
1623
|
next LINE; |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
# process regular line (identifier is optional) |
550
|
8419
|
|
|
|
|
43458
|
my ($seq_id, $seq) = $line =~ $PHY_LINE; |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# should never happen... |
553
|
8419
|
50
|
|
|
|
19424
|
croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!" |
554
|
|
|
|
|
|
|
unless $seq; |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
# delete optional spaces |
557
|
8419
|
|
|
|
|
18587
|
$seq =~ tr/ //d; |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# process first seq block |
560
|
8419
|
100
|
|
|
|
278129
|
if ($ali->count_seqs < $seq_n) { |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# seq_id are mandatory in first block |
563
|
8089
|
50
|
|
|
|
17295
|
croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!" |
564
|
|
|
|
|
|
|
unless $seq_id; |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# store first seq chunk along with seq_id |
567
|
8089
|
|
|
|
|
201665
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
568
|
8089
|
|
|
|
|
249422
|
$ali->add_seq($new_seq); |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
# process remaining seq blocks |
572
|
|
|
|
|
|
|
else { |
573
|
330
|
|
|
|
|
10763
|
my $curr_seq = $ali->get_seq($n); |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
# elongate current with partial seq chunk mistaken as seq_id |
576
|
330
|
100
|
100
|
|
|
1038
|
if ($seq_id && $seq_id ne $curr_seq->full_id) { |
577
|
110
|
|
|
|
|
3689
|
$curr_seq->append_seq($seq_id); |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
# elongate current seq with new seq chunk |
581
|
330
|
|
|
|
|
11025
|
$curr_seq->append_seq($seq); |
582
|
|
|
|
|
|
|
} |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
# prepare reading of next seq or next block |
585
|
8419
|
100
|
|
|
|
86033
|
$n = 0 if ++$n == $seq_n; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
105
|
|
|
|
|
643
|
my $width = $ali->width; |
589
|
105
|
50
|
|
|
|
482
|
croak "[BMC] Error: unexpected site number in PHYLIP file: $width;" |
590
|
|
|
|
|
|
|
. ' aborting!' if $width != $site_n; |
591
|
|
|
|
|
|
|
|
592
|
105
|
|
|
|
|
2940
|
return $ali; |
593
|
|
|
|
|
|
|
} |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html |
596
|
|
|
|
|
|
|
# The information for each species follows, starting with a ten-character |
597
|
|
|
|
|
|
|
# species name (which can include blanks and some punctuation marks), and |
598
|
|
|
|
|
|
|
# continuing with the characters for that species. The name should be on the |
599
|
|
|
|
|
|
|
# same line as the first character of the data for that species. (...) The |
600
|
|
|
|
|
|
|
# name should be ten characters in length, filled out to the full ten |
601
|
|
|
|
|
|
|
# characters by blanks if shorter. Any printable ASCII/ISO character is |
602
|
|
|
|
|
|
|
# allowed in the name, except for parentheses ("(" and ")"), square brackets |
603
|
|
|
|
|
|
|
# ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that |
604
|
|
|
|
|
|
|
# in these sequences we have a blank every ten sites to make them easier to |
605
|
|
|
|
|
|
|
# read: any such blanks are allowed. The blank line which separates the two |
606
|
|
|
|
|
|
|
# groups of lines (the ones containing sites 1-20 and ones containing sites |
607
|
|
|
|
|
|
|
# 21-39) may or may not be present. |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command |
610
|
|
|
|
|
|
|
# The input sequence file is a standard PHYLIP file of aligned DNA or |
611
|
|
|
|
|
|
|
# amino-acids sequences. (...) The maximum number of characters in species |
612
|
|
|
|
|
|
|
# name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed |
613
|
|
|
|
|
|
|
# within sequence names because the Newick tree format makes special use of |
614
|
|
|
|
|
|
|
# these symbols. However, blanks (one or more) MUST appear at the end of each |
615
|
|
|
|
|
|
|
# species name. |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
# TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf |
618
|
|
|
|
|
|
|
# A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-", |
619
|
|
|
|
|
|
|
# dots ".", underscores "_", or some of "/", "?", "*", "+". No space is |
620
|
|
|
|
|
|
|
# allowed inside the names. The first name character must be a letter or an |
621
|
|
|
|
|
|
|
# underscore. A sequence fragment may start behind position 10 after a |
622
|
|
|
|
|
|
|
# sequence name, or anywhere in a line without a name. |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
# TREE-PUZZLE [data/globin.a] |
625
|
|
|
|
|
|
|
# 7 128 |
626
|
|
|
|
|
|
|
# HBB_HUMAN HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN |
627
|
|
|
|
|
|
|
# HBB_HORSE QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN |
628
|
|
|
|
|
|
|
# HBA_HUMAN VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS |
629
|
|
|
|
|
|
|
# HBA_HORSE VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS |
630
|
|
|
|
|
|
|
# MYG_PHYCA VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS |
631
|
|
|
|
|
|
|
# GLB5_PETMA PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS |
632
|
|
|
|
|
|
|
# LGB2_LUPLU ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf |
635
|
|
|
|
|
|
|
# The input alignment format of RAxML is relaxed interleaved or sequential |
636
|
|
|
|
|
|
|
# PHYLIP. "Relaxed" means that sequence names can be of variable length |
637
|
|
|
|
|
|
|
# between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names |
638
|
|
|
|
|
|
|
# taxon names that contain any form of whitespace character, like blanks, |
639
|
|
|
|
|
|
|
# tabulators, and carriage returns, as well as one of the following prohibited |
640
|
|
|
|
|
|
|
# characters: :,();[]. |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
# PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf |
643
|
|
|
|
|
|
|
# Taxon names may contain more than 10 characters. Avoid special characters |
644
|
|
|
|
|
|
|
# such as ';' or ')', which will create problems when parsing trees. The best |
645
|
|
|
|
|
|
|
# is to only use letters, digits and '_'. Sequences can be interrupted by |
646
|
|
|
|
|
|
|
# space and tab, but not by return characters. Be sure that the lengths of the |
647
|
|
|
|
|
|
|
# sequences are all the same, and identical to the lengths indicated in the |
648
|
|
|
|
|
|
|
# header. Sequences can be interleaved, in which case the taxon names may or |
649
|
|
|
|
|
|
|
# may not be repeated in each block. |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
sub store_phylip { |
653
|
|
|
|
|
|
|
my $self = shift; |
654
|
|
|
|
|
|
|
my $outfile = shift; |
655
|
|
|
|
|
|
|
my $args = shift // {}; # HashRef (should not be empty...) |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
my $short = $args->{short} // 1; |
658
|
|
|
|
|
|
|
my $clean = $args->{clean} // 0; |
659
|
|
|
|
|
|
|
my $chunk = $args->{chunk} // 60; |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
open my $out, '>', $outfile; |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
# print data matrix dimensions |
664
|
|
|
|
|
|
|
my $height = $self->count_seqs; |
665
|
|
|
|
|
|
|
my $width = $self->width; |
666
|
|
|
|
|
|
|
say {$out} $height . q{ } . $width; |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
# setup id format (this will also affect block-like structure) |
669
|
|
|
|
|
|
|
my $format = $short ? "%-10.10s %s\n" : "%s %s\n"; |
670
|
|
|
|
|
|
|
my $method = $short ? 'full_id' : 'foreign_id'; |
671
|
|
|
|
|
|
|
my $sep = $short ? q{ } : q{}; |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# optionally disable wrapping |
674
|
|
|
|
|
|
|
$chunk = $width if $chunk < 0; |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
# optionally clean seq |
677
|
|
|
|
|
|
|
my @seqs = $self->all_seqs; |
678
|
|
|
|
|
|
|
@seqs = map { $_->clone->gapify('X') } @seqs if $clean; |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
# output Ali in sequential or interleaved format |
681
|
|
|
|
|
|
|
for (my $site = 0; $site < $width; $site += $chunk) { |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
# leave empty line between chunks (but not after last chunk) |
684
|
|
|
|
|
|
|
print {$out} "\n" if $site; |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
for my $seq (@seqs) { |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
# print 10-chars ids only once |
689
|
|
|
|
|
|
|
# Note: full_ids are only truncated (use IdMapper for more) |
690
|
|
|
|
|
|
|
my $id = $site == 0 ? $seq->$method : q{}; |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
# print seq chunks in 10-state blocks |
693
|
|
|
|
|
|
|
# Note: We insert a space in the 11th column to ensure that more |
694
|
|
|
|
|
|
|
# software packages can read the written files. PHYLIP itself will |
695
|
|
|
|
|
|
|
# ignore this spurious space in the 'sequences'. |
696
|
|
|
|
|
|
|
( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg; |
697
|
|
|
|
|
|
|
printf {$out} $format, $id, join $sep, |
698
|
|
|
|
|
|
|
map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1; |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
} |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
return; |
703
|
|
|
|
|
|
|
} |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
sub load_stockholm { |
707
|
2
|
|
|
2
|
1
|
33
|
my $class = shift; |
708
|
2
|
|
|
|
|
5
|
my $infile = shift; |
709
|
|
|
|
|
|
|
|
710
|
2
|
|
|
|
|
10
|
open my $in, '<', $infile; |
711
|
|
|
|
|
|
|
|
712
|
2
|
|
|
|
|
445
|
my $ali = $class->new( file => $infile ); |
713
|
2
|
|
|
|
|
27
|
tie my %seq_for, 'Tie::IxHash'; |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
LINE: |
716
|
2
|
|
|
|
|
1300
|
while (my $line = <$in>) { |
717
|
44
|
|
|
|
|
294
|
chomp $line; |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
# process GF comments |
720
|
44
|
100
|
|
|
|
106
|
next LINE if $ali->is_comment($line, $STK_COMMENT); |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
# skip empty lines and other comment lines |
723
|
27
|
100
|
100
|
|
|
219
|
next LINE if $line =~ $EMPTY_LINE |
724
|
|
|
|
|
|
|
|| $line =~ $COMMENT_LINE; |
725
|
11
|
100
|
|
|
|
33
|
last LINE if $line =~ $END_LINE; |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
# parse stockholm seq |
728
|
9
|
|
|
|
|
46
|
my ($seq_id, $seq) = $line =~ $STK_SEQ; |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
# replace dots and gaps by stars |
731
|
9
|
|
|
|
|
30
|
$seq =~ s/[\.\-]/*/xmsg; |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
# elongate seq |
734
|
9
|
|
|
|
|
44
|
$seq_for{$seq_id} .= $seq; |
735
|
|
|
|
|
|
|
} |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
# populate Ali object |
738
|
2
|
|
|
|
|
16
|
while (my ($seq_id, $seq) = each %seq_for) { |
739
|
9
|
|
|
|
|
395
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
740
|
9
|
|
|
|
|
302
|
$ali->add_seq($new_seq); |
741
|
|
|
|
|
|
|
} |
742
|
|
|
|
|
|
|
|
743
|
2
|
|
|
|
|
72
|
return $ali; |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
sub instant_store { |
748
|
1
|
|
|
1
|
1
|
6
|
my $class = shift; |
749
|
1
|
|
|
|
|
2
|
my $outfile = shift; |
750
|
1
|
|
50
|
|
|
5
|
my $args = shift // {}; # HashRef (should not be empty...) |
751
|
|
|
|
|
|
|
|
752
|
1
|
|
|
|
|
4
|
my $infile = $args->{infile}; |
753
|
1
|
50
|
|
|
|
6
|
croak '[BMC] Error: no infile specified for instant_store; aborting!' |
754
|
|
|
|
|
|
|
unless $infile; |
755
|
|
|
|
|
|
|
|
756
|
1
|
|
|
|
|
8
|
my $coderef = $args->{coderef}; |
757
|
1
|
50
|
|
|
|
4
|
croak '[BMC] Error: no coderef specified for instant_store; aborting!' |
758
|
|
|
|
|
|
|
unless $coderef; |
759
|
|
|
|
|
|
|
|
760
|
1
|
|
|
|
|
6
|
open my $in, '<', $infile; |
761
|
1
|
|
|
|
|
349
|
open my $out, '>', $outfile; |
762
|
|
|
|
|
|
|
|
763
|
1
|
|
|
|
|
273
|
my $seq_id; |
764
|
|
|
|
|
|
|
my $seq; |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
LINE: |
767
|
1
|
|
|
|
|
793
|
while (my $line = <$in>) { |
768
|
92
|
|
|
|
|
144
|
chomp $line; |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
# skip empty lines and process comments |
771
|
92
|
100
|
66
|
|
|
493
|
next LINE if $line =~ $EMPTY_LINE |
772
|
|
|
|
|
|
|
|| $line =~ $COMMENT_LINE; |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
# at each '>' char... |
775
|
90
|
|
|
|
|
223
|
my ($defline) = $line =~ $DEF_LINE; |
776
|
90
|
100
|
|
|
|
154
|
if ($defline) { |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# process current seq (if any) |
779
|
3
|
100
|
|
|
|
9
|
if ($seq) { |
780
|
2
|
|
|
|
|
80
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
781
|
2
|
|
|
|
|
3
|
print {$out} $coderef->($new_seq); |
|
2
|
|
|
|
|
12
|
|
782
|
2
|
|
|
|
|
78
|
$seq = q{}; |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
|
785
|
3
|
|
|
|
|
9
|
$seq_id = $defline; |
786
|
3
|
|
|
|
|
19
|
next LINE; |
787
|
|
|
|
|
|
|
} |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
# elongate current seq (seqs can be broken on several lines) |
790
|
87
|
|
|
|
|
251
|
$seq .= $line; |
791
|
|
|
|
|
|
|
} |
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
# process last seq (if any) |
794
|
1
|
50
|
|
|
|
6
|
if ($seq) { |
795
|
1
|
|
|
|
|
38
|
my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq ); |
796
|
1
|
|
|
|
|
3
|
print {$out} $coderef->($new_seq); |
|
1
|
|
|
|
|
5
|
|
797
|
|
|
|
|
|
|
} |
798
|
|
|
|
|
|
|
|
799
|
1
|
|
|
|
|
156
|
return; |
800
|
|
|
|
|
|
|
} |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
sub instant_count { |
804
|
1
|
|
|
1
|
1
|
443
|
my $class = shift; |
805
|
1
|
|
|
|
|
4
|
my $infile = shift; |
806
|
|
|
|
|
|
|
|
807
|
1
|
|
|
|
|
2
|
my $seq_n = 0; |
808
|
|
|
|
|
|
|
|
809
|
1
|
|
|
|
|
7
|
open my $in, '<', $infile; |
810
|
1
|
100
|
|
|
|
2860
|
while (<$in>) { $seq_n++ if m/^>/xms } |
|
502
|
|
|
|
|
2361
|
|
811
|
|
|
|
|
|
|
|
812
|
1
|
|
|
|
|
21
|
return $seq_n; |
813
|
|
|
|
|
|
|
} |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
816
|
|
|
|
|
|
|
1; |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
__END__ |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
=pod |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
=head1 NAME |
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
Bio::MUST::Core::Ali - Multiple sequence alignment |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
=head1 VERSION |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
version 0.212670 |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=head1 SYNOPSIS |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
#!/usr/bin/env perl |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
use Modern::Perl '2011'; |
835
|
|
|
|
|
|
|
# same as: |
836
|
|
|
|
|
|
|
# use strict; |
837
|
|
|
|
|
|
|
# use warnings; |
838
|
|
|
|
|
|
|
# use feature qw(say); |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
use Bio::MUST::Core; |
841
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
# read Ali form disk |
844
|
|
|
|
|
|
|
my $ali = Ali->load('example.ali'); |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
# get some properties |
847
|
|
|
|
|
|
|
say 'height: ' . $ali->height; # number of seqs |
848
|
|
|
|
|
|
|
say 'width: ' . $ali->width; # number of sites |
849
|
|
|
|
|
|
|
say '% miss: ' . $ali->perc_miss; # fraction of missing chars (%) |
850
|
|
|
|
|
|
|
say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl'; |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
# turn seqs to uppercase |
853
|
|
|
|
|
|
|
$ali->uc_seqs; |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
# filter out seqs with no species associated |
856
|
|
|
|
|
|
|
my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } ); |
857
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdList'; |
858
|
|
|
|
|
|
|
my $list = IdList->new( ids => \@seqs ); |
859
|
|
|
|
|
|
|
$ali->apply_list($list); |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
# alternatively: |
862
|
|
|
|
|
|
|
# $ali = Ali->new( seqs => \@seqs ); |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
# filter out gap-rich sites |
865
|
|
|
|
|
|
|
$ali->idealize(0.2); # min 20% non-gaps per site |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
# filter out non-informative sites |
868
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::SeqMask'; |
869
|
|
|
|
|
|
|
my $mask = SeqMask->parsimony_mask($ali); |
870
|
|
|
|
|
|
|
$ali->apply_mask($mask); |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
# write down reduced Ali to disk |
873
|
|
|
|
|
|
|
$ali->store('example-uc-genus-sp-20.ali'); |
874
|
|
|
|
|
|
|
$ali->store_fasta('example-uc-genus-sp-20.fasta'); |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
# see below for additional methods |
877
|
|
|
|
|
|
|
# ... |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
=head1 DESCRIPTION |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
This module implements the multiple sequence alignment (MSA) class and its |
882
|
|
|
|
|
|
|
methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects. |
883
|
|
|
|
|
|
|
Consequently, sequence ids do not absolutely need to be unique for it to |
884
|
|
|
|
|
|
|
work (though id uniqueness helps a lot for sequence access and filtering). |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
An Ali knows whether it contains nucleotide or protein sequences, whether |
887
|
|
|
|
|
|
|
those are aligned or not, as well as its Seq count and exact width. All |
888
|
|
|
|
|
|
|
these properties are introspected on the fly, which means that, while they |
889
|
|
|
|
|
|
|
can be expensive to compute, they are always accurate. |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
This is important because an Ali provides methods for inserting, deleting |
892
|
|
|
|
|
|
|
and editing its Seq objects. Further, it does its best to maintain a semantic |
893
|
|
|
|
|
|
|
distinction between true gap states (encoded as '*') and missing regions of |
894
|
|
|
|
|
|
|
undetermined length (encoded as pure whitespace, for example at the end of a |
895
|
|
|
|
|
|
|
short sequence). |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
It also has methods for mapping its sequence ids (for example, before export |
898
|
|
|
|
|
|
|
to the PHYLIP format), as well as methods to selectively retain sequences |
899
|
|
|
|
|
|
|
and sites based on L<Bio::MUST::Core::IdList> and |
900
|
|
|
|
|
|
|
L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method |
901
|
|
|
|
|
|
|
discards shared gaps and optionally removes gap-rich sites only due to a |
902
|
|
|
|
|
|
|
tiny number of sequences. |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles |
905
|
|
|
|
|
|
|
meaningful whitespace and allows comment lines in the header) or be |
906
|
|
|
|
|
|
|
imported/exported from/to several popular MSA formats, such as plain FASTA, |
907
|
|
|
|
|
|
|
STOCKHOLM and PHYLIP. |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
=head2 seqs |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
ArrayRef of L<Bio::MUST::Core::Seq> objects (optional) |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
Most of the accessor methods described below are implemented by delegation |
916
|
|
|
|
|
|
|
to this public attribute using L<Moose::Meta::Attribute::Native/Moose native |
917
|
|
|
|
|
|
|
delegation>. Their documentation thus heavily borrows from the corresponding |
918
|
|
|
|
|
|
|
help pages. |
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
=head2 file |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
L<Path::Class::File> object (optional) |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
This optional attribute is initialized by class methods that C<load> an Ali |
925
|
|
|
|
|
|
|
from disk. It is meant to improve the introspection capabilities of the Ali. |
926
|
|
|
|
|
|
|
For now, this attribute is not used by the C<store> methods, though it might |
927
|
|
|
|
|
|
|
provide them with a default value in the future. |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
=head2 guessing |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
Boolean (optional) |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
By default, an Ali object tries to guess whether it is aligned or not by |
934
|
|
|
|
|
|
|
looking for gap-like characters in any of its Seq objects (see |
935
|
|
|
|
|
|
|
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence). |
936
|
|
|
|
|
|
|
|
937
|
|
|
|
|
|
|
When this smart behavior causes issues, one can disable it by unsetting this |
938
|
|
|
|
|
|
|
boolean attribute (see C<dont_guess> and C<guess> accessor methods). |
939
|
|
|
|
|
|
|
|
940
|
|
|
|
|
|
|
=head2 comments |
941
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
ArrayRef of strings (optional) |
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
An Ali object is commentable, which means that it supports all the methods |
945
|
|
|
|
|
|
|
pertaining to comment lines described in |
946
|
|
|
|
|
|
|
L<Bio::MUST::Core::Roles::Commentable> (such as C<header>). |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
=head1 CONSTRUCTORS |
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
=head2 new |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
Default constructor (class method) returning a new Ali. |
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
955
|
|
|
|
|
|
|
my $ali1 = Ali->new(); |
956
|
|
|
|
|
|
|
my @seqs = $ali->all_seqs; |
957
|
|
|
|
|
|
|
my $ali2 = Ali->new( seqs => \@seqs ); |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>, |
960
|
|
|
|
|
|
|
C<file>, C<guessing> and C<comments>. |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
=head2 clone |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
Creates a deep copy (a clone) of the Ali. Returns the copy. |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
967
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
968
|
|
|
|
|
|
|
my $ali_copy = $ali->clone; |
969
|
|
|
|
|
|
|
# you can now mess with $ali_copy without affecting $ali |
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
This method does not accept any arguments. |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
=head1 ACCESSORS |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
=head2 add_seq |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
Adds one (or more) new sequence(s) at the end of the Ali. Returns the new |
978
|
|
|
|
|
|
|
number of sequences of the Ali. |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Seq'; |
981
|
|
|
|
|
|
|
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); |
982
|
|
|
|
|
|
|
$ali->add_seq($new_seq); |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
This method accepts any number of arguments. |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
=head2 get_seq |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
Returns a sequence of the Ali by its index. You can also use negative index |
989
|
|
|
|
|
|
|
numbers, just as with Perl's core array handling. If the specified sequence |
990
|
|
|
|
|
|
|
does not exist, this method will return C<undef>. |
991
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
my $seq = $ali->get_seq($index); |
993
|
|
|
|
|
|
|
croak "Seq $index not found in Ali!" unless defined $seq; |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
This method accepts just one argument (and not an array slice). |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
=head2 get_seq_with_id |
998
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
Returns a sequence of the Ali by its id. If the specified id is not unique, |
1000
|
|
|
|
|
|
|
only the first matching sequence will be returned, whereas if no sequence |
1001
|
|
|
|
|
|
|
exists for the specified id, this method will return C<undef>. |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
my $id = 'Pyrus malus_3750@658052655'; |
1004
|
|
|
|
|
|
|
my $seq = $ali->get_seq_with_id($id); |
1005
|
|
|
|
|
|
|
croak "Seq $id not found in Ali!" unless defined $seq; |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
This method accepts just one argument. |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
=head2 set_seq |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
Given an index and a sequence, sets the specified Ali element to the |
1012
|
|
|
|
|
|
|
sequence. This method returns the new sequence at C<$index>. |
1013
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Seq'; |
1015
|
|
|
|
|
|
|
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); |
1016
|
|
|
|
|
|
|
$ali->set_seq($index, $new_seq); |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
This method requires two arguments. |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
=head2 delete_seq |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
Removes the sequence at the given index from the Ali. This method returns |
1023
|
|
|
|
|
|
|
the deleted sequence. If the specified sequence does not exist, it will |
1024
|
|
|
|
|
|
|
return C<undef> instead. |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
$ali->delete_seq($index); |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
This method requires one argument. |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
=head2 insert_seq |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
Inserts a new sequence into the Ali at the given index. This method returns |
1033
|
|
|
|
|
|
|
the new sequence at C<$index>. |
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Seq'; |
1036
|
|
|
|
|
|
|
my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' ); |
1037
|
|
|
|
|
|
|
$ali->insert_seq($index, $new_seq); |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
This method requires two arguments. |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
=head2 all_seqs |
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
Returns all the sequences of the Ali (not an array reference). |
1044
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
my @seqs = $ali->all_seqs; |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
This method does not accept any arguments. |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
=head2 first_seq |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
Returns the first sequence of the Ali matching a given criterion, just like |
1052
|
|
|
|
|
|
|
L<List::Util>'s C<first> function. This method requires a subroutine |
1053
|
|
|
|
|
|
|
implementing the matching logic. |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
# emulate get_seq_with_id method |
1056
|
|
|
|
|
|
|
my $id2find = 'seq13'; |
1057
|
|
|
|
|
|
|
my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } ); |
1058
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
This method requires a single argument. |
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
=head2 filter_seqs |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
Returns every sequence of the Ali matching a given criterion, just like |
1064
|
|
|
|
|
|
|
Perl's core C<grep> function. This method requires a subroutine implementing |
1065
|
|
|
|
|
|
|
the matching logic. |
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
# keep only long sequences (ignoring gaps and missing states) |
1068
|
|
|
|
|
|
|
my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } ); |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
This method requires a single argument. |
1071
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
=head2 all_new_seqs |
1073
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
Returns all the sequences of the Ali tagged as #NEW# (not an array reference). |
1075
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
my @new_seqs = $ali->all_new_seqs; |
1077
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
This method does not accept any arguments. |
1079
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
=head2 all_but_new_seqs |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
Returns all the sequences of the Ali except those tagged as #NEW# (not an array |
1083
|
|
|
|
|
|
|
reference). |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
my @preexisting_seqs = $ali->all_but_new_seqs; |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
This method does not accept any arguments. |
1088
|
|
|
|
|
|
|
|
1089
|
|
|
|
|
|
|
=head2 all_seq_ids |
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali |
1092
|
|
|
|
|
|
|
(not an array reference). This is only a convenience method. |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
use Test::Deeply; |
1095
|
|
|
|
|
|
|
my @ids1 = $ali->all_seq_ids; |
1096
|
|
|
|
|
|
|
my @ids2 = map { $_->seq_id } $ali->all_seqs; |
1097
|
|
|
|
|
|
|
is_deeply \@ids1, \@ids2, 'should be true'; |
1098
|
|
|
|
|
|
|
my @orgs = map { $_->org } @ids1; |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
This method does not accept any arguments. |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
=head2 filename |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
Returns the stringified filename of the Ali. |
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
This method does not accept any arguments. |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
=head2 guess |
1109
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
Turn on the smart detection of gaps (see C<guessing> attribute above). |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
This method does not accept any arguments. |
1113
|
|
|
|
|
|
|
|
1114
|
|
|
|
|
|
|
=head2 dont_guess |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
Turn off the smart detection of gaps (see C<guessing> attribute above). |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1119
|
|
|
|
|
|
|
my $ali = Ali->load('ensembl.fasta'); |
1120
|
|
|
|
|
|
|
$ali->dont_guess; |
1121
|
|
|
|
|
|
|
|
1122
|
|
|
|
|
|
|
This method does not accept any arguments. |
1123
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
=head1 PROPERTIES |
1125
|
|
|
|
|
|
|
|
1126
|
|
|
|
|
|
|
=head2 has_uniq_ids |
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
Returns true if all the sequence ids are unique. |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids; |
1131
|
|
|
|
|
|
|
|
1132
|
|
|
|
|
|
|
This method does not accept any arguments. |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
=head2 is_protein |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
Returns true if any sequence of the Ali looks like a protein. See |
1137
|
|
|
|
|
|
|
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence. |
1138
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
say 'Your file includes nucleotide sequences' unless $ali->is_protein; |
1140
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
This method does not accept any arguments. |
1142
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
=head2 is_aligned |
1144
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
Returns true if any sequence of the Ali appears to be aligned. See |
1146
|
|
|
|
|
|
|
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence. |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
If the boolean attribute guessing is not set, always returns false. |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
carp 'Warning: file does not look aligned!' unless $ali->is_aligned; |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
This method does not accept any arguments. |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
=head2 count_seqs |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
Returns the number of sequences of the Ali. The alias method C<height> is |
1157
|
|
|
|
|
|
|
provided for convenience. |
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
my $height = $ali->count_seqs; |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
This method does not accept any arguments. |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
=head2 width |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
Returns the width of the Ali (in characters). If the Ali is not aligned, |
1166
|
|
|
|
|
|
|
returns the length of the longest sequence instead. |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
To avoid potential bugs due to caching, this method dynamically computes the |
1169
|
|
|
|
|
|
|
Ali width at each call. Moreover, the Ali is always uniformized (see below) |
1170
|
|
|
|
|
|
|
beforehands to ensure accurate width value. Therefore, this method is |
1171
|
|
|
|
|
|
|
expensive and should not be called repeatedly (e.g., in a loop condition). |
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
# you'd better looping through sites like this... |
1174
|
|
|
|
|
|
|
my $width = $ali->width; |
1175
|
|
|
|
|
|
|
for my $site (0..$width-1) { |
1176
|
|
|
|
|
|
|
... |
1177
|
|
|
|
|
|
|
} |
1178
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
This method does not accept any arguments. |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
=head2 seq_len_stats |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps). |
1184
|
|
|
|
|
|
|
The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max). |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
This method does not accept any arguments. |
1187
|
|
|
|
|
|
|
|
1188
|
|
|
|
|
|
|
=head2 perc_miss |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
Returns the percentage of missing (and gap-like) character states in the Ali. |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
As this method internally calls C<Ali::width>, the remarks above also apply. |
1193
|
|
|
|
|
|
|
|
1194
|
|
|
|
|
|
|
my $miss_level = $ali->perc_miss; |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
This method does not accept any arguments. |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
=head1 MUTATORS |
1199
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
=head2 uc_seqs |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
Turn all the sequences of the Ali to uppercase and returns it. |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
This method does not accept any arguments. |
1205
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
=head2 recode_seqs |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
Recode all the sequences of the Ali and returns it. |
1209
|
|
|
|
|
|
|
|
1210
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1211
|
|
|
|
|
|
|
my $ali = Ali->load('biased.ali'); |
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
# set up RY recoding for suppressing codon bias |
1214
|
|
|
|
|
|
|
my %base_for = ( |
1215
|
|
|
|
|
|
|
A => 'A', G => 'A', # purines |
1216
|
|
|
|
|
|
|
C => 'C', T => 'C', # pyrimidines |
1217
|
|
|
|
|
|
|
); |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
my $ali_rec = $ali->recode_seqs( \%base_for ); |
1220
|
|
|
|
|
|
|
$ali_rec->store('biased_ry.ali'); |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
This method requires one argument. |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
=head2 degap_seqs |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
Remove the gaps in all the sequences of the Ali and returns it. |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
This method does not accept any arguments. |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
=head2 spacify_seqs |
1231
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
Spacifies all the sequences of the Ali and returns it. See the corresponding |
1233
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1234
|
|
|
|
|
|
|
operation. |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
This method does not accept any arguments. |
1237
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
=head2 gapify_seqs |
1239
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
Gapifies all the sequences of the Ali and returns it. See the corresponding |
1241
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1242
|
|
|
|
|
|
|
operation. |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
This method accepts an optional argument. |
1245
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
=head2 trim_seqs |
1247
|
|
|
|
|
|
|
|
1248
|
|
|
|
|
|
|
Trims all the sequences of the Ali and returns it. See the corresponding |
1249
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1250
|
|
|
|
|
|
|
operation. |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
This method does not accept any arguments. |
1253
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
=head2 pad_seqs |
1255
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
Pads all the sequences of the Ali and returns it. See the corresponding |
1257
|
|
|
|
|
|
|
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning |
1258
|
|
|
|
|
|
|
operation. |
1259
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
This method does not accept any arguments. |
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
=head2 uniformize |
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
Performs the three gap-cleaning operations in turn on all the sequences of |
1265
|
|
|
|
|
|
|
the Ali and returns it, which ensures that it is semantically clean and |
1266
|
|
|
|
|
|
|
rectangular. |
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
This is only a convenience method called internally by the Ali object before |
1269
|
|
|
|
|
|
|
selected methods (such as storage-like methods). However, it might prove |
1270
|
|
|
|
|
|
|
useful in some circumstances, hence it is not defined as private. |
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1273
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1274
|
|
|
|
|
|
|
$ali->add_seq(...); |
1275
|
|
|
|
|
|
|
# more editing of the Ali sequences |
1276
|
|
|
|
|
|
|
$ali->uniformize; |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
This method does not accept any arguments. |
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
=head2 clear_new_tags |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it. |
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1285
|
|
|
|
|
|
|
my $ali = Ali->load('input-42.ali'); |
1286
|
|
|
|
|
|
|
$ali->clear_new_tags; |
1287
|
|
|
|
|
|
|
my @new_seqs = $ali->all_new_seqs; |
1288
|
|
|
|
|
|
|
# array should be empty |
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
This method does not accept any arguments. |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
=head2 shorten_ids |
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
Replaces all the sequence ids of the Ali by their abbreviated forms as |
1295
|
|
|
|
|
|
|
specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali. |
1296
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
Note that this method will work only if the sequence ids have not been |
1298
|
|
|
|
|
|
|
already shortened or modified in any way since the creation of the IdMapper. |
1299
|
|
|
|
|
|
|
Long ids without abbreviated forms in the IdMapper are left untouched. |
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1302
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdMapper'; |
1303
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1304
|
|
|
|
|
|
|
my $mapper = IdMapper->std_mapper($ali, 'lcl|seq'); |
1305
|
|
|
|
|
|
|
$ali->shorten_ids($mapper); |
1306
|
|
|
|
|
|
|
$ali->store_fasta('input.4blast.fasta'); |
1307
|
|
|
|
|
|
|
# makeblastdb |
1308
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
# Note: the temp_fasta method does exactly that |
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
This method requires one argument. |
1312
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
=head2 restore_ids |
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
Replaces all the sequence ids of the Ali by their long forms as specified by |
1316
|
|
|
|
|
|
|
the passed L<Bio::MUST::Core::IdMapper> and returns the Ali. |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
Note that this method will work only if the sequence ids have been previously |
1319
|
|
|
|
|
|
|
abbreviated (see above) and have not been modified in any way since then. |
1320
|
|
|
|
|
|
|
Again, abbreviated ids without long forms in the IdMapper are left untouched. |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdMapper'; |
1323
|
|
|
|
|
|
|
my $mapper = IdMapper->gi_mapper($ali); |
1324
|
|
|
|
|
|
|
$ali->shorten_ids($mapper); |
1325
|
|
|
|
|
|
|
... |
1326
|
|
|
|
|
|
|
$ali->restore_ids($mapper); |
1327
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
This method requires one argument. |
1329
|
|
|
|
|
|
|
|
1330
|
|
|
|
|
|
|
=head2 apply_list |
1331
|
|
|
|
|
|
|
|
1332
|
|
|
|
|
|
|
Selectively retains or discards sequences from the Ali based on the content |
1333
|
|
|
|
|
|
|
of the passed L<Bio::MUST::Core::IdList> object and returns the Ali. |
1334
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdList'; |
1336
|
|
|
|
|
|
|
my $list = IdList->load('interesting_seqs.idl'); |
1337
|
|
|
|
|
|
|
$ali->apply_list($list); # discard non-interesting seqs |
1338
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
This method requires one argument. |
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
=head2 apply_mask |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
Selectively retains or discards sites from the Ali based on the content of |
1344
|
|
|
|
|
|
|
the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali. |
1345
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::SeqMask'; |
1347
|
|
|
|
|
|
|
my $variable_sites = SeqMask->variable_mask($ali); |
1348
|
|
|
|
|
|
|
$ali->apply_mask($variable_sites); # discard constant sites |
1349
|
|
|
|
|
|
|
|
1350
|
|
|
|
|
|
|
This method requires one argument. |
1351
|
|
|
|
|
|
|
|
1352
|
|
|
|
|
|
|
=head2 idealize |
1353
|
|
|
|
|
|
|
|
1354
|
|
|
|
|
|
|
Computes and applies an ideal sequence mask to the Ali and returns it. This |
1355
|
|
|
|
|
|
|
is only a convenience method. |
1356
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
When invoked without arguments, it will discard the gaps that are |
1358
|
|
|
|
|
|
|
universally shared by all the sequences. Otherwise, the provided argument |
1359
|
|
|
|
|
|
|
corresponds to the threshold of the C<ideal_mask> method described in |
1360
|
|
|
|
|
|
|
L<Bio::MUST::Core::SeqMask>. |
1361
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdList'; |
1363
|
|
|
|
|
|
|
my $fast_seqs = IdList->load('fast_evolving_seqs.idl'); |
1364
|
|
|
|
|
|
|
my $seqs2keep = $fast_seqs->negative_list($ali); |
1365
|
|
|
|
|
|
|
$ali->apply_list($seqs2keep); # discard fast-evolving seqs |
1366
|
|
|
|
|
|
|
$ali->idealize; # discard newly shared gaps caused by fast seqs |
1367
|
|
|
|
|
|
|
|
1368
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1369
|
|
|
|
|
|
|
my $ali = Ali->load('hmm_based.ali'); |
1370
|
|
|
|
|
|
|
$ali->idealize(0.05); # discard insertions due to <5% of the seqs |
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
This method accepts an optional argument. |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
=head1 MISC METHODS |
1375
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
=head2 gapmiss_regex |
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
Returns a regular expression matching gaps and ambiguous or missing states. |
1379
|
|
|
|
|
|
|
The exact regex returned depends on the type of sequences in the Ali (nucl. or |
1380
|
|
|
|
|
|
|
proteins). |
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
my $regex = $ali->gapmiss_regex; |
1383
|
|
|
|
|
|
|
my $first_seq = $ali->get_seq(0)->seq; |
1384
|
|
|
|
|
|
|
my $gapmiss_n = $first_seq =~ m/($regex)/xmsg; |
1385
|
|
|
|
|
|
|
say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites"; |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
This method does not accept any arguments. |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
=head2 map_coords |
1390
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
Converts a set of site positions from Ali coordinates to coordinates of the |
1392
|
|
|
|
|
|
|
specified sequence (thereby ignoring positions due to gaps). Returns the |
1393
|
|
|
|
|
|
|
converted sites in sequence coordinates as an array refrence. |
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1396
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1397
|
|
|
|
|
|
|
my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125'; |
1398
|
|
|
|
|
|
|
my $ali_coords = [ 4, 25, 73, 89, 104, 116 ]; |
1399
|
|
|
|
|
|
|
my $seq_coords = $ali->map_coords($id, $ali_coords); |
1400
|
|
|
|
|
|
|
# $seq_coords is [ 3, 23, 59, 71, 71, 74 ] |
1401
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
This method requires two arguments: the id of a sequence and an array |
1403
|
|
|
|
|
|
|
reference of input sites in Ali coordinates. |
1404
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
=head1 I/O METHODS |
1406
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
=head2 load |
1408
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
Class method (constructor) returning a new Ali read from disk. This method |
1410
|
|
|
|
|
|
|
will transparently import plain FASTA files in addition to the MUST |
1411
|
|
|
|
|
|
|
pseudo-FASTA format (ALI files). |
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
use Test::Deeply; |
1414
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1415
|
|
|
|
|
|
|
my $ali1 = Ali->load('example.ali'); |
1416
|
|
|
|
|
|
|
my $ali2 = Ali->load('example.fasta'); |
1417
|
|
|
|
|
|
|
my @seqs1 = $ali1->all_seqs; |
1418
|
|
|
|
|
|
|
my @seqs2 = $ali2->all_seqs; |
1419
|
|
|
|
|
|
|
is_deeply, \@seqs1, \@seqs2, 'should be true'; |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
This method requires one argument. |
1422
|
|
|
|
|
|
|
|
1423
|
|
|
|
|
|
|
=head2 store |
1424
|
|
|
|
|
|
|
|
1425
|
|
|
|
|
|
|
Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files). |
1426
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
Note that the ALI format is only used when the suffix of the outfile name is |
1428
|
|
|
|
|
|
|
'.ali'. In all other cases (including lack of suffix), this method |
1429
|
|
|
|
|
|
|
automatically forwards the call to C<store_fasta>. |
1430
|
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
$ali->store('output.ali'); |
1432
|
|
|
|
|
|
|
# output.ali is written in ALI format |
1433
|
|
|
|
|
|
|
$ali->store('output.fasta'); |
1434
|
|
|
|
|
|
|
# output.fasta is written in FASTA format |
1435
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
This method requires one argument (but see C<store_fasta> in case of automatic |
1437
|
|
|
|
|
|
|
forwarding of the method call). |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
=head2 store_fasta |
1440
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
Writes the Ali to disk in the plain FASTA format. |
1442
|
|
|
|
|
|
|
|
1443
|
|
|
|
|
|
|
For compatibility purposes, this method automatically fetches sequence ids |
1444
|
|
|
|
|
|
|
using the C<foreign_id> method instead of the native C<full_id> method, both |
1445
|
|
|
|
|
|
|
described in L<Bio::MUST::Core::SeqId>. |
1446
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
$ali->store_fasta( 'output.fasta' ); |
1448
|
|
|
|
|
|
|
$ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} ); |
1449
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
This method requires one argument and accepts a second optional argument |
1451
|
|
|
|
|
|
|
controlling the output format. It is a hash reference that may contain one |
1452
|
|
|
|
|
|
|
or more of the following keys: |
1453
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
- clean: replace all ambiguous and missing states by C<X> (default: false) |
1455
|
|
|
|
|
|
|
- degap: boolean value controlling degapping (default: false) |
1456
|
|
|
|
|
|
|
- chunk: line width (default is 60 chars; negative values means no wrap) |
1457
|
|
|
|
|
|
|
|
1458
|
|
|
|
|
|
|
Finally, it is possible to fine-tune the behavior of the C<clean> option by |
1459
|
|
|
|
|
|
|
providing another character than C<X> through the C<gapify> key. This can be |
1460
|
|
|
|
|
|
|
useful to replace all ambiguous and missing states by gaps, as shown below: |
1461
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
$ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } ); |
1463
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
=head2 temp_fasta |
1465
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
Writes a temporary copy of the Ali to disk in the plain FASTA format using |
1467
|
|
|
|
|
|
|
numeric sequence ids and returns the name of the temporary file. This is |
1468
|
|
|
|
|
|
|
only a convenience method. |
1469
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
In list context, returns the IdMapper object along with temporary filename. |
1471
|
|
|
|
|
|
|
|
1472
|
|
|
|
|
|
|
my $infile = $ali->temp_fasta( { degap => 1 } ); |
1473
|
|
|
|
|
|
|
my $output = `script.sh $infile`; |
1474
|
|
|
|
|
|
|
... |
1475
|
|
|
|
|
|
|
|
1476
|
|
|
|
|
|
|
This method accepts the same optional argument hash as C<store_fasta>. |
1477
|
|
|
|
|
|
|
However, an additional option (C<id_prefix>) is available to control the way |
1478
|
|
|
|
|
|
|
abbreviated sequence ids are prefixed by the C<std_mapper> method (see |
1479
|
|
|
|
|
|
|
L<Bio::MUST::Core::Listable>). |
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } ); |
1482
|
|
|
|
|
|
|
my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } ); |
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
=head2 load_phylip |
1485
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
Class method (constructor) returning a new Ali read from a file in the |
1487
|
|
|
|
|
|
|
PHYLIP format. Both sequential and interleaved formats are supported. |
1488
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
The only constraint is that sequence ids MUST NOT contain whitespace and be |
1490
|
|
|
|
|
|
|
followed by at least one whitespace character. This means that some old-school |
1491
|
|
|
|
|
|
|
PHYLIP files not following this convention will not be processed correctly. |
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
When using the interleaved format, sequence ids may or may not be repeated in |
1494
|
|
|
|
|
|
|
each block. |
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1497
|
|
|
|
|
|
|
my $ali = Ali->load_phylip('phylip.phy'); |
1498
|
|
|
|
|
|
|
say $ali->count_seqs; |
1499
|
|
|
|
|
|
|
say $ali->width; |
1500
|
|
|
|
|
|
|
|
1501
|
|
|
|
|
|
|
# outputs: |
1502
|
|
|
|
|
|
|
# 10 |
1503
|
|
|
|
|
|
|
# 709 |
1504
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
This method requires one argument. |
1506
|
|
|
|
|
|
|
|
1507
|
|
|
|
|
|
|
=head2 store_phylip |
1508
|
|
|
|
|
|
|
|
1509
|
|
|
|
|
|
|
Writes the Ali to disk in the interleaved (or sequential) PHYLIP format. |
1510
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
To ensure maximal flexibility, this method fetches sequence ids using the |
1512
|
|
|
|
|
|
|
native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but |
1513
|
|
|
|
|
|
|
truncates them to 10 characters, as expected by the original PHYLIP software |
1514
|
|
|
|
|
|
|
package. No other tinkering is carried out on the ids. Thus, if the ids |
1515
|
|
|
|
|
|
|
contain whitespace or are not unique in their 10 first characters, it is |
1516
|
|
|
|
|
|
|
advised to first map them using one of the constructors in |
1517
|
|
|
|
|
|
|
L<Bio::MUST::Core::IdMapper>. |
1518
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1520
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::IdMapper'; |
1521
|
|
|
|
|
|
|
my $ali = Ali->load('input.ali'); |
1522
|
|
|
|
|
|
|
my $mapper = IdMapper->std_mapper($ali); |
1523
|
|
|
|
|
|
|
$ali->shorten_ids($mapper); |
1524
|
|
|
|
|
|
|
$ali->store_phylip( 'input.phy', { chunk => 50 } ); |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
This method requires one argument and accepts a second optional argument |
1527
|
|
|
|
|
|
|
controlling the output format. It is a hash reference that may contain one |
1528
|
|
|
|
|
|
|
or more of the following keys: |
1529
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
- short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes) |
1531
|
|
|
|
|
|
|
- clean: replace all ambiguous and missing states by 'X' (default: false) |
1532
|
|
|
|
|
|
|
- chunk: line width (default: 60 chars; negative values means no wrap) |
1533
|
|
|
|
|
|
|
|
1534
|
|
|
|
|
|
|
To store the Ali in PHYLIP sequential format, specify a negative chunk (-1). |
1535
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
=head2 load_stockholm |
1537
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
Class method (constructor) returning a new Ali read from a file in the |
1539
|
|
|
|
|
|
|
STOCKHOLM format. =GF comments are retained (see above) but not the other |
1540
|
|
|
|
|
|
|
comment classes (=GS, =GR and =GC). |
1541
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1543
|
|
|
|
|
|
|
my $ali = Ali->load('upsk.stockholm'); |
1544
|
|
|
|
|
|
|
say $ali->header; |
1545
|
|
|
|
|
|
|
|
1546
|
|
|
|
|
|
|
# outputs: |
1547
|
|
|
|
|
|
|
# ID UPSK |
1548
|
|
|
|
|
|
|
# SE Predicted; Infernal |
1549
|
|
|
|
|
|
|
# SS Published; PMID 9223489 |
1550
|
|
|
|
|
|
|
# RN [1] |
1551
|
|
|
|
|
|
|
# RM 9223489 |
1552
|
|
|
|
|
|
|
# RT The role of the pseudoknot at the 3' end of turnip yellow mosaic |
1553
|
|
|
|
|
|
|
# RT virus RNA in minus-strand synthesis by the viral RNA-dependent RNA |
1554
|
|
|
|
|
|
|
# RT polymerase. |
1555
|
|
|
|
|
|
|
# RA Deiman BA, Kortlever RM, Pleij CW; |
1556
|
|
|
|
|
|
|
# RL J Virol 1997;71:5990-5996. |
1557
|
|
|
|
|
|
|
|
1558
|
|
|
|
|
|
|
This method requires one argument. |
1559
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
=head2 instant_store |
1561
|
|
|
|
|
|
|
|
1562
|
|
|
|
|
|
|
Class method intended to transform a large sequence file read from disk |
1563
|
|
|
|
|
|
|
without loading it in memory. This method will transparently process plain |
1564
|
|
|
|
|
|
|
FASTA files in addition to the MUST pseudo-FASTA format (ALI files). |
1565
|
|
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
my $chunk = 200; |
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
my $split = sub { |
1569
|
|
|
|
|
|
|
my $seq = shift; |
1570
|
|
|
|
|
|
|
my $base_id = ( split /\s+/xms, $seq->full_id )[0]; |
1571
|
|
|
|
|
|
|
my $max_pos = $seq->seq_len - $chunk; |
1572
|
|
|
|
|
|
|
my $n = 0; |
1573
|
|
|
|
|
|
|
my $out_str; |
1574
|
|
|
|
|
|
|
for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) { |
1575
|
|
|
|
|
|
|
$out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos, |
1576
|
|
|
|
|
|
|
$pos + $chunk <= $max_pos ? $chunk : 2 * $chunk |
1577
|
|
|
|
|
|
|
) . "\n"; |
1578
|
|
|
|
|
|
|
} |
1579
|
|
|
|
|
|
|
return $out_str; |
1580
|
|
|
|
|
|
|
}; |
1581
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1583
|
|
|
|
|
|
|
|
1584
|
|
|
|
|
|
|
Ali->instant_store( |
1585
|
|
|
|
|
|
|
'outfile.fasta', { infile => 'infile.fasta', coderef => $split } |
1586
|
|
|
|
|
|
|
); |
1587
|
|
|
|
|
|
|
|
1588
|
|
|
|
|
|
|
This method requires two arguments. The sercond is a hash reference that must |
1589
|
|
|
|
|
|
|
contain the following keys: |
1590
|
|
|
|
|
|
|
- infile: input sequence file |
1591
|
|
|
|
|
|
|
- coderef: subroutine implementing the transforming logic |
1592
|
|
|
|
|
|
|
|
1593
|
|
|
|
|
|
|
=head2 instant_count |
1594
|
|
|
|
|
|
|
|
1595
|
|
|
|
|
|
|
Class method returning the number of seqs in any sequence file read from disk |
1596
|
|
|
|
|
|
|
without loading it in memory. This method will transparently process plain |
1597
|
|
|
|
|
|
|
FASTA files in addition to the MUST pseudo-FASTA format (ALI files). |
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
use aliased 'Bio::MUST::Core::Ali'; |
1600
|
|
|
|
|
|
|
my $seq_n = Ali->instant_count('input.ali'); |
1601
|
|
|
|
|
|
|
say $seq_n; |
1602
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
=head1 ALIASES |
1604
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
=head2 height |
1606
|
|
|
|
|
|
|
|
1607
|
|
|
|
|
|
|
Alias for C<count_seqs> method. For API consistency. |
1608
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
=head1 AUTHOR |
1610
|
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
|
Denis BAURAIN <denis.baurain@uliege.be> |
1612
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
1614
|
|
|
|
|
|
|
|
1615
|
|
|
|
|
|
|
=for stopwords Catherine COLSON Arnaud DI FRANCO |
1616
|
|
|
|
|
|
|
|
1617
|
|
|
|
|
|
|
=over 4 |
1618
|
|
|
|
|
|
|
|
1619
|
|
|
|
|
|
|
=item * |
1620
|
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
|
Catherine COLSON <ccolson@doct.uliege.be> |
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
=item * |
1624
|
|
|
|
|
|
|
|
1625
|
|
|
|
|
|
|
Arnaud DI FRANCO <arnaud.difranco@gmail.com> |
1626
|
|
|
|
|
|
|
|
1627
|
|
|
|
|
|
|
=back |
1628
|
|
|
|
|
|
|
|
1629
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
1630
|
|
|
|
|
|
|
|
1631
|
|
|
|
|
|
|
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. |
1632
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
1634
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
1635
|
|
|
|
|
|
|
|
1636
|
|
|
|
|
|
|
=cut |