line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::MUST::Core::SeqMask; |
2
|
|
|
|
|
|
|
# ABSTRACT: Sequence mask for selecting specific sites |
3
|
|
|
|
|
|
|
# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be> |
4
|
|
|
|
|
|
|
# CONTRIBUTOR: Raphael LEONARD <rleonard@doct.uliege.be> |
5
|
|
|
|
|
|
|
$Bio::MUST::Core::SeqMask::VERSION = '0.212670'; |
6
|
17
|
|
|
17
|
|
140
|
use Moose; |
|
17
|
|
|
|
|
46
|
|
|
17
|
|
|
|
|
136
|
|
7
|
17
|
|
|
17
|
|
122680
|
use namespace::autoclean; |
|
17
|
|
|
|
|
45
|
|
|
17
|
|
|
|
|
196
|
|
8
|
|
|
|
|
|
|
|
9
|
17
|
|
|
17
|
|
1715
|
use autodie; |
|
17
|
|
|
|
|
43
|
|
|
17
|
|
|
|
|
167
|
|
10
|
17
|
|
|
17
|
|
93880
|
use feature qw(say); |
|
17
|
|
|
|
|
49
|
|
|
17
|
|
|
|
|
1786
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# use Smart::Comments '###'; |
13
|
|
|
|
|
|
|
|
14
|
17
|
|
|
17
|
|
135
|
use Carp; |
|
17
|
|
|
|
|
39
|
|
|
17
|
|
|
|
|
1477
|
|
15
|
17
|
|
|
17
|
|
135
|
use File::Basename; |
|
17
|
|
|
|
|
50
|
|
|
17
|
|
|
|
|
1681
|
|
16
|
17
|
|
|
17
|
|
11486
|
use IPC::System::Simple qw(system); |
|
17
|
|
|
|
|
109925
|
|
|
17
|
|
|
|
|
1434
|
|
17
|
17
|
|
|
17
|
|
3525
|
use List::AllUtils 0.08 qw(uniq max sum natatime first_index last_index pairmap mesh); |
|
17
|
|
|
|
|
56658
|
|
|
17
|
|
|
|
|
2159
|
|
18
|
17
|
|
|
17
|
|
148
|
use Path::Class qw(file); |
|
17
|
|
|
|
|
42
|
|
|
17
|
|
|
|
|
830
|
|
19
|
17
|
|
|
17
|
|
135
|
use POSIX qw(ceil floor); |
|
17
|
|
|
|
|
36
|
|
|
17
|
|
|
|
|
117
|
|
20
|
|
|
|
|
|
|
|
21
|
17
|
|
|
17
|
|
1253
|
use Bio::MUST::Core::Types; |
|
17
|
|
|
|
|
69
|
|
|
17
|
|
|
|
|
593
|
|
22
|
17
|
|
|
17
|
|
112
|
use Bio::MUST::Core::Constants qw(:gaps :files); |
|
17
|
|
|
|
|
44
|
|
|
17
|
|
|
|
|
4144
|
|
23
|
17
|
|
|
17
|
|
9549
|
use Bio::MUST::Core::Utils qw(change_suffix); |
|
17
|
|
|
|
|
53
|
|
|
17
|
|
|
|
|
1167
|
|
24
|
17
|
|
|
17
|
|
150
|
use aliased 'Bio::MUST::Core::Seq'; |
|
17
|
|
|
|
|
39
|
|
|
17
|
|
|
|
|
142
|
|
25
|
17
|
|
|
17
|
|
3877
|
use aliased 'Bio::MUST::Core::Ali'; |
|
17
|
|
|
|
|
42
|
|
|
17
|
|
|
|
|
92
|
|
26
|
17
|
|
|
17
|
|
2338
|
use aliased 'Bio::MUST::Core::IdList'; |
|
17
|
|
|
|
|
44
|
|
|
17
|
|
|
|
|
116
|
|
27
|
|
|
|
|
|
|
with 'Bio::MUST::Core::Roles::Commentable'; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# public array |
31
|
|
|
|
|
|
|
# Note: mask methods use the 0-based C/Perl array indexing |
32
|
|
|
|
|
|
|
# ...but block methods encode bounds using the usual 1-based indexing |
33
|
|
|
|
|
|
|
has 'mask' => ( |
34
|
|
|
|
|
|
|
traits => ['Array'], |
35
|
|
|
|
|
|
|
is => 'ro', |
36
|
|
|
|
|
|
|
isa => 'ArrayRef[Bool]', |
37
|
|
|
|
|
|
|
default => sub { [] }, |
38
|
|
|
|
|
|
|
writer => '_set_mask', |
39
|
|
|
|
|
|
|
handles => { |
40
|
|
|
|
|
|
|
mask_len => 'count', |
41
|
|
|
|
|
|
|
all_states => 'elements', |
42
|
|
|
|
|
|
|
add_state => 'push', |
43
|
|
|
|
|
|
|
set_state => 'set', |
44
|
|
|
|
|
|
|
state_at => 'get', |
45
|
|
|
|
|
|
|
}, |
46
|
|
|
|
|
|
|
); |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub first_site { |
51
|
2
|
|
|
2
|
1
|
7
|
my $self = shift; |
52
|
2
|
|
|
8
|
|
99
|
return first_index { $_ } $self->all_states; |
|
8
|
|
|
|
|
42
|
|
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub last_site { |
58
|
2
|
|
|
2
|
1
|
5
|
my $self = shift; |
59
|
2
|
|
|
46
|
|
99
|
return last_index { $_ } $self->all_states; |
|
46
|
|
|
|
|
82
|
|
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
sub count_sites { |
65
|
76
|
|
|
76
|
1
|
35978
|
my $self = shift; |
66
|
76
|
|
|
|
|
3330
|
my $count = grep { $_ } $self->all_states; |
|
70226
|
|
|
|
|
100387
|
|
67
|
76
|
|
|
|
|
3233
|
return $count; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub coverage { |
73
|
2
|
|
|
2
|
1
|
7
|
my $self = shift; |
74
|
2
|
|
|
|
|
6
|
return $self->count_sites / $self->mask_len; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub empty_mask { |
80
|
48
|
|
|
48
|
1
|
150
|
my $class = shift; |
81
|
48
|
|
100
|
|
|
172
|
my $len = shift // 0; |
82
|
|
|
|
|
|
|
|
83
|
48
|
|
|
|
|
13297
|
return $class->new( mask => [ (0) x $len ] ); |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub custom_mask { |
89
|
33
|
|
|
33
|
1
|
131
|
my $class = shift; |
90
|
33
|
|
|
|
|
66
|
my $len = shift; |
91
|
33
|
|
|
|
|
68
|
my $sites = shift; |
92
|
|
|
|
|
|
|
|
93
|
33
|
|
|
|
|
152
|
my $mask = $class->empty_mask($len); |
94
|
33
|
|
|
|
|
106
|
$mask->set_state($_, 1) for @{$sites}; |
|
33
|
|
|
|
|
2020
|
|
95
|
|
|
|
|
|
|
|
96
|
33
|
|
|
|
|
4065
|
return $mask; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
sub mark_block { ## no critic (RequireArgUnpacking) |
102
|
40
|
|
|
40
|
1
|
128
|
return shift->_change_block(1, @_); |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub unmark_block { ## no critic (RequireArgUnpacking) |
108
|
0
|
|
|
0
|
1
|
0
|
return shift->_change_block(0, @_); |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub _change_block { |
113
|
40
|
|
|
40
|
|
75
|
my $self = shift; |
114
|
40
|
|
|
|
|
63
|
my $state = shift; |
115
|
40
|
|
|
|
|
68
|
my $start = shift; |
116
|
40
|
|
|
|
|
60
|
my $end = shift; |
117
|
|
|
|
|
|
|
|
118
|
40
|
|
|
|
|
1516
|
$self->set_state($_, $state) for $start-1 .. $end-1; |
119
|
|
|
|
|
|
|
|
120
|
40
|
|
|
|
|
95
|
return $self; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
sub mask2blocks { |
126
|
14
|
|
|
14
|
1
|
79
|
my $self = shift; |
127
|
|
|
|
|
|
|
|
128
|
14
|
|
|
|
|
29
|
my @blocks; |
129
|
14
|
|
|
|
|
31
|
my $start = -1; |
130
|
14
|
|
|
|
|
25
|
my $site = 0; |
131
|
14
|
|
|
|
|
549
|
while ($site < $self->mask_len) { |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# open a new block (if not yet open) |
134
|
217
|
100
|
|
|
|
7212
|
if ( $self->state_at($site) ) { |
135
|
123
|
100
|
|
|
|
261
|
$start = $site+1 if $start < 0; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# close the current block (if any open) |
139
|
|
|
|
|
|
|
else { |
140
|
94
|
100
|
|
|
|
198
|
if ($start >= 0) { |
141
|
34
|
|
|
|
|
88
|
push @blocks, [ $start, $site ]; |
142
|
34
|
|
|
|
|
60
|
$start = -1; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
217
|
|
|
|
|
7321
|
$site++; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# close last block (if still open) |
150
|
14
|
100
|
|
|
|
311
|
push @blocks, [ $start, $site ] if $start >= 0; |
151
|
|
|
|
|
|
|
|
152
|
14
|
|
|
|
|
51
|
return \@blocks; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub blocks2mask { |
158
|
14
|
|
|
14
|
1
|
16653
|
my $class = shift; |
159
|
14
|
|
|
|
|
39
|
my $blocks = shift; |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# old version: only handled non-overlapping ordered blocks |
162
|
|
|
|
|
|
|
# my @mask; |
163
|
|
|
|
|
|
|
# for my $block ( @{$blocks} ) { |
164
|
|
|
|
|
|
|
# my ($start, $end) = @{$block}; |
165
|
|
|
|
|
|
|
# push @mask, (0) x ($start-@mask-1); # pad last block |
166
|
|
|
|
|
|
|
# push @mask, (1) x ($end-$start+1); # mark new block |
167
|
|
|
|
|
|
|
# } |
168
|
|
|
|
|
|
|
# return \@mask; |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Note: blocks can now overlap and come in any order |
171
|
|
|
|
|
|
|
# this allows using SeqMask for computing seq coverages |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# setup empty mask going up to max end (second slot) |
174
|
14
|
|
|
|
|
27
|
my $max = max map { $_->[1] } @{$blocks}; |
|
37
|
|
|
|
|
124
|
|
|
14
|
|
|
|
|
42
|
|
175
|
14
|
|
|
|
|
75
|
my $mask = $class->empty_mask($max); |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# mark all states included in each block |
178
|
14
|
|
|
|
|
55
|
$mask->mark_block( @{$_} ) for @{$blocks}; |
|
14
|
|
|
|
|
48
|
|
|
37
|
|
|
|
|
125
|
|
179
|
|
|
|
|
|
|
|
180
|
14
|
|
|
|
|
439
|
return $mask; |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# Ali-based SeqMask factory methods |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub neutral_mask { |
188
|
1
|
|
|
1
|
1
|
13
|
my $class = shift; |
189
|
1
|
|
|
|
|
3
|
my $ali = shift; |
190
|
|
|
|
|
|
|
|
191
|
1
|
|
|
|
|
4
|
my $width = $ali->width; |
192
|
1
|
|
|
|
|
119
|
return $class->new( mask => [ (1) x $width ] ); |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub variable_mask { |
198
|
3
|
|
|
3
|
1
|
1304
|
my $class = shift; |
199
|
3
|
|
|
|
|
9
|
my $ali = shift; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# TODO: profile and optimize as ideal_mask below (if needed) |
202
|
|
|
|
|
|
|
|
203
|
3
|
|
|
|
|
6
|
my @mask; |
204
|
3
|
|
|
|
|
17
|
my $width = $ali->width; |
205
|
3
|
|
|
|
|
26
|
my $regex = $ali->gapmiss_regex; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
# filter out sites with only one state |
208
|
3
|
|
|
|
|
19
|
for (my $site = 0; $site < $width; $site++) { |
209
|
|
|
|
|
|
|
my $state_n = uniq # count unique states |
210
|
432032
|
|
|
|
|
1296499
|
grep { $_ !~ m/$regex/xms } # which are valid |
211
|
53696
|
|
|
|
|
1845280
|
map { $_->state_at($site) } # and found at that site |
|
432032
|
|
|
|
|
957851
|
|
212
|
|
|
|
|
|
|
$ali->all_seqs # across all seqs |
213
|
|
|
|
|
|
|
; |
214
|
53696
|
100
|
|
|
|
248257
|
push @mask, $state_n > 1 ? 1 : 0; |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
3
|
|
|
|
|
141
|
return $class->new( mask => \@mask ); |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub parsimony_mask { |
223
|
1
|
|
|
1
|
1
|
16
|
my $class = shift; |
224
|
1
|
|
|
|
|
11
|
my $ali = shift; |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# TODO: profile and optimize as ideal_mask below (if needed) |
227
|
|
|
|
|
|
|
|
228
|
1
|
|
|
|
|
5
|
my @mask; |
229
|
1
|
|
|
|
|
15
|
my $width = $ali->width; |
230
|
1
|
|
|
|
|
8
|
my $regex = $ali->gapmiss_regex; |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
# filter out sites with only one state |
233
|
1
|
|
|
|
|
14
|
for (my $site = 0; $site < $width; $site++) { |
234
|
|
|
|
|
|
|
my @states = # get states |
235
|
209856
|
|
|
|
|
632550
|
grep { $_ !~ m/$regex/xms } # which are valid |
236
|
26232
|
|
|
|
|
942632
|
map { $_->state_at($site) } # and found at that site |
|
209856
|
|
|
|
|
472243
|
|
237
|
|
|
|
|
|
|
$ali->all_seqs; # across all seqs |
238
|
|
|
|
|
|
|
; |
239
|
|
|
|
|
|
|
#### @states |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
# check whether at least two states are seen at least twice |
242
|
26232
|
|
|
|
|
66399
|
my %count_for; |
243
|
26232
|
|
|
|
|
115709
|
$count_for{$_}++ for @states; |
244
|
|
|
|
|
|
|
#### %count_for |
245
|
26232
|
|
|
|
|
59536
|
my $state_n = grep { $count_for{$_} >= 2 } keys %count_for; |
|
60682
|
|
|
|
|
127019
|
|
246
|
|
|
|
|
|
|
#### $state_n |
247
|
26232
|
100
|
|
|
|
118997
|
push @mask, $state_n >= 2 ? 1 : 0; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
|
250
|
1
|
|
|
|
|
38
|
return $class->new( mask => \@mask ); |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
sub ideal_mask { |
255
|
13
|
|
|
13
|
1
|
41
|
my $class = shift; |
256
|
13
|
|
|
|
|
37
|
my $ali = shift; |
257
|
13
|
|
50
|
|
|
46
|
my $max_res = shift // 0; # defaults to shared gaps only |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
# convert fractional max_res to conservative integer (if needed) |
260
|
13
|
100
|
66
|
|
|
470
|
$max_res = floor($max_res * $ali->count_seqs) |
261
|
|
|
|
|
|
|
if 0 < $max_res && $max_res < 1; |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# pick up right regex based on sequence type |
264
|
13
|
|
|
|
|
54
|
my $regex = $ali->gapmiss_regex; |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# determine count of non-gap-nor-missing states for each site |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# original version: easy to understand but much too slow on large matrices |
269
|
|
|
|
|
|
|
# because of unavoidable regex recompilation for each character state |
270
|
|
|
|
|
|
|
# (lots of calls to CORE::regcomp in output Devel::NYTProf) |
271
|
|
|
|
|
|
|
# my @counts; |
272
|
|
|
|
|
|
|
# for my $seq ($ali->all_seqs) { |
273
|
|
|
|
|
|
|
# my $site = 0; |
274
|
|
|
|
|
|
|
# $counts[$site++] += ($_ !~ $regex) for $seq->all_states; |
275
|
|
|
|
|
|
|
# } |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# 'all magic comes with a price' version: the /o regex modifier does speed |
278
|
|
|
|
|
|
|
# up things but compiles the regex once for all, which leads to bugs when |
279
|
|
|
|
|
|
|
# dealing with both nucleotide and protein alignments in the same run |
280
|
|
|
|
|
|
|
# my @counts; |
281
|
|
|
|
|
|
|
# for my $seq ($ali->all_seqs) { |
282
|
|
|
|
|
|
|
# my $site = 0; |
283
|
|
|
|
|
|
|
# $counts[$site++] += ($_ !~ m/$regex/o) for $seq->all_states; |
284
|
|
|
|
|
|
|
# } |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# improved version: more convoluted but much faster |
287
|
|
|
|
|
|
|
# idea: directly search in sequence strings |
288
|
|
|
|
|
|
|
# algorithm: set maximal count for each site |
289
|
|
|
|
|
|
|
# then: decrease corrresponding count for each gap-or-missing state |
290
|
|
|
|
|
|
|
# my @counts = ( $ali->count_seqs ) x $ali->width; |
291
|
|
|
|
|
|
|
# for my $seq ($ali->all_seqs) { |
292
|
|
|
|
|
|
|
# my $str = $seq->seq; |
293
|
|
|
|
|
|
|
# while ($str =~ m/$regex/xmsg) { |
294
|
|
|
|
|
|
|
# $counts[ pos($str) - 1 ]--; |
295
|
|
|
|
|
|
|
# } |
296
|
|
|
|
|
|
|
# } |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# parallel version: quite complex and inefficient |
299
|
|
|
|
|
|
|
# setup threading |
300
|
|
|
|
|
|
|
# my $thread_n = 2; |
301
|
|
|
|
|
|
|
# my $pl = Parallel::Loops->new($thread_n); |
302
|
|
|
|
|
|
|
# my %counts; |
303
|
|
|
|
|
|
|
# $pl->share( \%counts ); |
304
|
|
|
|
|
|
|
# |
305
|
|
|
|
|
|
|
# # define sites ranges |
306
|
|
|
|
|
|
|
# my $site_n = $ali->width; |
307
|
|
|
|
|
|
|
# my $chunk = ceil( $site_n / $thread_n ); |
308
|
|
|
|
|
|
|
# my @ranges; |
309
|
|
|
|
|
|
|
# for (my $start = 0; $start < $site_n; $start += $chunk) { |
310
|
|
|
|
|
|
|
# my $end = min( $start + $chunk, $site_n ); |
311
|
|
|
|
|
|
|
# push @ranges, [ $start..$end-1 ]; |
312
|
|
|
|
|
|
|
# } |
313
|
|
|
|
|
|
|
# |
314
|
|
|
|
|
|
|
# # process ranges in parallel |
315
|
|
|
|
|
|
|
# my @seqs = $ali->all_seqs; |
316
|
|
|
|
|
|
|
# $pl->foreach( \@ranges, sub { |
317
|
|
|
|
|
|
|
# for my $seq (@seqs) { |
318
|
|
|
|
|
|
|
# $counts{$_} += $seq->state_at($_) !~ $regex for @{$_}; |
319
|
|
|
|
|
|
|
# } |
320
|
|
|
|
|
|
|
# } ); |
321
|
|
|
|
|
|
|
# |
322
|
|
|
|
|
|
|
# # filter out sites with at most max_res non-gap-nor-missing states |
323
|
|
|
|
|
|
|
# my @mask = map { $counts{$_} > $max_res ? 1 : 0 } |
324
|
|
|
|
|
|
|
# sort { $a <=> $b } keys %counts; # watch the sorting cost! |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
# current version: even more complex but even more faster |
327
|
|
|
|
|
|
|
# idea: search for (long) gaps instead of individual states |
328
|
|
|
|
|
|
|
# algorithm: set maximal count for each site |
329
|
|
|
|
|
|
|
# then: decrease counts for each stretch of gap-or-missing states |
330
|
13
|
|
|
|
|
460
|
my @counts = ( $ali->count_seqs ) x $ali->width; |
331
|
13
|
|
|
|
|
526
|
for my $seq ($ali->all_seqs) { |
332
|
160
|
|
|
|
|
4722
|
my $str = $seq->seq; |
333
|
160
|
|
|
|
|
1023
|
while ($str =~ m/($regex+)/xmsg) { # look for next gap |
334
|
7532
|
|
|
|
|
13261
|
my $end = pos($str) - 1; # compute gap start |
335
|
7532
|
|
|
|
|
12349
|
my $begin = $end - length($1) + 1; # and gap end |
336
|
7532
|
|
|
|
|
97973
|
$counts[$_]-- for $begin..$end; # decrease count over the gap |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
# filter out sites with at most max_res non-gap-nor-missing states |
341
|
13
|
100
|
|
|
|
190
|
my @mask = map { $_ > $max_res ? 1 : 0 } @counts; |
|
86155
|
|
|
|
|
134178
|
|
342
|
|
|
|
|
|
|
|
343
|
13
|
|
|
|
|
866
|
return $class->new( mask => \@mask ); |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
my %gb_parms_for = ( |
348
|
|
|
|
|
|
|
strict => [ 0.85, 8, 10, 'N' ], # emulate original MUST settings |
349
|
|
|
|
|
|
|
medium => [ 0.75, 5, 5, 'H' ], |
350
|
|
|
|
|
|
|
loose => [ 0.55, 5, 5, 'H' ], |
351
|
|
|
|
|
|
|
); |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
sub gblocks_mask { |
354
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
355
|
0
|
|
|
|
|
0
|
my $ali = shift; |
356
|
0
|
|
0
|
|
|
0
|
my $mode = shift // 'strict'; |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
# check Gblocks settings |
359
|
0
|
0
|
|
|
|
0
|
if ( !defined $gb_parms_for{$mode} ) { |
360
|
0
|
|
|
|
|
0
|
carp "[BMC] Warning: invalid Gblocks settings: $mode; using strict!"; |
361
|
0
|
|
|
|
|
0
|
$mode = 'strict'; |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# create temp .fasta infile for Gblocks |
365
|
0
|
|
|
|
|
0
|
my $infile = $ali->temp_fasta; |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# compute Gblocks parms depending on specified mode |
368
|
|
|
|
|
|
|
my ($t, $b1, $b2, $b3, $b4, $b5) = ( |
369
|
|
|
|
|
|
|
($ali->is_protein ? 'p' : 'd'), # Sequence Type |
370
|
|
|
|
|
|
|
int($ali->count_seqs / 2) + 1, # Conserved Position |
371
|
|
|
|
|
|
|
int($ali->count_seqs * $gb_parms_for{$mode}[0]), # Flank Position |
372
|
0
|
0
|
|
|
|
0
|
@{ $gb_parms_for{$mode} }[1..3], # other block parms |
|
0
|
|
|
|
|
0
|
|
373
|
|
|
|
|
|
|
); |
374
|
0
|
|
|
|
|
0
|
$b2 = max($b1, $b2); # with few seqs 0.55*n can be smaller than n/2+1 |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
# create Gblocks command |
377
|
0
|
|
|
|
|
0
|
my $cmd = "Gblocks $infile -t=$t" |
378
|
|
|
|
|
|
|
. " -b1=$b1 -b2=$b2 -b3=$b3 -b4=$b4 -b5=$b5" |
379
|
|
|
|
|
|
|
. ' -s=n > /dev/null 2> /dev/null' # minimal output |
380
|
|
|
|
|
|
|
; |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
# try to robustly execute Gblocks |
383
|
0
|
|
|
|
|
0
|
my $ret_code = system( [ 1, 127 ], $cmd); # Gblocks always returns 1?!? |
384
|
0
|
0
|
|
|
|
0
|
if ($ret_code == 127) { |
385
|
0
|
|
|
|
|
0
|
carp '[BMC] Warning: cannot execute Gblocks command;' |
386
|
|
|
|
|
|
|
. ' returning neutral mask!'; |
387
|
0
|
|
|
|
|
0
|
return $class->neutral_mask($ali); |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
# TODO: try to bypass shell (need for absolute path to executable then) |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# parse Gblocks output to get blocks |
392
|
0
|
|
|
|
|
0
|
my $outfile = $infile . '-gb.htm'; |
393
|
0
|
|
|
|
|
0
|
open my $out, '<', $outfile; |
394
|
|
|
|
|
|
|
|
395
|
0
|
|
|
|
|
0
|
my @blocks; |
396
|
0
|
|
|
|
|
0
|
while (my $line = <$out>) { |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
# only use the 'Flanks:' line |
399
|
0
|
|
|
|
|
0
|
my ($flanks) = $line =~ m/\AFlanks:\s+(.*)/xms; |
400
|
0
|
0
|
|
|
|
0
|
if (defined $flanks) { |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
# isolate numbers |
403
|
0
|
|
|
|
|
0
|
chomp $flanks; |
404
|
0
|
|
|
|
|
0
|
$flanks =~ s/[\[\]]//xmsg; |
405
|
0
|
|
|
|
|
0
|
my @bound_pairs = split /\s+/xms, $flanks; |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# take number pairs as block bounds |
408
|
0
|
|
|
|
|
0
|
my $it = natatime 2, @bound_pairs; |
409
|
0
|
|
|
|
|
0
|
while ( my @bounds = $it->() ) { |
410
|
0
|
|
|
|
|
0
|
push @blocks, \@bounds; |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# ...and the 'New number of positions' line |
415
|
|
|
|
|
|
|
# to check Gblocks didn't shrink the original Ali due to shared gaps |
416
|
|
|
|
|
|
|
# which would left-shift all blocks bounds! |
417
|
0
|
|
|
|
|
0
|
my ($width) = $line =~ m/original\s+(\d+)\s+positions/xms; |
418
|
0
|
0
|
0
|
|
|
0
|
if (defined $width && $width != $ali->width) { |
419
|
0
|
|
|
|
|
0
|
carp '[BMC] Warning: shared gaps detected; returning neutral mask!'; |
420
|
0
|
|
|
|
|
0
|
return $class->neutral_mask($ali); |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
} |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# unlink temp files |
425
|
0
|
|
|
|
|
0
|
file( $infile)->remove; |
426
|
0
|
|
|
|
|
0
|
file($outfile)->remove; |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# compute SeqMask from Gblocks blocks |
429
|
0
|
|
|
|
|
0
|
return $class->blocks2mask( \@blocks ); |
430
|
|
|
|
|
|
|
} |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
my %bmge_parms_for = ( |
434
|
|
|
|
|
|
|
strict => [ 0.4, 0.05 ], |
435
|
|
|
|
|
|
|
medium => [ 0.5, 0.20 ], |
436
|
|
|
|
|
|
|
loose => [ 0.6, 0.40 ], |
437
|
|
|
|
|
|
|
); |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
sub bmge_mask { |
440
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
441
|
0
|
|
|
|
|
0
|
my $ali = shift; |
442
|
0
|
|
0
|
|
|
0
|
my $mode = shift // 'strict'; |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# check BMGE settings |
445
|
0
|
0
|
|
|
|
0
|
if ( !defined $bmge_parms_for{$mode} ) { |
446
|
0
|
|
|
|
|
0
|
carp "[BMC] Warning: invalid BMGE settings: $mode; using strict!"; |
447
|
0
|
|
|
|
|
0
|
$mode = 'strict'; |
448
|
|
|
|
|
|
|
} |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
# create temp .fasta infile for BMGE |
451
|
0
|
|
|
|
|
0
|
my $infile = $ali->temp_fasta; |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
# compute BMGE parms depending on specified mode |
454
|
0
|
|
|
|
|
0
|
my ($entropy_cutoff, $gap_cutoff) = @{ $bmge_parms_for{$mode} }; |
|
0
|
|
|
|
|
0
|
|
455
|
0
|
0
|
|
|
|
0
|
my $coding_type = $ali->is_protein ? 'AA' : 'DNA'; |
456
|
0
|
|
|
|
|
0
|
my $outfile = $infile . '-bmge.htm'; |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
# create BMGE command |
459
|
0
|
|
|
|
|
0
|
my $cmd = "bmge.sh $infile" |
460
|
|
|
|
|
|
|
. " $coding_type $entropy_cutoff $gap_cutoff $outfile > /dev/null"; |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# try to robustly execute BMGE |
463
|
0
|
|
|
|
|
0
|
my $ret_code = system( [ 0, 127 ], $cmd); |
464
|
0
|
0
|
|
|
|
0
|
if ($ret_code == 127) { |
465
|
0
|
|
|
|
|
0
|
carp '[BMC] Warning: cannot execute BMGE command;' |
466
|
|
|
|
|
|
|
. ' returning neutral mask!'; |
467
|
0
|
|
|
|
|
0
|
return $class->neutral_mask($ali); |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
# TODO: try to bypass shell (need for absolute path to executable then) |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
# parse BMGE output to get selected sites |
472
|
0
|
|
|
|
|
0
|
open my $out, '<', $outfile; |
473
|
0
|
|
|
|
|
0
|
my $len = $ali->width; |
474
|
0
|
|
|
|
|
0
|
my $sites; |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
LINE: |
477
|
0
|
|
|
|
|
0
|
while (my $line = <$out>) { |
478
|
0
|
0
|
|
|
|
0
|
if ($line =~ m/\s+ selected: \s+ ([\ (0-9)]+)/xms) { |
479
|
0
|
|
|
|
|
0
|
$sites = [ map { $_ - 1 } split /\s+/xms, $1 ]; |
|
0
|
|
|
|
|
0
|
|
480
|
0
|
|
|
|
|
0
|
last LINE; |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
# unlink temp files |
485
|
0
|
|
|
|
|
0
|
file($infile)->remove; |
486
|
0
|
|
|
|
|
0
|
file($outfile)->remove; |
487
|
|
|
|
|
|
|
|
488
|
0
|
|
|
|
|
0
|
return $class->custom_mask($len, $sites); |
489
|
|
|
|
|
|
|
} |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
sub and_mask { |
493
|
1
|
|
|
1
|
1
|
27
|
my $class = shift; |
494
|
1
|
|
|
|
|
4
|
my $mask1 = shift; |
495
|
1
|
|
|
|
|
3
|
my $mask2 = shift; |
496
|
|
|
|
|
|
|
|
497
|
1
|
|
|
|
|
3
|
my @mask; |
498
|
1
|
|
100
|
13
|
|
15
|
@mask = pairmap { ($a && $b) // 0 } mesh ( @{$mask1}, @{$mask2} ); |
|
13
|
|
100
|
|
|
52
|
|
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
18
|
|
499
|
|
|
|
|
|
|
|
500
|
1
|
|
|
|
|
42
|
return $class->new( mask => \@mask ); |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
sub or_mask { |
505
|
1
|
|
|
1
|
1
|
9
|
my $class = shift; |
506
|
1
|
|
|
|
|
3
|
my $mask1 = shift; |
507
|
1
|
|
|
|
|
3
|
my $mask2 = shift; |
508
|
|
|
|
|
|
|
|
509
|
1
|
|
|
|
|
3
|
my @mask; |
510
|
1
|
|
100
|
13
|
|
12
|
@mask = pairmap { ($a || $b) // 0 } mesh ( @{$mask1}, @{$mask2} ); |
|
13
|
|
100
|
|
|
46
|
|
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
10
|
|
511
|
|
|
|
|
|
|
|
512
|
1
|
|
|
|
|
40
|
return $class->new( mask => \@mask ); |
513
|
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub negative_mask { |
517
|
13
|
|
|
13
|
1
|
99
|
my $self = shift; |
518
|
13
|
|
|
|
|
26
|
my $ali = shift; |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
# negate all states over the whole Ali width |
521
|
|
|
|
|
|
|
# Note the '+ 0' to ensure proper numeric context (0 or 1) |
522
|
13
|
|
|
|
|
54
|
my $width = $ali->width; |
523
|
13
|
|
|
|
|
133
|
my @mask = map { ( not $self->state_at($_) ) + 0 } 0..$width-1; |
|
1364
|
|
|
|
|
45752
|
|
524
|
13
|
|
|
|
|
495
|
return $self->new( mask => \@mask ); |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# SeqMask-based Ali factory methods |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
before 'filtered_ali' => sub { |
532
|
|
|
|
|
|
|
## no critic (ProtectPrivateSubs) |
533
|
|
|
|
|
|
|
return Bio::MUST::Core::Ali::_premask_check( @_[1,0,2] ); # swap args |
534
|
|
|
|
|
|
|
## use critic |
535
|
|
|
|
|
|
|
}; |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
sub filtered_ali { |
538
|
|
|
|
|
|
|
my $self = shift; |
539
|
|
|
|
|
|
|
my $ali = shift; |
540
|
|
|
|
|
|
|
my $list = shift; # optional: enable masking vs. filtering |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# setup filtering or masking |
543
|
|
|
|
|
|
|
my $char = $list ? 'X' : q{}; |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# create new Ali object (extending header comment) |
546
|
|
|
|
|
|
|
# TODO: allow custom comments |
547
|
|
|
|
|
|
|
my $new_ali = Ali->new( |
548
|
|
|
|
|
|
|
comments => [ $ali->all_comments, |
549
|
|
|
|
|
|
|
'built by ' . ($list ? 'masked_ali' : 'filtered_ali') |
550
|
|
|
|
|
|
|
], |
551
|
|
|
|
|
|
|
); |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
SEQ: |
554
|
|
|
|
|
|
|
for my $seq ($ali->all_seqs) { |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
# for non-listed Seqs clone Seq to new Ali |
557
|
|
|
|
|
|
|
# Note: when filtering all Seqs will be affected |
558
|
|
|
|
|
|
|
if ( $list and not $list->is_listed($seq->full_id) ) { |
559
|
|
|
|
|
|
|
$new_ali->add_seq($seq->clone); |
560
|
|
|
|
|
|
|
next SEQ; |
561
|
|
|
|
|
|
|
} |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
# for listed Seqs copy marked sites from original Seq... |
564
|
|
|
|
|
|
|
# ... and either filter or mask others (set them to missing state) |
565
|
|
|
|
|
|
|
my $new_seq; |
566
|
|
|
|
|
|
|
my $site = 0; |
567
|
|
|
|
|
|
|
for my $state ($self->all_states) { |
568
|
|
|
|
|
|
|
$new_seq .= $state ? $seq->state_at($site) : $char; |
569
|
|
|
|
|
|
|
$site++; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
# add Seq to new Ali |
573
|
|
|
|
|
|
|
$new_ali->add_seq( |
574
|
|
|
|
|
|
|
Seq->new( seq_id => $seq->full_id, seq => $new_seq ) |
575
|
|
|
|
|
|
|
); |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
return $new_ali; |
579
|
|
|
|
|
|
|
} |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# I/O methods |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
sub load { |
586
|
1
|
|
|
1
|
1
|
179
|
my $class = shift; |
587
|
1
|
|
|
|
|
2
|
my $infile = shift; |
588
|
|
|
|
|
|
|
|
589
|
1
|
|
|
|
|
9
|
open my $in, '<', $infile; |
590
|
|
|
|
|
|
|
|
591
|
1
|
|
|
|
|
378
|
my $mask = $class->new(); |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
LINE: |
594
|
1
|
|
|
|
|
29
|
while (my $line = <$in>) { |
595
|
26234
|
|
|
|
|
41567
|
chomp $line; |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# skip empty lines and process comment lines |
598
|
26234
|
100
|
66
|
|
|
133085
|
next LINE if $line =~ $EMPTY_LINE |
599
|
|
|
|
|
|
|
|| $mask->is_comment($line); |
600
|
|
|
|
|
|
|
|
601
|
26232
|
|
|
|
|
898939
|
$mask->add_state($line); |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
|
604
|
1
|
|
|
|
|
28
|
return $mask; |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
sub store { |
610
|
4
|
|
|
4
|
1
|
17
|
my $self = shift; |
611
|
4
|
|
|
|
|
16
|
my $outfile = shift; |
612
|
|
|
|
|
|
|
|
613
|
4
|
|
|
|
|
42
|
open my $out, '>', $outfile; |
614
|
|
|
|
|
|
|
|
615
|
4
|
|
|
|
|
7757
|
print {$out} $self->header; |
|
4
|
|
|
|
|
54
|
|
616
|
4
|
|
|
|
|
23
|
say {$out} join "\n", $self->all_states; |
|
4
|
|
|
|
|
181
|
|
617
|
|
|
|
|
|
|
|
618
|
4
|
|
|
|
|
5166
|
return; |
619
|
|
|
|
|
|
|
} |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# sub load_bor { |
624
|
|
|
|
|
|
|
# #~ my $self = shift; |
625
|
|
|
|
|
|
|
# my $class = shift; |
626
|
|
|
|
|
|
|
# my $bor = shift; |
627
|
|
|
|
|
|
|
# my $ali = shift; |
628
|
|
|
|
|
|
|
# |
629
|
|
|
|
|
|
|
# #~ my $class = 'Bio::MUST::Core::SeqMask'; |
630
|
|
|
|
|
|
|
# |
631
|
|
|
|
|
|
|
# #~ open my $in, '<', $infile; |
632
|
|
|
|
|
|
|
# my $bor_file = $bor->stringify; |
633
|
|
|
|
|
|
|
# #~ tie my @rows, 'Tie::File', $bor_file; |
634
|
|
|
|
|
|
|
# #~ tie my @rows, 'Tie::File', $in; |
635
|
|
|
|
|
|
|
# #~ my $delims = $rows[-1]; |
636
|
|
|
|
|
|
|
# #~ ### $delims |
637
|
|
|
|
|
|
|
# my $delims; |
638
|
|
|
|
|
|
|
# open my $bof, '<', $bor_file; |
639
|
|
|
|
|
|
|
# while ( my $i = <$bof> ) { $delims = $i; } |
640
|
|
|
|
|
|
|
# my @blocks = pairmap { [ $a , $b ] } ( split ' ', $delims ); |
641
|
|
|
|
|
|
|
# #~ ### @blocks |
642
|
|
|
|
|
|
|
# my $mask = $class->blocks2mask(\@blocks)->mask; |
643
|
|
|
|
|
|
|
# #~ ### $mask |
644
|
|
|
|
|
|
|
# my $seq_mask = $class->new( mask => $mask); |
645
|
|
|
|
|
|
|
# my $negative = $seq_mask->negative_mask($ali); |
646
|
|
|
|
|
|
|
# return $negative; |
647
|
|
|
|
|
|
|
# } |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
sub store_una { |
655
|
1
|
|
|
1
|
1
|
5
|
my $self = shift; |
656
|
1
|
|
|
|
|
2
|
my $outfile = shift; |
657
|
1
|
|
50
|
|
|
6
|
my $args = shift // {}; # HashRef (should not be empty...) |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
# TODO: check arguments as there are no default values? |
660
|
1
|
|
|
|
|
4
|
my $ali = $args->{ali}; |
661
|
1
|
|
|
|
|
2
|
my $id = $args->{id}; |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
# search for non-gap positions in reference seq |
664
|
1
|
|
|
|
|
17
|
my $seq = $ali->get_seq_with_id($id); |
665
|
1
|
|
|
|
|
40
|
my @pos_bases = grep { !( $seq->is_gap($_) ) } 0..($seq->seq_len-1); |
|
99
|
|
|
|
|
234
|
|
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
# delete gap positions in mask |
668
|
1
|
|
|
|
|
7
|
my @mask_degap = @{ $self->mask }[ @pos_bases ]; |
|
1
|
|
|
|
|
31
|
|
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
# build mask and compute blocks |
671
|
1
|
|
|
|
|
32
|
my $new_mask = $self->new( mask => \@mask_degap ); |
672
|
1
|
|
|
|
|
5
|
my $new_blocks = $new_mask->mask2blocks; |
673
|
|
|
|
|
|
|
|
674
|
1
|
|
|
|
|
6
|
open my $out, '>', $outfile; |
675
|
|
|
|
|
|
|
|
676
|
1
|
|
|
|
|
371
|
my $filename = $ali->file; |
677
|
1
|
|
|
|
|
37
|
my ($basename, $dir, $ext) = fileparse($filename, qr{\.[^.]*}xms); |
678
|
|
|
|
|
|
|
|
679
|
1
|
|
|
|
|
165
|
say {$out} "Unambigous numbering for $basename$ext based on $id"; |
|
1
|
|
|
|
|
17
|
|
680
|
1
|
|
|
|
|
4
|
for my $new_block ( @{$new_blocks} ) { |
|
1
|
|
|
|
|
7
|
|
681
|
3
|
|
|
|
|
5
|
say {$out} join '-', @{$new_block}; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
14
|
|
682
|
|
|
|
|
|
|
} |
683
|
|
|
|
|
|
|
|
684
|
1
|
|
|
|
|
80
|
return; |
685
|
|
|
|
|
|
|
} |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
sub load_blocks { |
689
|
2
|
|
|
2
|
1
|
311
|
my $class = shift; |
690
|
2
|
|
|
|
|
15
|
my $infile = shift; |
691
|
|
|
|
|
|
|
|
692
|
2
|
|
|
|
|
19
|
open my $in, '<', $infile; |
693
|
|
|
|
|
|
|
|
694
|
2
|
|
|
|
|
696
|
my $mask = $class->new(); |
695
|
2
|
|
|
|
|
6
|
my @blocks; |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
LINE: |
698
|
2
|
|
|
|
|
1174
|
while (my $line = <$in>) { |
699
|
10
|
|
|
|
|
24
|
chomp $line; |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
# skip empty lines and process comment lines |
702
|
10
|
100
|
100
|
|
|
106
|
next LINE if $line =~ $EMPTY_LINE |
703
|
|
|
|
|
|
|
|| $mask->is_comment($line); |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
# process block specifications (one or more by line) |
706
|
5
|
|
|
6
|
|
73
|
push @blocks, pairmap { [ $a , $b ] } split /\s+/xms, $line; |
|
6
|
|
|
|
|
61
|
|
707
|
|
|
|
|
|
|
} |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
# build mask from blocks and replace empty mask |
710
|
|
|
|
|
|
|
# this is needed to preserve potential comments |
711
|
2
|
|
|
|
|
22
|
$mask->_set_mask( $class->blocks2mask( \@blocks )->mask ); |
712
|
|
|
|
|
|
|
|
713
|
2
|
|
|
|
|
76
|
return $mask; |
714
|
|
|
|
|
|
|
} |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
sub store_blocks { |
718
|
1
|
|
|
1
|
1
|
4
|
my $self = shift; |
719
|
1
|
|
|
|
|
3
|
my $outfile = shift; |
720
|
|
|
|
|
|
|
|
721
|
1
|
|
|
|
|
10
|
open my $out, '>', $outfile; |
722
|
|
|
|
|
|
|
|
723
|
1
|
|
|
|
|
360
|
say {$out} "# Coordinates of blocks \n#"; |
|
1
|
|
|
|
|
311
|
|
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
# compute blocks from mask |
726
|
1
|
|
|
|
|
10
|
my $blocks = $self->mask2blocks; |
727
|
|
|
|
|
|
|
|
728
|
1
|
|
|
|
|
5
|
for my $block ( @{$blocks} ) { |
|
1
|
|
|
|
|
8
|
|
729
|
6
|
|
|
|
|
12
|
say {$out} join "\t", @{$block}; |
|
6
|
|
|
|
|
12
|
|
|
6
|
|
|
|
|
22
|
|
730
|
|
|
|
|
|
|
} |
731
|
|
|
|
|
|
|
|
732
|
1
|
|
|
|
|
62
|
return; |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
736
|
|
|
|
|
|
|
1; |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
__END__ |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
=pod |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
=head1 NAME |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
Bio::MUST::Core::SeqMask - Sequence mask for selecting specific sites |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
=head1 VERSION |
747
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
version 0.212670 |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
=head1 SYNOPSIS |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
# TODO |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
=head1 DESCRIPTION |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
# TODO |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=head1 METHODS |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=head2 first_site |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
=head2 last_site |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
=head2 count_sites |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
=head2 coverage |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
=head2 empty_mask |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
=head2 custom_mask |
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
=head2 mark_block |
773
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
=head2 unmark_block |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
=head2 mask2blocks |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
=head2 blocks2mask |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
=head2 neutral_mask |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
=head2 variable_mask |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
=head2 parsimony_mask |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
=head2 ideal_mask |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
=head2 gblocks_mask |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=head2 bmge_mask |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=head2 and_mask |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
=head2 or_mask |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
=head2 negative_mask |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=head2 filtered_ali |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
=head2 load |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
=head2 store |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
=head2 load_bor |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
=head2 store_bor |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
=head2 load_una |
809
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
=head2 store_una |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=head2 load_blocks |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
=head2 store_blocks |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
=head1 AUTHOR |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
Denis BAURAIN <denis.baurain@uliege.be> |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
=for stopwords Catherine COLSON Raphael LEONARD |
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
=over 4 |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
=item * |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
Catherine COLSON <ccolson@doct.uliege.be> |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=item * |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
Raphael LEONARD <rleonard@doct.uliege.be> |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
=back |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
841
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
=cut |