line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::MUST::Apps::Roles::OrgProcable; |
2
|
|
|
|
|
|
|
# ABSTRACT: Attributes and methods common to OrgProcessor objects |
3
|
|
|
|
|
|
|
$Bio::MUST::Apps::Roles::OrgProcable::VERSION = '0.210570'; |
4
|
1
|
|
|
1
|
|
761
|
use Moose::Role; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
11
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
5818
|
use autodie; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
8
|
|
7
|
1
|
|
|
1
|
|
5557
|
use feature qw(say); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
102
|
|
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
8
|
use Smart::Comments -ENV; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
1564
|
use List::AllUtils qw(each_array); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
79
|
|
12
|
1
|
|
|
1
|
|
601
|
use Number::Interval; |
|
1
|
|
|
|
|
2944
|
|
|
1
|
|
|
|
|
36
|
|
13
|
|
|
|
|
|
|
|
14
|
1
|
|
|
1
|
|
9
|
use Bio::MUST::Core; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
25
|
|
15
|
1
|
|
|
1
|
|
6
|
use aliased 'Bio::MUST::Core::SeqId'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
16
|
1
|
|
|
1
|
|
208
|
use aliased 'Bio::MUST::Core::SeqMask'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
4
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
has 'org' => ( |
20
|
|
|
|
|
|
|
is => 'ro', |
21
|
|
|
|
|
|
|
isa => 'Str', |
22
|
|
|
|
|
|
|
required => 1, |
23
|
|
|
|
|
|
|
); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
has 'banks' => ( |
27
|
|
|
|
|
|
|
traits => ['Array'], |
28
|
|
|
|
|
|
|
is => 'ro', |
29
|
|
|
|
|
|
|
isa => 'ArrayRef[Str]', |
30
|
|
|
|
|
|
|
required => 1, |
31
|
|
|
|
|
|
|
handles => { |
32
|
|
|
|
|
|
|
all_banks => 'elements', |
33
|
|
|
|
|
|
|
}, |
34
|
|
|
|
|
|
|
); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
has 'code' => ( |
38
|
|
|
|
|
|
|
is => 'ro', |
39
|
|
|
|
|
|
|
isa => 'Str', |
40
|
|
|
|
|
|
|
default => '1', |
41
|
|
|
|
|
|
|
); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
sub _collect_hsps { ## no critic (RequireArgUnpacking) |
45
|
0
|
|
|
0
|
|
|
my $self = shift; |
46
|
0
|
|
|
|
|
|
my %args = @_; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
my ($parser, $protein, $transcript, $mask_for, $hit_for, $strand_for) |
49
|
0
|
|
|
|
|
|
= @args{ qw(parser protein transcript mask_for hit_for strand_for) }; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Note: this method should work both for |
52
|
|
|
|
|
|
|
# - XML HSPs within a single hit (1331) |
53
|
|
|
|
|
|
|
# - multi-query/multi-hit tabular HSPs (42) |
54
|
0
|
|
0
|
|
|
|
$parser //= $transcript; |
55
|
|
|
|
|
|
|
|
56
|
0
|
|
|
|
|
|
while (my $hsp = $parser->next_hsp) { |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# get mask key from query protein (1331) or hit hsp (42) |
59
|
0
|
0
|
|
|
|
|
my $key = $transcript ? $protein->query_def : $hsp->hit_id; |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# update coverage on corresponding hit |
62
|
0
|
|
|
|
|
|
push @{ $mask_for->{$key} }, Number::Interval->new( |
|
0
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
min => $hsp->hit_start, max => $hsp->hit_end, |
64
|
|
|
|
|
|
|
incmin => 1, incmax => 1, |
65
|
|
|
|
|
|
|
); |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# record protein => best transcript pair (only for 1331) |
68
|
|
|
|
|
|
|
# Note: we do this here rather than in leel OrgProcessor for symmetry |
69
|
0
|
0
|
0
|
|
|
|
$hit_for->{$key} //= $transcript->id |
70
|
|
|
|
|
|
|
if defined $hit_for; |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# record strand (only for 42) |
73
|
0
|
0
|
|
|
|
|
$strand_for->{$key} += $hsp->hit_strand |
74
|
|
|
|
|
|
|
if defined $strand_for; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
0
|
|
|
|
|
|
return; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
sub _fetch_and_trim_hits { ## no critic (RequireArgUnpacking) |
82
|
0
|
|
|
0
|
|
|
my $self = shift; |
83
|
0
|
|
|
|
|
|
my %args = @_; |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
my ($blastdb, $mask_for, $hit_for, $strand_for) |
86
|
0
|
|
|
|
|
|
= @args{ qw(blastdb mask_for hit_for strand_for) }; |
87
|
|
|
|
|
|
|
|
88
|
0
|
|
|
|
|
|
my $rp = $self->ali_proc->run_proc; |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# compute seq chunks to extract from all hits (default behavior) |
91
|
0
|
|
|
|
|
|
my @entries; |
92
|
0
|
0
|
|
|
|
|
if ($rp->trim_homologues eq 'on') { |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# tie might help making app completely deterministic |
95
|
0
|
|
|
|
|
|
tie my %chunks_for, 'Tie::IxHash'; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# loop through masks to consolidate neighboring ranges into chunks |
98
|
|
|
|
|
|
|
# Note: while ranges always deal with hit seqs... |
99
|
|
|
|
|
|
|
# ... the key is either a protein abbr_id (1331) or a hit id (42) |
100
|
0
|
|
|
|
|
|
while ( my ($key, $mask) = each %{$mask_for} ) { |
|
0
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# first fetch seed range (corresponding to the best HSP) |
103
|
|
|
|
|
|
|
# this will be useful for 1331 only (see below) |
104
|
0
|
|
|
|
|
|
my $seed = $mask->[0]->min; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# then sort ranges in ascending order |
107
|
0
|
|
|
|
|
|
my @ranges = sort { $a->min <=> $b->min } @{$mask}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# merge successive ranges not farther away than hit_max_shift |
110
|
0
|
|
|
|
|
|
my $range1 = shift @ranges; |
111
|
0
|
|
|
|
|
|
while (my $range2 = shift @ranges) { |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# too distant: store current chunk and start new one |
114
|
0
|
0
|
|
|
|
|
if ( $range2->min - $range1->max > $rp->trim_max_shift) { |
115
|
0
|
|
|
|
|
|
push @{ $chunks_for{$key} }, $range1; |
|
0
|
|
|
|
|
|
|
116
|
0
|
|
|
|
|
|
$range1 = $range2; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# close enough: extend current chunk |
120
|
|
|
|
|
|
|
else { |
121
|
0
|
|
|
|
|
|
$range1->max( |
122
|
|
|
|
|
|
|
List::AllUtils::max( $range1->max, $range2->max ) |
123
|
|
|
|
|
|
|
); # Note: this is needed as BLAST can return HSPs |
124
|
|
|
|
|
|
|
} # that are completely included within better HSPs... |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# store last chunk |
128
|
0
|
|
|
|
|
|
push @{ $chunks_for{$key} }, $range1; |
|
0
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# only keep the (extended) range containing seed (only for 1331) |
131
|
|
|
|
|
|
|
# this allows extracting the correct seq in case of paralogy |
132
|
0
|
0
|
|
|
|
|
if (defined $hit_for) { |
133
|
|
|
|
|
|
|
$chunks_for{$key} = [ |
134
|
0
|
|
|
|
|
|
grep { $_->contains($seed) } @{ $chunks_for{$key} } |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
]; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# format chunks for retrieval with blastdbcmd... |
140
|
|
|
|
|
|
|
# ... allowing for hit_extra_margin (no risk of out-of-bound max) |
141
|
|
|
|
|
|
|
# keys are converted to hit ids on the fly (only for 1331) |
142
|
0
|
|
|
|
|
|
while ( my ($key, $chunks) = each %chunks_for ) { |
143
|
|
|
|
|
|
|
push @entries, map { |
144
|
0
|
0
|
|
|
|
|
(defined $hit_for ? $hit_for->{$key} : $key) . q{ } |
145
|
|
|
|
|
|
|
. List::AllUtils::max(1, $_->min - $rp->trim_extra_margin) |
146
|
|
|
|
|
|
|
. q{-} . ($_->max + $rp->trim_extra_margin) |
147
|
0
|
|
|
|
|
|
} @{$chunks}; |
|
0
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
} |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# otherwise simply extract whole hits |
152
|
|
|
|
|
|
|
else { |
153
|
|
|
|
|
|
|
@entries = map { # again on the fly (see above) |
154
|
0
|
0
|
|
|
|
|
defined $hit_for ? $hit_for->{$_} : $_ |
155
|
0
|
|
|
|
|
|
} keys %{$mask_for}; |
|
0
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# fetch hits from BLAST database |
159
|
|
|
|
|
|
|
# in any case extracted seqs are hits (for both 1331 and 42) |
160
|
|
|
|
|
|
|
# trimming is left to blastdbcmd (should be very efficient) |
161
|
0
|
|
|
|
|
|
my @hits = $blastdb->blastdbcmd( \@entries )->all_seqs; |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# rename hits after (long) id of corresponding protein (only for 1331) |
164
|
0
|
0
|
|
|
|
|
if (defined $hit_for) { |
165
|
0
|
|
|
|
|
|
my @abbr_ids = keys %{$mask_for}; |
|
0
|
|
|
|
|
|
|
166
|
0
|
|
|
|
|
|
my $ea = each_array(@abbr_ids, @hits); |
167
|
0
|
|
|
|
|
|
while (my ($abbr_id, $hit) = $ea->() ) { |
168
|
0
|
|
|
|
|
|
my $new_id = $self->long_id_for($abbr_id); |
169
|
0
|
|
|
|
|
|
$hit->set_seq_id( SeqId->new( full_id => $new_id ) ); |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# rename hits to fill in tax-reports (only for 42) |
174
|
|
|
|
|
|
|
# this is only needed when trim_homologues is on (default behavior) |
175
|
0
|
0
|
0
|
|
|
|
if (defined $strand_for && $rp->trim_homologues eq 'on') { |
176
|
0
|
|
|
|
|
|
my %count_for; |
177
|
0
|
|
|
|
|
|
my $ea = each_array(@entries, @hits); |
178
|
0
|
|
|
|
|
|
while (my ($entry, $hit) = $ea->() ) { |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# extract details from entry |
181
|
0
|
|
|
|
|
|
my ($hit_id, $start, $end) = $entry =~ m/(\S+)\s(\d+)\-(\d+)/xms; |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# compute hit strand (through HSPs votes) |
184
|
|
|
|
|
|
|
# Note: strand might be inaccurate in case of multiple chunks |
185
|
0
|
0
|
|
|
|
|
my $strand = $strand_for->{$hit_id} >= 0 ? 1 : -1; |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# embed range and strand in hit_id |
188
|
0
|
|
|
|
|
|
my $new_id = join ':::', $hit_id, $start, $end, $strand; |
189
|
0
|
|
|
|
|
|
$hit->set_seq_id( SeqId->new( full_id => $new_id ) ); |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
0
|
|
|
|
|
|
return @hits; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
sub display { ## no critic (RequireArgUnpacking) |
198
|
0
|
|
|
0
|
0
|
|
return join "\n--- ", q{}, @_ |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
|
202
|
1
|
|
|
1
|
|
1314
|
no Moose::Role; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
10
|
|
203
|
|
|
|
|
|
|
1; |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
__END__ |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=pod |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head1 NAME |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Bio::MUST::Apps::Roles::OrgProcable - Attributes and methods common to OrgProcessor objects |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head1 VERSION |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
version 0.210570 |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head1 SYNOPSIS |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# TODO |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 DESCRIPTION |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# TODO |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head1 AUTHOR |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Denis BAURAIN <denis.baurain@uliege.be> |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
234
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut |