line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Palantir::Roles::Fillable; |
2
|
|
|
|
|
|
|
# ABSTRACT: Fillable Moose role for the construction of DomainPlus object arrays and Exploratory methods |
3
|
|
|
|
|
|
|
$Bio::Palantir::Roles::Fillable::VERSION = '0.200700'; |
4
|
1
|
|
|
1
|
|
593
|
use Moose::Role; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
9
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
4733
|
use autodie; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
8
|
|
7
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
4553
|
use Const::Fast; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
8
|
|
9
|
1
|
|
|
1
|
|
949
|
use File::ShareDir qw(dist_dir); |
|
1
|
|
|
|
|
18934
|
|
|
1
|
|
|
|
|
44
|
|
10
|
1
|
|
|
1
|
|
7
|
use File::Temp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
63
|
|
11
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
6
|
use aliased 'Bio::FastParsers::Hmmer::DomTable'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
7
|
|
13
|
1
|
|
|
1
|
|
190273
|
use aliased 'Bio::Palantir::Refiner::DomainPlus'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
6
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
const my $DATA_PATH => dist_dir('Bio-Palantir') . '/'; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# pHMM database sources: |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# Weber T, Blin K, Duddela S, et al. antiSMASH 3.0-a comprehensive resource for |
21
|
|
|
|
|
|
|
# the genome mining of biosynthetic gene clusters. Nucleic Acids Res. |
22
|
|
|
|
|
|
|
# 2015;43(W1):W237âW243. doi:10.1093/nar/gkv437 |
23
|
|
|
|
|
|
|
# |
24
|
|
|
|
|
|
|
# Khayatt, B. I., Overmars, L., Siezen, R. J., & Francke, C. (2013). |
25
|
|
|
|
|
|
|
# Classification of the Adenylation and Acyl-Transferase Activity of NRPS and |
26
|
|
|
|
|
|
|
# PKS Systems Using Ensembles of Substrate Specific Hidden Markov Models. |
27
|
|
|
|
|
|
|
# PLoS ONE, 8(4). http://doi.org/10.1371/journal.pone.0062136 |
28
|
|
|
|
|
|
|
# |
29
|
|
|
|
|
|
|
# Bushley, K. E., & Turgeon, B. G. (2010). Phylogenomics reveals subfamilies of |
30
|
|
|
|
|
|
|
# fungal nonribosomal peptide synthetases and their evolutionary relationships. |
31
|
|
|
|
|
|
|
# BMC Evolutionary Biology, 10(1), 26. http://doi.org/10.1186/1471-2148-10-26 |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# biological data |
34
|
|
|
|
|
|
|
my %ADJUSTMENT_FOR = ( # length column useless: this information can be obtained by OO variable |
35
|
|
|
|
|
|
|
# for antismash activity |
36
|
|
|
|
|
|
|
A => { start => 20, end => 100, length => 418 }, |
37
|
|
|
|
|
|
|
Red => { start => 0, end => 0 , length => 242 }, # equivalent of TD |
38
|
|
|
|
|
|
|
# NRPS |
39
|
|
|
|
|
|
|
Condensation => { start => 0, end => 150, length => 300 }, |
40
|
|
|
|
|
|
|
Condensation_LCL => { start => 0, end => 154, length => 296 }, |
41
|
|
|
|
|
|
|
Condensation_DCL => { start => 0, end => 150, length => 300 }, |
42
|
|
|
|
|
|
|
Condensation_Starter => { start => 0, end => 150, length => 300 }, |
43
|
|
|
|
|
|
|
Condensation_Dual => { start => 0, end => 157, length => 293 }, |
44
|
|
|
|
|
|
|
Epimerization => { start => 0, end => 133, length => 317 }, |
45
|
|
|
|
|
|
|
Heterocyclization => { start => 0, end => 150, length => 300 }, |
46
|
|
|
|
|
|
|
X => { start => 0, end => 147, length => 303 }, |
47
|
|
|
|
|
|
|
Cglyc => { start => 0, end => 151, length => 299 }, |
48
|
|
|
|
|
|
|
Condensation_fungi => { start => 0, end => 251, length => 209 }, # où faut-il ajuster ? |
49
|
|
|
|
|
|
|
'AMP-binding' => { start => 27, end => 66, length => 418 }, |
50
|
|
|
|
|
|
|
'AMP-binding_fungi' => { start => 4, end => 0 , length => 507 }, |
51
|
|
|
|
|
|
|
ACPS => { start => 0, end => 0 , length => 78 }, |
52
|
|
|
|
|
|
|
Aminotran_1_2 => { start => 0, end => 0 , length => 363 }, |
53
|
|
|
|
|
|
|
Aminotran_3 => { start => 0, end => 0 , length => 339 }, |
54
|
|
|
|
|
|
|
Aminotran_4 => { start => 0, end => 0 , length => 232 }, |
55
|
|
|
|
|
|
|
Aminotran_5 => { start => 0, end => 0 , length => 371 }, |
56
|
|
|
|
|
|
|
'A-OX' => { start => 0, end => -250,length => 780 }, # it overlaps following domains |
57
|
|
|
|
|
|
|
B => { start => 0, end => 0 , length => 365 }, |
58
|
|
|
|
|
|
|
cMT => { start => 0, end => 0 , length => 230 }, |
59
|
|
|
|
|
|
|
ECH => { start => 0, end => 0 , length => 245 }, |
60
|
|
|
|
|
|
|
Epimerization => { start => 0, end => 0 , length => 317 }, |
61
|
|
|
|
|
|
|
F => { start => 0, end => 0 , length => 127 }, |
62
|
|
|
|
|
|
|
Fkbh => { start => 0, end => 0 , length => 142 }, |
63
|
|
|
|
|
|
|
GNAT => { start => 0, end => 0 , length => 139 }, |
64
|
|
|
|
|
|
|
Hal => { start => 0, end => 0 , length => 222 }, |
65
|
|
|
|
|
|
|
Heterocyclization => { start => 0, end => 0 , length => 300 }, |
66
|
|
|
|
|
|
|
NAD_binding_4 => { start => 0, end => 0 , length => 249 }, |
67
|
|
|
|
|
|
|
nMT => { start => 0, end => 0 , length => 244 }, |
68
|
|
|
|
|
|
|
'NRPS-COM_Cterm' => { start => 0, end => 0 , length => 21 }, |
69
|
|
|
|
|
|
|
'NRPS-COM_Nterm' => { start => 0, end => 0 , length => 33 }, |
70
|
|
|
|
|
|
|
oMT => { start => 0, end => 0 , length => 280 }, |
71
|
|
|
|
|
|
|
PCP => { start => 15, end => 0 , length => 70 }, |
72
|
|
|
|
|
|
|
'PP-binding' => { start => 15, end => 0 , length => 70 }, |
73
|
|
|
|
|
|
|
PCP_fungi => { start => 0, end => 0 , length => 86 }, |
74
|
|
|
|
|
|
|
PS => { start => 0, end => 0 , length => 214 }, |
75
|
|
|
|
|
|
|
TD => { start => 0, end => 0 , length => 242 }, |
76
|
|
|
|
|
|
|
Thioesterase => { start => 0, end => 0 , length => 229 }, |
77
|
|
|
|
|
|
|
# PKS |
78
|
|
|
|
|
|
|
PKS_KS => { start => 8, end => 0, length => 426 }, |
79
|
|
|
|
|
|
|
PKS_KR => { start => 0, end => 0, length => 185 }, |
80
|
|
|
|
|
|
|
PKS_ER => { start => 0, end => 0, length => 313 }, |
81
|
|
|
|
|
|
|
PKS_DH => { start => 0, end => 0, length => 166 }, |
82
|
|
|
|
|
|
|
PKS_DH2 => { start => 0, end => 0, length => 233 }, |
83
|
|
|
|
|
|
|
PKS_DHt => { start => 0, end => 0, length => 236 }, |
84
|
|
|
|
|
|
|
PKS_AT => { start => 114, end => 22, length => 298 }, |
85
|
|
|
|
|
|
|
Polyketide_cyc => { start => 0, end => 0, length => 130 }, |
86
|
|
|
|
|
|
|
Polyketide_cyc2 => { start => 0, end => 0, length => 139 }, |
87
|
|
|
|
|
|
|
PKS_Docking_Nterm => { start => 0, end => 0, length => 28 }, |
88
|
|
|
|
|
|
|
PKS_Docking_Cterm => { start => 0, end => 0, length => 73 }, |
89
|
|
|
|
|
|
|
CAL_domain => { start => 0, end => 0, length => 465 }, |
90
|
|
|
|
|
|
|
'Trans-AT_docking' => { start => 0, end => 0, length => 155 }, |
91
|
|
|
|
|
|
|
ACP => { start => 0, end => 0, length => 73 }, |
92
|
|
|
|
|
|
|
ACP_beta => { start => 0, end => 0, length => 67 }, |
93
|
|
|
|
|
|
|
); |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# public methods |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub detect_domains { ## no critic (RequireArgUnpacking) |
99
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
100
|
|
|
|
|
|
|
|
101
|
0
|
|
|
|
|
|
my ($seq, $gene_pos, $gap_coords, $from_seq) = @_; #TODO switch to hash |
102
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
|
my %hit_for = %{ $self->_parse_generic_domains($seq) }; |
|
0
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# build DomainPlus objects |
106
|
0
|
|
|
|
|
|
my @domains; |
107
|
0
|
|
|
|
|
|
for my $hit (keys %hit_for) { |
108
|
|
|
|
|
|
|
push @domains, DomainPlus->new( |
109
|
0
|
|
|
|
|
|
map { $_ => $hit_for{$hit}{$_} } keys %{ $hit_for{$hit} } |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
); |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
0
|
|
|
|
|
|
$_->_set_protein_sequence($seq) for @domains; |
114
|
|
|
|
|
|
|
|
115
|
0
|
|
|
|
|
|
$self->_use_hit_information($_, $gene_pos) for @domains; |
116
|
|
|
|
|
|
|
|
117
|
0
|
|
|
|
|
|
$self->_elongate_coordinates(\@domains, $gap_coords); |
118
|
0
|
|
|
|
|
|
@domains = $self->_handle_overlaps($from_seq, @domains); |
119
|
0
|
|
|
|
|
|
$self->_refine_coordinates(@domains); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# subtype the domains |
122
|
0
|
|
|
|
|
|
$self->_get_domain_subtype($_) for @domains; |
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
|
return (@domains); |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# private methods |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub _elongate_coordinates { |
130
|
0
|
|
|
0
|
|
|
my $self = shift; |
131
|
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
my $domains = shift; |
133
|
0
|
|
|
|
|
|
my $gap_coords = shift; |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# adjustment of domain coordinates to get plausible domain sizes |
136
|
0
|
|
|
|
|
|
for my $domain (@{ $domains }) { |
|
0
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# begin at profile start even if does not match the profile, and adjust the coords depending of the domain nature |
139
|
0
|
|
|
|
|
|
my $start = $domain->begin - $domain->hmm_from; # complete the domain if the phmm didnt match entirely the sequence |
140
|
0
|
|
|
|
|
|
my $end = $domain->end + ($domain->tlen - $domain->hmm_to); |
141
|
|
|
|
|
|
|
|
142
|
0
|
0
|
|
|
|
|
if ($ADJUSTMENT_FOR{$domain->target_name}) { |
143
|
0
|
|
0
|
|
|
|
$start -= $ADJUSTMENT_FOR{$domain->target_name}{start} // 0; |
144
|
0
|
|
0
|
|
|
|
$end += $ADJUSTMENT_FOR{$domain->target_name}{end} // 0; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# handle truncated domains |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# check strain orientation |
150
|
0
|
|
|
|
|
|
my ($gene_begin, $gene_end); |
151
|
0
|
0
|
|
|
|
|
if ($self->gene_begin > $self->gene_end) { |
152
|
0
|
|
|
|
|
|
$gene_begin = $self->gene_end; |
153
|
0
|
|
|
|
|
|
$gene_end = $self->gene_begin; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
else { |
157
|
0
|
|
|
|
|
|
$gene_begin = $self->gene_begin; |
158
|
0
|
|
|
|
|
|
$gene_end = $self->gene_end; |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# standard elongation checks |
162
|
0
|
0
|
|
|
|
|
unless ($gap_coords) { |
163
|
0
|
0
|
|
|
|
|
$start = 1 if $start <= 0; # as we do not use -1 position before substr() |
164
|
0
|
0
|
|
|
|
|
$end = $gene_end if $end > $gene_end; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# gap filling elongation check |
168
|
|
|
|
|
|
|
else { |
169
|
0
|
0
|
|
|
|
|
$start = $gap_coords->[0] if $start < $gap_coords->[0]; |
170
|
0
|
0
|
|
|
|
|
$end = ($gap_coords->[1] - 1) if $end >= $gap_coords->[1]; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
0
|
|
|
|
|
|
my $size = $end - $start + 1; |
174
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
|
$domain->_set_begin($start); |
176
|
0
|
|
|
|
|
|
$domain->_set_end($end); |
177
|
0
|
|
|
|
|
|
$domain->_set_size($size); |
178
|
0
|
|
|
|
|
|
$domain->_set_coordinates( [$start, $end] ); |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
0
|
|
|
|
|
|
return; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
sub _handle_overlaps { ## no critic (RequireArgUnpacking) |
185
|
0
|
|
|
0
|
|
|
my $self = shift; |
186
|
|
|
|
|
|
|
|
187
|
0
|
|
0
|
|
|
|
my $from_seq = shift // 0; |
188
|
0
|
|
|
|
|
|
my @domains = @_; |
189
|
|
|
|
|
|
|
my @sorted_domains |
190
|
0
|
0
|
|
|
|
|
= sort { $b->score <=> $a->{score} || $a->begin <=> $b->begin } |
|
0
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
@domains |
192
|
|
|
|
|
|
|
; |
193
|
|
|
|
|
|
|
|
194
|
0
|
|
|
|
|
|
my %deja_vu; |
195
|
|
|
|
|
|
|
my @non_overlapping_domains; |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
REF: |
198
|
0
|
|
|
|
|
|
for my $ref_hit ( 0..(@sorted_domains - 1) ) { |
199
|
|
|
|
|
|
|
|
200
|
0
|
0
|
|
|
|
|
next REF if $deja_vu{$ref_hit}; |
201
|
|
|
|
|
|
|
|
202
|
0
|
|
|
|
|
|
my ($x1, $y1) = map { $sorted_domains[$ref_hit]->$_ } qw(begin end); |
|
0
|
|
|
|
|
|
|
203
|
0
|
|
|
|
|
|
$deja_vu{$ref_hit} = 1; |
204
|
|
|
|
|
|
|
|
205
|
0
|
|
|
|
|
|
my @overlaps = $ref_hit; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
CMP: |
208
|
0
|
|
|
|
|
|
for my $cmp_hit ( 0..(@sorted_domains - 1) ) { |
209
|
|
|
|
|
|
|
|
210
|
0
|
0
|
|
|
|
|
next CMP if $deja_vu{$cmp_hit}; |
211
|
|
|
|
|
|
|
|
212
|
0
|
|
|
|
|
|
my ($x2, $y2) = map { $sorted_domains[$cmp_hit]->$_ } qw(begin end); |
|
0
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
|
214
|
0
|
0
|
0
|
|
|
|
unless ($x2 > $y1 || $y2 < $x1) { ## no critic (ProhibitNegativeExpressionsInUnlessAndUntilConditions) |
215
|
|
|
|
|
|
|
|
216
|
0
|
|
|
|
|
|
push @overlaps, $cmp_hit; |
217
|
|
|
|
|
|
|
|
218
|
0
|
0
|
|
|
|
|
if ($from_seq == 1) { |
219
|
0
|
0
|
|
|
|
|
$deja_vu{$cmp_hit} = 1 |
220
|
|
|
|
|
|
|
if $sorted_domains[$cmp_hit]->class |
221
|
|
|
|
|
|
|
eq $sorted_domains[$ref_hit]->class |
222
|
|
|
|
|
|
|
; # avoid superposition of domain of the same class |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
else { |
226
|
0
|
|
|
|
|
|
$deja_vu{$cmp_hit} = 1; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
0
|
|
|
|
|
|
push @non_overlapping_domains, $sorted_domains[ $overlaps[0] ]; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
0
|
|
|
|
|
|
return(@non_overlapping_domains); |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub _refine_coordinates { ## no critic (RequireArgUnpacking) |
238
|
0
|
|
|
0
|
|
|
my $self = shift; |
239
|
|
|
|
|
|
|
|
240
|
0
|
|
|
|
|
|
my @domains = @_; |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# final refinment of coordinates |
243
|
0
|
|
|
|
|
|
my @sorted_domains = sort { $a->begin <=> $b->begin } @domains; |
|
0
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
245
|
0
|
|
|
|
|
|
for my $i (0..(scalar @sorted_domains - 2)) { |
246
|
|
|
|
|
|
|
|
247
|
0
|
0
|
|
|
|
|
if ($sorted_domains[$i]->end >= $sorted_domains[$i + 1]->begin) { # si le hit recouvre la fin du hit précént |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
my $adjust_start |
250
|
|
|
|
|
|
|
= $ADJUSTMENT_FOR{ $sorted_domains[$i + 1]->target_name }{start} |
251
|
0
|
|
0
|
|
|
|
// 0 |
252
|
|
|
|
|
|
|
; |
253
|
|
|
|
|
|
|
|
254
|
0
|
0
|
|
|
|
|
if ($adjust_start > 0) { # si ajustement start domaine suivant |
255
|
|
|
|
|
|
|
|
256
|
0
|
0
|
|
|
|
|
if ( ($sorted_domains[$i + 1]->begin + $adjust_start) |
257
|
|
|
|
|
|
|
> $sorted_domains[$i]->end) { # si conserver une partie de l'ajustement est possible |
258
|
|
|
|
|
|
|
# et si la partie qui empiète le hit précédent est uen région ajoutée artificiellement --> réduire la taille de la région ajoutée (jusqu'au pHMM tout au plus |
259
|
|
|
|
|
|
|
# il vaut mieux commencer par éliminer les parties ajoutées en N-ter d'abord, car elles sont assez limitées. |
260
|
0
|
|
|
|
|
|
$sorted_domains[$i + 1]->_set_begin( |
261
|
|
|
|
|
|
|
$sorted_domains[$i]->end + 1); |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
else { # sinon déduire ajustement start sans endommager les coordonées initiales du domaine |
265
|
0
|
|
|
|
|
|
$sorted_domains[$i + 1]->_set_begin( |
266
|
|
|
|
|
|
|
$sorted_domains[$i + 1]->begin + $adjust_start); |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
0
|
0
|
|
|
|
|
if ($sorted_domains[$i]->end >= $sorted_domains[$i + 1]->begin) { # si cela ne résoud pas le problème, on peut réduire la taille de la région ajoutée C-ter du domaine précédent (jusqu'au pHMM tout au plus |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
my $adjust_end |
274
|
|
|
|
|
|
|
= $ADJUSTMENT_FOR{ $sorted_domains[$i]->target_name }{end} |
275
|
0
|
|
0
|
|
|
|
// 0 |
276
|
|
|
|
|
|
|
; |
277
|
|
|
|
|
|
|
|
278
|
0
|
0
|
|
|
|
|
if ($adjust_end > 0) { # si ajustement end |
279
|
|
|
|
|
|
|
|
280
|
0
|
0
|
|
|
|
|
if ( ($sorted_domains[$i]->end - $adjust_end) |
281
|
|
|
|
|
|
|
< $sorted_domains[$i + 1]->begin) { # si conserver une partie de l'ajustement est possible |
282
|
|
|
|
|
|
|
|
283
|
0
|
|
|
|
|
|
$sorted_domains[$i]->_set_end( |
284
|
|
|
|
|
|
|
$sorted_domains[$i + 1]->begin - 1); |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
else { # sinon déduire ajustement end sans endommager les coordonées initiales du domaine |
288
|
|
|
|
|
|
|
|
289
|
0
|
|
|
|
|
|
$sorted_domains[$i]->_set_end($sorted_domains[$i]->end - $adjust_end); |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
# ne rien faire si les parties qui s'empiètent font partie du pHMM --> comment pourrait-on les distinguer ? |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# attribute refined sequence |
297
|
0
|
|
|
|
|
|
for my $domain (@domains) { |
298
|
0
|
|
|
|
|
|
$domain->_set_size($domain->end - $domain->begin + 1); |
299
|
|
|
|
|
|
|
|
300
|
0
|
|
|
|
|
|
my ($seq) = $self->protein_sequence; # list context |
301
|
0
|
|
|
|
|
|
$domain->_set_protein_sequence( |
302
|
|
|
|
|
|
|
substr($seq, $domain->begin - 1, $domain->size) ); |
303
|
|
|
|
|
|
|
} |
304
|
|
|
|
|
|
|
|
305
|
0
|
|
|
|
|
|
return; |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
sub _get_domain_subtype { |
309
|
0
|
|
|
0
|
|
|
my $self = shift; |
310
|
|
|
|
|
|
|
|
311
|
0
|
|
|
|
|
|
my $domain = shift; |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# search for A & AT substrate specificity and C & KS subtypes |
314
|
0
|
|
|
|
|
|
my %hmmdb_for = ( |
315
|
|
|
|
|
|
|
A => $DATA_PATH . 'A_specificity.hmm', |
316
|
|
|
|
|
|
|
AT => $DATA_PATH . 'AT_specificity.hmm', |
317
|
|
|
|
|
|
|
C => $DATA_PATH . 'C_subtypes.hmm', |
318
|
|
|
|
|
|
|
KS => $DATA_PATH . 'KS_subtypes.hmm', |
319
|
|
|
|
|
|
|
); |
320
|
|
|
|
|
|
|
|
321
|
0
|
0
|
|
|
|
|
unless ($hmmdb_for{$domain->symbol}) { |
322
|
|
|
|
|
|
|
|
323
|
0
|
0
|
|
|
|
|
if ($domain->function =~ m/MT | ^Amt$ | Aminotran/xms) { |
324
|
0
|
|
|
|
|
|
$domain->_set_subtype($domain->target_name); |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
else { |
328
|
0
|
|
|
|
|
|
$domain->_set_subtype('NULL'); |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
0
|
|
|
|
|
|
$domain->_set_subtype_evalue('NULL'); |
332
|
0
|
|
|
|
|
|
$domain->_set_subtype_score('NULL'); |
333
|
0
|
|
|
|
|
|
return; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
0
|
|
|
|
|
|
my $seq = $domain->protein_sequence; |
337
|
|
|
|
|
|
|
|
338
|
0
|
|
|
|
|
|
my $hmmdb = $hmmdb_for{$domain->symbol}; |
339
|
0
|
|
|
|
|
|
my $tbout = $self->_do_hmmscan($seq, $hmmdb); |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
# parse domtblout hmmer report |
342
|
0
|
|
|
|
|
|
my $report = DomTable->new( file => $tbout->filename ); |
343
|
|
|
|
|
|
|
|
344
|
0
|
|
|
|
|
|
my %subtype_for; |
345
|
|
|
|
|
|
|
my $i; |
346
|
|
|
|
|
|
|
|
347
|
0
|
|
|
|
|
|
while (my $hit = $report->next_hit) { |
348
|
|
|
|
|
|
|
|
349
|
0
|
|
|
|
|
|
$i++; |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
$subtype_for{$hit->target_name} = { # avoid doublons |
352
|
0
|
|
|
|
|
|
map { $_ => $hit->$_ } |
|
0
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
qw(target_name ali_from ali_to hmm_from hmm_to tlen) |
354
|
|
|
|
|
|
|
}; |
355
|
|
|
|
|
|
|
|
356
|
0
|
|
|
|
|
|
$subtype_for{$hit->target_name}{evalue} = $hit->i_evalue; |
357
|
0
|
|
|
|
|
|
$subtype_for{$hit->target_name}{score} = $hit->dom_score; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
|
360
|
0
|
0
|
|
|
|
|
unless (values %subtype_for) { |
361
|
0
|
|
|
|
|
|
$domain->_set_subtype('na'); |
362
|
0
|
|
|
|
|
|
$domain->_set_subtype_evalue('na'); |
363
|
0
|
|
|
|
|
|
$domain->_set_subtype_score('na'); |
364
|
0
|
|
|
|
|
|
return; |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
#TODO handle multiple hits for the same pHMM -> give many times the same value |
368
|
|
|
|
|
|
|
my @sorted_hits |
369
|
0
|
|
|
|
|
|
= sort { $subtype_for{$b}{score} <=> $subtype_for{$a}{score} } |
|
0
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
keys %subtype_for |
371
|
|
|
|
|
|
|
; |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# get best scoring subtype but also the closest ones (>= 95% than the best score) |
374
|
0
|
|
|
|
|
|
my $score_ref = $subtype_for{$sorted_hits[0]}{score}; |
375
|
|
|
|
|
|
|
my @keys |
376
|
0
|
|
|
|
|
|
= grep { $subtype_for{$_}{score} >= ($score_ref / 100) * 95 } |
|
0
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
@sorted_hits |
378
|
|
|
|
|
|
|
; |
379
|
|
|
|
|
|
|
|
380
|
0
|
|
|
|
|
|
my @values = map { $subtype_for{$_}{target_name} } @keys; |
|
0
|
|
|
|
|
|
|
381
|
0
|
|
|
|
|
|
my @evalues = map { $subtype_for{$_}{evalue} } @keys; |
|
0
|
|
|
|
|
|
|
382
|
0
|
|
|
|
|
|
my @scores = map { $subtype_for{$_}{score} } @keys; |
|
0
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
384
|
0
|
|
|
|
|
|
$domain->_set_subtype( (join '/', @values)); |
385
|
0
|
|
|
|
|
|
$domain->_set_subtype_evalue( (join '/', @evalues)); |
386
|
0
|
|
|
|
|
|
$domain->_set_subtype_score( (join '/', @scores)); |
387
|
|
|
|
|
|
|
|
388
|
0
|
|
|
|
|
|
return; |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# sub _get_docking_domains { |
392
|
|
|
|
|
|
|
# my $self = shift; |
393
|
|
|
|
|
|
|
# |
394
|
|
|
|
|
|
|
# my $seq = shift; |
395
|
|
|
|
|
|
|
# my $hmmdb = $DATA_PATH . 'docking.hmm'; |
396
|
|
|
|
|
|
|
# |
397
|
|
|
|
|
|
|
# my $tbout = $self->_do_hmmscan($seq, $hmmdb); |
398
|
|
|
|
|
|
|
# |
399
|
|
|
|
|
|
|
# # parse domtblout hmmer report |
400
|
|
|
|
|
|
|
# my $report = DomTable->new( file => $tbout->filename ); |
401
|
|
|
|
|
|
|
# |
402
|
|
|
|
|
|
|
# my %hit_for; |
403
|
|
|
|
|
|
|
# my $i; |
404
|
|
|
|
|
|
|
# |
405
|
|
|
|
|
|
|
# while (my $hit = $report->next_hit) { |
406
|
|
|
|
|
|
|
# |
407
|
|
|
|
|
|
|
# if ($ARGV{'--evalue-threshold'}) { |
408
|
|
|
|
|
|
|
# next if $hit->evalue > $ARGV{'--evalue-threshold'}; |
409
|
|
|
|
|
|
|
# } |
410
|
|
|
|
|
|
|
# $i++; |
411
|
|
|
|
|
|
|
# |
412
|
|
|
|
|
|
|
# $hit_for{ 'hit_' . $i } = { |
413
|
|
|
|
|
|
|
# map { $_ => $hit->$_ } qw(target_name ali_from ali_to hmm_from hmm_to tlen score evalue ) |
414
|
|
|
|
|
|
|
# }; |
415
|
|
|
|
|
|
|
# } |
416
|
|
|
|
|
|
|
# |
417
|
|
|
|
|
|
|
# return \%hit_for; |
418
|
|
|
|
|
|
|
# } |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
## no critic (ProhibitUnusedPrivateSubroutines) |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub _get_domain_features { |
424
|
0
|
|
|
0
|
|
|
my $self = shift; |
425
|
|
|
|
|
|
|
|
426
|
0
|
|
|
|
|
|
my $domain = shift; |
427
|
0
|
|
0
|
|
|
|
my $gene_pos = shift // 0; |
428
|
|
|
|
|
|
|
|
429
|
0
|
|
|
|
|
|
my $query = $domain->protein_sequence; |
430
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
|
my %domain_for = %{ $self->_parse_generic_domains($query) }; |
|
0
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
|
433
|
0
|
0
|
|
|
|
|
unless (%domain_for) { |
434
|
0
|
|
|
|
|
|
$domain->_set_function('to_remove'); |
435
|
0
|
|
|
|
|
|
return; |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
my $best_hit |
439
|
0
|
|
|
|
|
|
= (sort { $domain_for{$b}{score} <=> $domain_for{$a}{score} } |
|
0
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
keys %domain_for)[0] |
441
|
|
|
|
|
|
|
; |
442
|
|
|
|
|
|
|
|
443
|
0
|
|
|
|
|
|
for my $feature (keys %{ $domain_for{$best_hit} }) { |
|
0
|
|
|
|
|
|
|
444
|
0
|
|
|
|
|
|
my $_set_attr = '_set_' . $feature; |
445
|
0
|
|
|
|
|
|
$domain->$_set_attr( $domain_for{$best_hit}{$feature} ); |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
0
|
|
|
|
|
|
$self->_use_hit_information($domain, $gene_pos); |
449
|
0
|
|
|
|
|
|
return; |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
## use critic |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
sub _use_hit_information { |
456
|
0
|
|
|
0
|
|
|
my $self = shift; |
457
|
0
|
|
|
|
|
|
my $domain = shift; |
458
|
0
|
|
|
|
|
|
my $gene_pos = shift; |
459
|
|
|
|
|
|
|
|
460
|
0
|
|
|
|
|
|
$domain->_set_begin($gene_pos + $domain->ali_from); |
461
|
0
|
|
|
|
|
|
$domain->_set_end($gene_pos + $domain->ali_to); |
462
|
|
|
|
|
|
|
|
463
|
0
|
|
|
|
|
|
$domain->_set_phmm_name($domain->target_name); |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
|
466
|
0
|
0
|
0
|
|
|
|
if (! $domain->function || $domain->function =~ m/NA/xmsi) { |
467
|
0
|
|
|
|
|
|
$domain->_set_function($domain->target_name) |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
|
470
|
0
|
|
|
|
|
|
return; |
471
|
|
|
|
|
|
|
} |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
sub _parse_generic_domains { |
475
|
0
|
|
|
0
|
|
|
my $self = shift; |
476
|
|
|
|
|
|
|
|
477
|
0
|
|
|
|
|
|
my $seq = shift; |
478
|
|
|
|
|
|
|
|
479
|
0
|
|
|
|
|
|
my $hmmdb = $DATA_PATH . 'generic_domains.hmm'; |
480
|
|
|
|
|
|
|
|
481
|
0
|
|
|
|
|
|
my $tbout = $self->_do_hmmscan($seq, $hmmdb); |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# parsing of domtblout hmmscan report |
484
|
0
|
|
|
|
|
|
my $report = DomTable->new( file => $tbout->filename ); |
485
|
|
|
|
|
|
|
|
486
|
0
|
|
|
|
|
|
my $ug = Data::UUID->new; |
487
|
0
|
|
|
|
|
|
my %hit_for; |
488
|
0
|
|
|
|
|
|
my $evalue_threshold = 10e-3; |
489
|
0
|
|
|
|
|
|
my $i = 1; |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
HIT: |
492
|
0
|
|
|
|
|
|
while (my $hit = $report->next_hit) { |
493
|
|
|
|
|
|
|
|
494
|
0
|
0
|
|
|
|
|
next HIT if $hit->evalue > $evalue_threshold; |
495
|
|
|
|
|
|
|
|
496
|
0
|
|
|
|
|
|
my $ui = $ug->create_str; |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
$hit_for{$ui} = { |
499
|
0
|
|
|
|
|
|
map { $_ => $hit->$_ } |
|
0
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
qw(query_name target_name ali_from ali_to hmm_from hmm_to tlen qlen) |
501
|
|
|
|
|
|
|
}; |
502
|
|
|
|
|
|
|
|
503
|
0
|
|
|
|
|
|
$hit_for{$ui}{score} = $hit->dom_score; |
504
|
0
|
|
|
|
|
|
$hit_for{$ui}{evalue} = $hit->i_evalue; # i-value = independent evalue ('evalue' return cumulative evalues for the sequence, and c-evalue may be deleted because potentially e-value) |
505
|
0
|
|
|
|
|
|
$i++; |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
0
|
|
|
|
|
|
return \%hit_for; |
509
|
|
|
|
|
|
|
} |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
sub _do_hmmscan { |
512
|
0
|
|
|
0
|
|
|
my $self = shift; |
513
|
0
|
|
|
|
|
|
my $seq = shift; |
514
|
0
|
|
|
|
|
|
my $hmmdb = shift; |
515
|
|
|
|
|
|
|
|
516
|
0
|
|
|
|
|
|
my $query = File::Temp->new( |
517
|
|
|
|
|
|
|
template => 'tempfile_XXXXXXXXXXXXXXX', |
518
|
|
|
|
|
|
|
suffix => '.faa', |
519
|
|
|
|
|
|
|
unlock => 1 |
520
|
|
|
|
|
|
|
); |
521
|
|
|
|
|
|
|
|
522
|
0
|
|
|
|
|
|
print $query '>query' . "\n" . $seq; |
523
|
|
|
|
|
|
|
|
524
|
0
|
|
|
|
|
|
my $pgm = 'hmmscan'; |
525
|
|
|
|
|
|
|
|
526
|
0
|
|
|
|
|
|
my $cpu_n = 1; |
527
|
0
|
|
|
|
|
|
my $tbout = File::Temp->new(suffix => '_domtblout.tsv'); |
528
|
0
|
|
|
|
|
|
my $opt = ' --domtblout ' . $tbout . ' --cpu ' . $cpu_n; |
529
|
0
|
|
|
|
|
|
my $log = File::Temp->new(suffix => '_hmmscan.log', unlock => 1); |
530
|
|
|
|
|
|
|
|
531
|
0
|
|
|
|
|
|
my $cmd = "$pgm $opt $hmmdb $query > $log"; |
532
|
0
|
|
|
|
|
|
system $cmd; |
533
|
|
|
|
|
|
|
|
534
|
0
|
|
|
|
|
|
return $tbout; |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
|
537
|
1
|
|
|
1
|
|
2491
|
no Moose::Role; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
6
|
|
538
|
|
|
|
|
|
|
1; |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
__END__ |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=pod |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head1 NAME |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Bio::Palantir::Roles::Fillable - Fillable Moose role for the construction of DomainPlus object arrays and Exploratory methods |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=head1 VERSION |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
version 0.200700 |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=head1 SYNOPSIS |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
# TODO |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=head1 DESCRIPTION |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# TODO |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=head1 AUTHOR |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
Loic MEUNIER <lmeunier@uliege.be> |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
This software is copyright (c) 2019 by University of Liege / Unit of Eukaryotic Phylogenomics / Loic MEUNIER and Denis BAURAIN. |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
569
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=cut |