| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::Palantir::Roles::Fillable; |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Fillable Moose role for the construction of DomainPlus object arrays and Exploratory methods |
|
3
|
|
|
|
|
|
|
$Bio::Palantir::Roles::Fillable::VERSION = '0.211420'; |
|
4
|
1
|
|
|
1
|
|
812
|
use Moose::Role; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
9
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
5864
|
use autodie; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
11
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
5649
|
use Const::Fast; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
8
|
|
|
9
|
1
|
|
|
1
|
|
679
|
use File::ShareDir qw(dist_dir); |
|
|
1
|
|
|
|
|
25295
|
|
|
|
1
|
|
|
|
|
64
|
|
|
10
|
1
|
|
|
1
|
|
10
|
use File::Temp; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
98
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
1
|
|
|
1
|
|
10
|
use aliased 'Bio::FastParsers::Hmmer::DomTable'; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
12
|
|
|
13
|
1
|
|
|
1
|
|
236303
|
use aliased 'Bio::Palantir::Refiner::DomainPlus'; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
7
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
const my $DATA_PATH => dist_dir('Bio-Palantir') . '/'; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# pHMM database sources: |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
# Weber T, Blin K, Duddela S, et al. antiSMASH 3.0-a comprehensive resource for |
|
21
|
|
|
|
|
|
|
# the genome mining of biosynthetic gene clusters. Nucleic Acids Res. |
|
22
|
|
|
|
|
|
|
# 2015;43(W1):W237–W243. doi:10.1093/nar/gkv437 |
|
23
|
|
|
|
|
|
|
# |
|
24
|
|
|
|
|
|
|
# Khayatt, B. I., Overmars, L., Siezen, R. J., & Francke, C. (2013). |
|
25
|
|
|
|
|
|
|
# Classification of the Adenylation and Acyl-Transferase Activity of NRPS and |
|
26
|
|
|
|
|
|
|
# PKS Systems Using Ensembles of Substrate Specific Hidden Markov Models. |
|
27
|
|
|
|
|
|
|
# PLoS ONE, 8(4). http://doi.org/10.1371/journal.pone.0062136 |
|
28
|
|
|
|
|
|
|
# |
|
29
|
|
|
|
|
|
|
# Bushley, K. E., & Turgeon, B. G. (2010). Phylogenomics reveals subfamilies of |
|
30
|
|
|
|
|
|
|
# fungal nonribosomal peptide synthetases and their evolutionary relationships. |
|
31
|
|
|
|
|
|
|
# BMC Evolutionary Biology, 10(1), 26. http://doi.org/10.1186/1471-2148-10-26 |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# biological data |
|
34
|
|
|
|
|
|
|
my %ADJUSTMENT_FOR = ( # length column useless: this information can be obtained by OO variable |
|
35
|
|
|
|
|
|
|
# for antismash activity |
|
36
|
|
|
|
|
|
|
A => { start => 20, end => 100, length => 418 }, |
|
37
|
|
|
|
|
|
|
Red => { start => 0, end => 0 , length => 242 }, # equivalent of TD |
|
38
|
|
|
|
|
|
|
# NRPS |
|
39
|
|
|
|
|
|
|
Condensation => { start => 0, end => 150, length => 300 }, |
|
40
|
|
|
|
|
|
|
Condensation_LCL => { start => 0, end => 154, length => 296 }, |
|
41
|
|
|
|
|
|
|
Condensation_DCL => { start => 0, end => 150, length => 300 }, |
|
42
|
|
|
|
|
|
|
Condensation_Starter => { start => 0, end => 150, length => 300 }, |
|
43
|
|
|
|
|
|
|
Condensation_Dual => { start => 0, end => 157, length => 293 }, |
|
44
|
|
|
|
|
|
|
Epimerization => { start => 0, end => 133, length => 317 }, |
|
45
|
|
|
|
|
|
|
Heterocyclization => { start => 0, end => 150, length => 300 }, |
|
46
|
|
|
|
|
|
|
X => { start => 0, end => 147, length => 303 }, |
|
47
|
|
|
|
|
|
|
Cglyc => { start => 0, end => 151, length => 299 }, |
|
48
|
|
|
|
|
|
|
Condensation_fungi => { start => 0, end => 251, length => 209 }, # où faut-il ajuster ? |
|
49
|
|
|
|
|
|
|
'AMP-binding' => { start => 27, end => 66, length => 418 }, |
|
50
|
|
|
|
|
|
|
'AMP-binding_fungi' => { start => 4, end => 0 , length => 507 }, |
|
51
|
|
|
|
|
|
|
ACPS => { start => 0, end => 0 , length => 78 }, |
|
52
|
|
|
|
|
|
|
Aminotran_1_2 => { start => 0, end => 0 , length => 363 }, |
|
53
|
|
|
|
|
|
|
Aminotran_3 => { start => 0, end => 0 , length => 339 }, |
|
54
|
|
|
|
|
|
|
Aminotran_4 => { start => 0, end => 0 , length => 232 }, |
|
55
|
|
|
|
|
|
|
Aminotran_5 => { start => 0, end => 0 , length => 371 }, |
|
56
|
|
|
|
|
|
|
'A-OX' => { start => 0, end => -250,length => 780 }, # it overlaps following domains |
|
57
|
|
|
|
|
|
|
B => { start => 0, end => 0 , length => 365 }, |
|
58
|
|
|
|
|
|
|
cMT => { start => 0, end => 0 , length => 230 }, |
|
59
|
|
|
|
|
|
|
ECH => { start => 0, end => 0 , length => 245 }, |
|
60
|
|
|
|
|
|
|
Epimerization => { start => 0, end => 0 , length => 317 }, |
|
61
|
|
|
|
|
|
|
F => { start => 0, end => 0 , length => 127 }, |
|
62
|
|
|
|
|
|
|
Fkbh => { start => 0, end => 0 , length => 142 }, |
|
63
|
|
|
|
|
|
|
GNAT => { start => 0, end => 0 , length => 139 }, |
|
64
|
|
|
|
|
|
|
Hal => { start => 0, end => 0 , length => 222 }, |
|
65
|
|
|
|
|
|
|
Heterocyclization => { start => 0, end => 0 , length => 300 }, |
|
66
|
|
|
|
|
|
|
NAD_binding_4 => { start => 0, end => 0 , length => 249 }, |
|
67
|
|
|
|
|
|
|
nMT => { start => 0, end => 0 , length => 244 }, |
|
68
|
|
|
|
|
|
|
'NRPS-COM_Cterm' => { start => 0, end => 0 , length => 21 }, |
|
69
|
|
|
|
|
|
|
'NRPS-COM_Nterm' => { start => 0, end => 0 , length => 33 }, |
|
70
|
|
|
|
|
|
|
oMT => { start => 0, end => 0 , length => 280 }, |
|
71
|
|
|
|
|
|
|
PCP => { start => 15, end => 0 , length => 70 }, |
|
72
|
|
|
|
|
|
|
'PP-binding' => { start => 15, end => 0 , length => 70 }, |
|
73
|
|
|
|
|
|
|
PCP_fungi => { start => 0, end => 0 , length => 86 }, |
|
74
|
|
|
|
|
|
|
PS => { start => 0, end => 0 , length => 214 }, |
|
75
|
|
|
|
|
|
|
TD => { start => 0, end => 0 , length => 242 }, |
|
76
|
|
|
|
|
|
|
Thioesterase => { start => 0, end => 0 , length => 229 }, |
|
77
|
|
|
|
|
|
|
# PKS |
|
78
|
|
|
|
|
|
|
PKS_KS => { start => 8, end => 0, length => 426 }, |
|
79
|
|
|
|
|
|
|
PKS_KR => { start => 0, end => 0, length => 185 }, |
|
80
|
|
|
|
|
|
|
PKS_ER => { start => 0, end => 0, length => 313 }, |
|
81
|
|
|
|
|
|
|
PKS_DH => { start => 0, end => 0, length => 166 }, |
|
82
|
|
|
|
|
|
|
PKS_DH2 => { start => 0, end => 0, length => 233 }, |
|
83
|
|
|
|
|
|
|
PKS_DHt => { start => 0, end => 0, length => 236 }, |
|
84
|
|
|
|
|
|
|
PKS_AT => { start => 114, end => 22, length => 298 }, |
|
85
|
|
|
|
|
|
|
Polyketide_cyc => { start => 0, end => 0, length => 130 }, |
|
86
|
|
|
|
|
|
|
Polyketide_cyc2 => { start => 0, end => 0, length => 139 }, |
|
87
|
|
|
|
|
|
|
PKS_Docking_Nterm => { start => 0, end => 0, length => 28 }, |
|
88
|
|
|
|
|
|
|
PKS_Docking_Cterm => { start => 0, end => 0, length => 73 }, |
|
89
|
|
|
|
|
|
|
CAL_domain => { start => 0, end => 0, length => 465 }, |
|
90
|
|
|
|
|
|
|
'Trans-AT_docking' => { start => 0, end => 0, length => 155 }, |
|
91
|
|
|
|
|
|
|
ACP => { start => 0, end => 0, length => 73 }, |
|
92
|
|
|
|
|
|
|
ACP_beta => { start => 0, end => 0, length => 67 }, |
|
93
|
|
|
|
|
|
|
); |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# public methods |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub detect_domains { ## no critic (RequireArgUnpacking) |
|
99
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
100
|
|
|
|
|
|
|
|
|
101
|
0
|
|
|
|
|
|
my ($seq, $gene_pos, $gap_coords, $from_seq) = @_; #TODO switch to hash |
|
102
|
|
|
|
|
|
|
|
|
103
|
0
|
|
|
|
|
|
my %hit_for = %{ $self->_parse_generic_domains($seq) }; |
|
|
0
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# build DomainPlus objects |
|
106
|
0
|
|
|
|
|
|
my @domains; |
|
107
|
0
|
|
|
|
|
|
for my $hit (keys %hit_for) { |
|
108
|
|
|
|
|
|
|
push @domains, DomainPlus->new( |
|
109
|
0
|
|
|
|
|
|
map { $_ => $hit_for{$hit}{$_} } keys %{ $hit_for{$hit} } |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
); |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
0
|
|
|
|
|
|
$_->_set_protein_sequence($seq) for @domains; |
|
114
|
|
|
|
|
|
|
|
|
115
|
0
|
|
|
|
|
|
$self->_use_hit_information($_, $gene_pos) for @domains; |
|
116
|
|
|
|
|
|
|
|
|
117
|
0
|
|
|
|
|
|
$self->_elongate_coordinates(\@domains, $gap_coords); |
|
118
|
0
|
|
|
|
|
|
@domains = $self->_handle_overlaps($from_seq, @domains); |
|
119
|
0
|
|
|
|
|
|
$self->_refine_coordinates(@domains); |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# subtype the domains |
|
122
|
0
|
|
|
|
|
|
$self->_get_domain_subtype($_) for @domains; |
|
123
|
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
|
return (@domains); |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# private methods |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub _elongate_coordinates { |
|
130
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
131
|
|
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
my $domains = shift; |
|
133
|
0
|
|
|
|
|
|
my $gap_coords = shift; |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# adjustment of domain coordinates to get plausible domain sizes |
|
136
|
0
|
|
|
|
|
|
for my $domain (@{ $domains }) { |
|
|
0
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# begin at profile start even if does not match the profile, and adjust the coords depending of the domain nature |
|
139
|
0
|
|
|
|
|
|
my $start = $domain->begin - $domain->hmm_from; # complete the domain if the phmm didnt match entirely the sequence |
|
140
|
0
|
|
|
|
|
|
my $end = $domain->end + ($domain->tlen - $domain->hmm_to); |
|
141
|
|
|
|
|
|
|
|
|
142
|
0
|
0
|
|
|
|
|
if ($ADJUSTMENT_FOR{$domain->target_name}) { |
|
143
|
0
|
|
0
|
|
|
|
$start -= $ADJUSTMENT_FOR{$domain->target_name}{start} // 0; |
|
144
|
0
|
|
0
|
|
|
|
$end += $ADJUSTMENT_FOR{$domain->target_name}{end} // 0; |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# handle truncated domains |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# check strain orientation |
|
150
|
0
|
|
|
|
|
|
my ($gene_begin, $gene_end); |
|
151
|
0
|
0
|
|
|
|
|
if ($self->gene_begin > $self->gene_end) { |
|
152
|
0
|
|
|
|
|
|
$gene_begin = $self->gene_end; |
|
153
|
0
|
|
|
|
|
|
$gene_end = $self->gene_begin; |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
else { |
|
157
|
0
|
|
|
|
|
|
$gene_begin = $self->gene_begin; |
|
158
|
0
|
|
|
|
|
|
$gene_end = $self->gene_end; |
|
159
|
|
|
|
|
|
|
} |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# standard elongation checks |
|
162
|
0
|
0
|
|
|
|
|
unless ($gap_coords) { |
|
163
|
0
|
0
|
|
|
|
|
$start = 1 if $start <= 0; # as we do not use -1 position before substr() |
|
164
|
0
|
0
|
|
|
|
|
$end = $gene_end if $end > $gene_end; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# gap filling elongation check |
|
168
|
|
|
|
|
|
|
else { |
|
169
|
0
|
0
|
|
|
|
|
$start = $gap_coords->[0] if $start < $gap_coords->[0]; |
|
170
|
0
|
0
|
|
|
|
|
$end = ($gap_coords->[1] - 1) if $end >= $gap_coords->[1]; |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
|
|
173
|
0
|
|
|
|
|
|
my $size = $end - $start + 1; |
|
174
|
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
|
$domain->_set_begin($start); |
|
176
|
0
|
|
|
|
|
|
$domain->_set_end($end); |
|
177
|
0
|
|
|
|
|
|
$domain->_set_size($size); |
|
178
|
0
|
|
|
|
|
|
$domain->_set_coordinates( [$start, $end] ); |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
0
|
|
|
|
|
|
return; |
|
182
|
|
|
|
|
|
|
} |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
sub _handle_overlaps { ## no critic (RequireArgUnpacking) |
|
185
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
186
|
|
|
|
|
|
|
|
|
187
|
0
|
|
0
|
|
|
|
my $from_seq = shift // 0; |
|
188
|
0
|
|
|
|
|
|
my @domains = @_; |
|
189
|
|
|
|
|
|
|
my @sorted_domains |
|
190
|
0
|
0
|
|
|
|
|
= sort { $b->score <=> $a->{score} || $a->begin <=> $b->begin } |
|
|
0
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
@domains |
|
192
|
|
|
|
|
|
|
; |
|
193
|
|
|
|
|
|
|
|
|
194
|
0
|
|
|
|
|
|
my %deja_vu; |
|
195
|
|
|
|
|
|
|
my @non_overlapping_domains; |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
REF: |
|
198
|
0
|
|
|
|
|
|
for my $ref_hit ( 0..(@sorted_domains - 1) ) { |
|
199
|
|
|
|
|
|
|
|
|
200
|
0
|
0
|
|
|
|
|
next REF if $deja_vu{$ref_hit}; |
|
201
|
|
|
|
|
|
|
|
|
202
|
0
|
|
|
|
|
|
my ($x1, $y1) = map { $sorted_domains[$ref_hit]->$_ } qw(begin end); |
|
|
0
|
|
|
|
|
|
|
|
203
|
0
|
|
|
|
|
|
$deja_vu{$ref_hit} = 1; |
|
204
|
|
|
|
|
|
|
|
|
205
|
0
|
|
|
|
|
|
my @overlaps = $ref_hit; |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
CMP: |
|
208
|
0
|
|
|
|
|
|
for my $cmp_hit ( 0..(@sorted_domains - 1) ) { |
|
209
|
|
|
|
|
|
|
|
|
210
|
0
|
0
|
|
|
|
|
next CMP if $deja_vu{$cmp_hit}; |
|
211
|
|
|
|
|
|
|
|
|
212
|
0
|
|
|
|
|
|
my ($x2, $y2) = map { $sorted_domains[$cmp_hit]->$_ } qw(begin end); |
|
|
0
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
|
|
214
|
0
|
0
|
0
|
|
|
|
unless ($x2 > $y1 || $y2 < $x1) { ## no critic (ProhibitNegativeExpressionsInUnlessAndUntilConditions) |
|
215
|
|
|
|
|
|
|
|
|
216
|
0
|
|
|
|
|
|
push @overlaps, $cmp_hit; |
|
217
|
|
|
|
|
|
|
|
|
218
|
0
|
0
|
|
|
|
|
if ($from_seq == 1) { |
|
219
|
0
|
0
|
|
|
|
|
$deja_vu{$cmp_hit} = 1 |
|
220
|
|
|
|
|
|
|
if $sorted_domains[$cmp_hit]->class |
|
221
|
|
|
|
|
|
|
eq $sorted_domains[$ref_hit]->class |
|
222
|
|
|
|
|
|
|
; # avoid superposition of domain of the same class |
|
223
|
|
|
|
|
|
|
} |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
else { |
|
226
|
0
|
|
|
|
|
|
$deja_vu{$cmp_hit} = 1; |
|
227
|
|
|
|
|
|
|
} |
|
228
|
|
|
|
|
|
|
} |
|
229
|
|
|
|
|
|
|
} |
|
230
|
|
|
|
|
|
|
|
|
231
|
0
|
|
|
|
|
|
push @non_overlapping_domains, $sorted_domains[ $overlaps[0] ]; |
|
232
|
|
|
|
|
|
|
} |
|
233
|
|
|
|
|
|
|
|
|
234
|
0
|
|
|
|
|
|
return(@non_overlapping_domains); |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
sub _refine_coordinates { ## no critic (RequireArgUnpacking) |
|
238
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
239
|
|
|
|
|
|
|
|
|
240
|
0
|
|
|
|
|
|
my @domains = @_; |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# final refinment of coordinates |
|
243
|
0
|
|
|
|
|
|
my @sorted_domains = sort { $a->begin <=> $b->begin } @domains; |
|
|
0
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
|
|
245
|
0
|
|
|
|
|
|
for my $i (0..(scalar @sorted_domains - 2)) { |
|
246
|
|
|
|
|
|
|
|
|
247
|
0
|
0
|
|
|
|
|
if ($sorted_domains[$i]->end >= $sorted_domains[$i + 1]->begin) { # si le hit recouvre la fin du hit précént |
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
my $adjust_start |
|
250
|
|
|
|
|
|
|
= $ADJUSTMENT_FOR{ $sorted_domains[$i + 1]->target_name }{start} |
|
251
|
0
|
|
0
|
|
|
|
// 0 |
|
252
|
|
|
|
|
|
|
; |
|
253
|
|
|
|
|
|
|
|
|
254
|
0
|
0
|
|
|
|
|
if ($adjust_start > 0) { # si ajustement start domaine suivant |
|
255
|
|
|
|
|
|
|
|
|
256
|
0
|
0
|
|
|
|
|
if ( ($sorted_domains[$i + 1]->begin + $adjust_start) |
|
257
|
|
|
|
|
|
|
> $sorted_domains[$i]->end) { # si conserver une partie de l'ajustement est possible |
|
258
|
|
|
|
|
|
|
# et si la partie qui empiète le hit précédent est uen région ajoutée artificiellement --> réduire la taille de la région ajoutée (jusqu'au pHMM tout au plus |
|
259
|
|
|
|
|
|
|
# il vaut mieux commencer par éliminer les parties ajoutées en N-ter d'abord, car elles sont assez limitées. |
|
260
|
0
|
|
|
|
|
|
$sorted_domains[$i + 1]->_set_begin( |
|
261
|
|
|
|
|
|
|
$sorted_domains[$i]->end + 1); |
|
262
|
|
|
|
|
|
|
} |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
else { # sinon déduire ajustement start sans endommager les coordonées initiales du domaine |
|
265
|
0
|
|
|
|
|
|
$sorted_domains[$i + 1]->_set_begin( |
|
266
|
|
|
|
|
|
|
$sorted_domains[$i + 1]->begin + $adjust_start); |
|
267
|
|
|
|
|
|
|
} |
|
268
|
|
|
|
|
|
|
} |
|
269
|
|
|
|
|
|
|
} |
|
270
|
|
|
|
|
|
|
|
|
271
|
0
|
0
|
|
|
|
|
if ($sorted_domains[$i]->end >= $sorted_domains[$i + 1]->begin) { # si cela ne résoud pas le problème, on peut réduire la taille de la région ajoutée C-ter du domaine précédent (jusqu'au pHMM tout au plus |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
my $adjust_end |
|
274
|
|
|
|
|
|
|
= $ADJUSTMENT_FOR{ $sorted_domains[$i]->target_name }{end} |
|
275
|
0
|
|
0
|
|
|
|
// 0 |
|
276
|
|
|
|
|
|
|
; |
|
277
|
|
|
|
|
|
|
|
|
278
|
0
|
0
|
|
|
|
|
if ($adjust_end > 0) { # si ajustement end |
|
279
|
|
|
|
|
|
|
|
|
280
|
0
|
0
|
|
|
|
|
if ( ($sorted_domains[$i]->end - $adjust_end) |
|
281
|
|
|
|
|
|
|
< $sorted_domains[$i + 1]->begin) { # si conserver une partie de l'ajustement est possible |
|
282
|
|
|
|
|
|
|
|
|
283
|
0
|
|
|
|
|
|
$sorted_domains[$i]->_set_end( |
|
284
|
|
|
|
|
|
|
$sorted_domains[$i + 1]->begin - 1); |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
else { # sinon déduire ajustement end sans endommager les coordonées initiales du domaine |
|
288
|
|
|
|
|
|
|
|
|
289
|
0
|
|
|
|
|
|
$sorted_domains[$i]->_set_end($sorted_domains[$i]->end - $adjust_end); |
|
290
|
|
|
|
|
|
|
} |
|
291
|
|
|
|
|
|
|
} |
|
292
|
|
|
|
|
|
|
# ne rien faire si les parties qui s'empiètent font partie du pHMM --> comment pourrait-on les distinguer ? |
|
293
|
|
|
|
|
|
|
} |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# attribute refined sequence |
|
297
|
0
|
|
|
|
|
|
for my $domain (@domains) { |
|
298
|
0
|
|
|
|
|
|
$domain->_set_size($domain->end - $domain->begin + 1); |
|
299
|
|
|
|
|
|
|
|
|
300
|
0
|
|
|
|
|
|
my ($seq) = $self->protein_sequence; # list context |
|
301
|
0
|
|
|
|
|
|
$domain->_set_protein_sequence( |
|
302
|
|
|
|
|
|
|
substr($seq, $domain->begin - 1, $domain->size) ); |
|
303
|
|
|
|
|
|
|
} |
|
304
|
|
|
|
|
|
|
|
|
305
|
0
|
|
|
|
|
|
return; |
|
306
|
|
|
|
|
|
|
} |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
sub _get_domain_subtype { |
|
309
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
310
|
|
|
|
|
|
|
|
|
311
|
0
|
|
|
|
|
|
my $domain = shift; |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# search for A & AT substrate specificity and C & KS subtypes |
|
314
|
0
|
|
|
|
|
|
my %hmmdb_for = ( |
|
315
|
|
|
|
|
|
|
A => $DATA_PATH . 'A_specificity.hmm', |
|
316
|
|
|
|
|
|
|
AT => $DATA_PATH . 'AT_specificity.hmm', |
|
317
|
|
|
|
|
|
|
C => $DATA_PATH . 'C_subtypes.hmm', |
|
318
|
|
|
|
|
|
|
KS => $DATA_PATH . 'KS_subtypes.hmm', |
|
319
|
|
|
|
|
|
|
); |
|
320
|
|
|
|
|
|
|
|
|
321
|
0
|
0
|
|
|
|
|
unless ($hmmdb_for{$domain->symbol}) { |
|
322
|
|
|
|
|
|
|
|
|
323
|
0
|
0
|
|
|
|
|
if ($domain->function =~ m/MT | ^Amt$ | Aminotran/xms) { |
|
324
|
0
|
|
|
|
|
|
$domain->_set_subtype($domain->target_name); |
|
325
|
|
|
|
|
|
|
} |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
else { |
|
328
|
0
|
|
|
|
|
|
$domain->_set_subtype('NULL'); |
|
329
|
|
|
|
|
|
|
} |
|
330
|
|
|
|
|
|
|
|
|
331
|
0
|
|
|
|
|
|
$domain->_set_subtype_evalue('NULL'); |
|
332
|
0
|
|
|
|
|
|
$domain->_set_subtype_score('NULL'); |
|
333
|
0
|
|
|
|
|
|
return; |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
|
|
336
|
0
|
|
|
|
|
|
my $seq = $domain->protein_sequence; |
|
337
|
|
|
|
|
|
|
|
|
338
|
0
|
|
|
|
|
|
my $hmmdb = $hmmdb_for{$domain->symbol}; |
|
339
|
0
|
|
|
|
|
|
my $tbout = $self->_do_hmmscan($seq, $hmmdb); |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
# parse domtblout hmmer report |
|
342
|
0
|
|
|
|
|
|
my $report = DomTable->new( file => $tbout->filename ); |
|
343
|
|
|
|
|
|
|
|
|
344
|
0
|
|
|
|
|
|
my %subtype_for; |
|
345
|
|
|
|
|
|
|
my $i; |
|
346
|
|
|
|
|
|
|
|
|
347
|
0
|
|
|
|
|
|
while (my $hit = $report->next_hit) { |
|
348
|
|
|
|
|
|
|
|
|
349
|
0
|
|
|
|
|
|
$i++; |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
$subtype_for{$hit->target_name} = { # avoid doublons |
|
352
|
0
|
|
|
|
|
|
map { $_ => $hit->$_ } |
|
|
0
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
qw(target_name ali_from ali_to hmm_from hmm_to tlen) |
|
354
|
|
|
|
|
|
|
}; |
|
355
|
|
|
|
|
|
|
|
|
356
|
0
|
|
|
|
|
|
$subtype_for{$hit->target_name}{evalue} = $hit->i_evalue; |
|
357
|
0
|
|
|
|
|
|
$subtype_for{$hit->target_name}{score} = $hit->dom_score; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
|
|
360
|
0
|
0
|
|
|
|
|
unless (values %subtype_for) { |
|
361
|
0
|
|
|
|
|
|
$domain->_set_subtype('na'); |
|
362
|
0
|
|
|
|
|
|
$domain->_set_subtype_evalue('na'); |
|
363
|
0
|
|
|
|
|
|
$domain->_set_subtype_score('na'); |
|
364
|
0
|
|
|
|
|
|
return; |
|
365
|
|
|
|
|
|
|
} |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
#TODO handle multiple hits for the same pHMM -> give many times the same value |
|
368
|
|
|
|
|
|
|
my @sorted_hits |
|
369
|
0
|
|
|
|
|
|
= sort { $subtype_for{$b}{score} <=> $subtype_for{$a}{score} } |
|
|
0
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
keys %subtype_for |
|
371
|
|
|
|
|
|
|
; |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# get best scoring subtype but also the closest ones (>= 95% than the best score) |
|
374
|
0
|
|
|
|
|
|
my $score_ref = $subtype_for{$sorted_hits[0]}{score}; |
|
375
|
|
|
|
|
|
|
my @keys |
|
376
|
0
|
|
|
|
|
|
= grep { $subtype_for{$_}{score} >= ($score_ref / 100) * 95 } |
|
|
0
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
@sorted_hits |
|
378
|
|
|
|
|
|
|
; |
|
379
|
|
|
|
|
|
|
|
|
380
|
0
|
|
|
|
|
|
my @values = map { $subtype_for{$_}{target_name} } @keys; |
|
|
0
|
|
|
|
|
|
|
|
381
|
0
|
|
|
|
|
|
my @evalues = map { $subtype_for{$_}{evalue} } @keys; |
|
|
0
|
|
|
|
|
|
|
|
382
|
0
|
|
|
|
|
|
my @scores = map { $subtype_for{$_}{score} } @keys; |
|
|
0
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
|
|
384
|
0
|
|
|
|
|
|
$domain->_set_subtype( (join '/', @values)); |
|
385
|
0
|
|
|
|
|
|
$domain->_set_subtype_evalue( (join '/', @evalues)); |
|
386
|
0
|
|
|
|
|
|
$domain->_set_subtype_score( (join '/', @scores)); |
|
387
|
|
|
|
|
|
|
|
|
388
|
0
|
|
|
|
|
|
return; |
|
389
|
|
|
|
|
|
|
} |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# sub _get_docking_domains { |
|
392
|
|
|
|
|
|
|
# my $self = shift; |
|
393
|
|
|
|
|
|
|
# |
|
394
|
|
|
|
|
|
|
# my $seq = shift; |
|
395
|
|
|
|
|
|
|
# my $hmmdb = $DATA_PATH . 'docking.hmm'; |
|
396
|
|
|
|
|
|
|
# |
|
397
|
|
|
|
|
|
|
# my $tbout = $self->_do_hmmscan($seq, $hmmdb); |
|
398
|
|
|
|
|
|
|
# |
|
399
|
|
|
|
|
|
|
# # parse domtblout hmmer report |
|
400
|
|
|
|
|
|
|
# my $report = DomTable->new( file => $tbout->filename ); |
|
401
|
|
|
|
|
|
|
# |
|
402
|
|
|
|
|
|
|
# my %hit_for; |
|
403
|
|
|
|
|
|
|
# my $i; |
|
404
|
|
|
|
|
|
|
# |
|
405
|
|
|
|
|
|
|
# while (my $hit = $report->next_hit) { |
|
406
|
|
|
|
|
|
|
# |
|
407
|
|
|
|
|
|
|
# if ($ARGV{'--evalue-threshold'}) { |
|
408
|
|
|
|
|
|
|
# next if $hit->evalue > $ARGV{'--evalue-threshold'}; |
|
409
|
|
|
|
|
|
|
# } |
|
410
|
|
|
|
|
|
|
# $i++; |
|
411
|
|
|
|
|
|
|
# |
|
412
|
|
|
|
|
|
|
# $hit_for{ 'hit_' . $i } = { |
|
413
|
|
|
|
|
|
|
# map { $_ => $hit->$_ } qw(target_name ali_from ali_to hmm_from hmm_to tlen score evalue ) |
|
414
|
|
|
|
|
|
|
# }; |
|
415
|
|
|
|
|
|
|
# } |
|
416
|
|
|
|
|
|
|
# |
|
417
|
|
|
|
|
|
|
# return \%hit_for; |
|
418
|
|
|
|
|
|
|
# } |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
## no critic (ProhibitUnusedPrivateSubroutines) |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub _get_domain_features { |
|
424
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
425
|
|
|
|
|
|
|
|
|
426
|
0
|
|
|
|
|
|
my $domain = shift; |
|
427
|
0
|
|
0
|
|
|
|
my $gene_pos = shift // 0; |
|
428
|
|
|
|
|
|
|
|
|
429
|
0
|
|
|
|
|
|
my $query = $domain->protein_sequence; |
|
430
|
|
|
|
|
|
|
|
|
431
|
0
|
|
|
|
|
|
my %domain_for = %{ $self->_parse_generic_domains($query) }; |
|
|
0
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
|
|
433
|
0
|
0
|
|
|
|
|
unless (%domain_for) { |
|
434
|
0
|
|
|
|
|
|
$domain->_set_function('to_remove'); |
|
435
|
0
|
|
|
|
|
|
return; |
|
436
|
|
|
|
|
|
|
} |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
my $best_hit |
|
439
|
0
|
|
|
|
|
|
= (sort { $domain_for{$b}{score} <=> $domain_for{$a}{score} } |
|
|
0
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
keys %domain_for)[0] |
|
441
|
|
|
|
|
|
|
; |
|
442
|
|
|
|
|
|
|
|
|
443
|
0
|
|
|
|
|
|
for my $feature (keys %{ $domain_for{$best_hit} }) { |
|
|
0
|
|
|
|
|
|
|
|
444
|
0
|
|
|
|
|
|
my $_set_attr = '_set_' . $feature; |
|
445
|
0
|
|
|
|
|
|
$domain->$_set_attr( $domain_for{$best_hit}{$feature} ); |
|
446
|
|
|
|
|
|
|
} |
|
447
|
|
|
|
|
|
|
|
|
448
|
0
|
|
|
|
|
|
$self->_use_hit_information($domain, $gene_pos); |
|
449
|
0
|
|
|
|
|
|
return; |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
## use critic |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
sub _use_hit_information { |
|
456
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
457
|
0
|
|
|
|
|
|
my $domain = shift; |
|
458
|
0
|
|
|
|
|
|
my $gene_pos = shift; |
|
459
|
|
|
|
|
|
|
|
|
460
|
0
|
|
|
|
|
|
$domain->_set_begin($gene_pos + $domain->ali_from); |
|
461
|
0
|
|
|
|
|
|
$domain->_set_end($gene_pos + $domain->ali_to); |
|
462
|
|
|
|
|
|
|
|
|
463
|
0
|
|
|
|
|
|
$domain->_set_phmm_name($domain->target_name); |
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
|
|
466
|
0
|
0
|
0
|
|
|
|
if (! $domain->function || $domain->function =~ m/NA/xmsi) { |
|
467
|
0
|
|
|
|
|
|
$domain->_set_function($domain->target_name) |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
0
|
|
|
|
|
|
return; |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
sub _parse_generic_domains { |
|
475
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
476
|
|
|
|
|
|
|
|
|
477
|
0
|
|
|
|
|
|
my $seq = shift; |
|
478
|
|
|
|
|
|
|
|
|
479
|
0
|
|
|
|
|
|
my $hmmdb = $DATA_PATH . 'generic_domains.hmm'; |
|
480
|
|
|
|
|
|
|
|
|
481
|
0
|
|
|
|
|
|
my $tbout = $self->_do_hmmscan($seq, $hmmdb); |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# parsing of domtblout hmmscan report |
|
484
|
0
|
|
|
|
|
|
my $report = DomTable->new( file => $tbout->filename ); |
|
485
|
|
|
|
|
|
|
|
|
486
|
0
|
|
|
|
|
|
my $ug = Data::UUID->new; |
|
487
|
0
|
|
|
|
|
|
my %hit_for; |
|
488
|
0
|
|
|
|
|
|
my $evalue_threshold = 10e-3; |
|
489
|
0
|
|
|
|
|
|
my $i = 1; |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
HIT: |
|
492
|
0
|
|
|
|
|
|
while (my $hit = $report->next_hit) { |
|
493
|
|
|
|
|
|
|
|
|
494
|
0
|
0
|
|
|
|
|
next HIT if $hit->evalue > $evalue_threshold; |
|
495
|
|
|
|
|
|
|
|
|
496
|
0
|
|
|
|
|
|
my $ui = $ug->create_str; |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
$hit_for{$ui} = { |
|
499
|
0
|
|
|
|
|
|
map { $_ => $hit->$_ } |
|
|
0
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
qw(query_name target_name ali_from ali_to hmm_from hmm_to tlen qlen) |
|
501
|
|
|
|
|
|
|
}; |
|
502
|
|
|
|
|
|
|
|
|
503
|
0
|
|
|
|
|
|
$hit_for{$ui}{score} = $hit->dom_score; |
|
504
|
0
|
|
|
|
|
|
$hit_for{$ui}{evalue} = $hit->i_evalue; # i-value = independent evalue ('evalue' return cumulative evalues for the sequence, and c-evalue may be deleted because potentially e-value) |
|
505
|
0
|
|
|
|
|
|
$i++; |
|
506
|
|
|
|
|
|
|
} |
|
507
|
|
|
|
|
|
|
|
|
508
|
0
|
|
|
|
|
|
return \%hit_for; |
|
509
|
|
|
|
|
|
|
} |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
sub _do_hmmscan { |
|
512
|
0
|
|
|
0
|
|
|
my $self = shift; |
|
513
|
0
|
|
|
|
|
|
my $seq = shift; |
|
514
|
0
|
|
|
|
|
|
my $hmmdb = shift; |
|
515
|
|
|
|
|
|
|
|
|
516
|
0
|
|
|
|
|
|
my $query = File::Temp->new( |
|
517
|
|
|
|
|
|
|
template => 'tempfile_XXXXXXXXXXXXXXX', |
|
518
|
|
|
|
|
|
|
suffix => '.faa', |
|
519
|
|
|
|
|
|
|
unlock => 1 |
|
520
|
|
|
|
|
|
|
); |
|
521
|
|
|
|
|
|
|
|
|
522
|
0
|
|
|
|
|
|
print $query '>query' . "\n" . $seq; |
|
523
|
|
|
|
|
|
|
|
|
524
|
0
|
|
|
|
|
|
my $pgm = 'hmmscan'; |
|
525
|
|
|
|
|
|
|
|
|
526
|
0
|
|
|
|
|
|
my $cpu_n = 1; |
|
527
|
0
|
|
|
|
|
|
my $tbout = File::Temp->new(suffix => '_domtblout.tsv'); |
|
528
|
0
|
|
|
|
|
|
my $opt = ' --domtblout ' . $tbout . ' --cpu ' . $cpu_n; |
|
529
|
0
|
|
|
|
|
|
my $log = File::Temp->new(suffix => '_hmmscan.log', unlock => 1); |
|
530
|
|
|
|
|
|
|
|
|
531
|
0
|
|
|
|
|
|
my $cmd = "$pgm $opt $hmmdb $query > $log"; |
|
532
|
0
|
|
|
|
|
|
system $cmd; |
|
533
|
|
|
|
|
|
|
|
|
534
|
0
|
|
|
|
|
|
return $tbout; |
|
535
|
|
|
|
|
|
|
} |
|
536
|
|
|
|
|
|
|
|
|
537
|
1
|
|
|
1
|
|
3104
|
no Moose::Role; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
6
|
|
|
538
|
|
|
|
|
|
|
1; |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
__END__ |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=pod |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head1 NAME |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Bio::Palantir::Roles::Fillable - Fillable Moose role for the construction of DomainPlus object arrays and Exploratory methods |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=head1 VERSION |
|
549
|
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
version 0.211420 |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
# TODO |
|
555
|
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# TODO |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=head1 AUTHOR |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
Loic MEUNIER <lmeunier@uliege.be> |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
This software is copyright (c) 2019 by University of Liege / Unit of Eukaryotic Phylogenomics / Loic MEUNIER and Denis BAURAIN. |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
|
569
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=cut |