line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::MUST::Drivers::Exonerate::Aligned; |
2
|
|
|
|
|
|
|
# ABSTRACT: Bio::MUST driver for running the exonerate alignment program |
3
|
|
|
|
|
|
|
$Bio::MUST::Drivers::Exonerate::Aligned::VERSION = '0.210160'; |
4
|
5
|
|
|
5
|
|
45
|
use Moose; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
49
|
|
5
|
5
|
|
|
5
|
|
37682
|
use namespace::autoclean; |
|
5
|
|
|
|
|
16
|
|
|
5
|
|
|
|
|
61
|
|
6
|
|
|
|
|
|
|
|
7
|
5
|
|
|
5
|
|
549
|
use autodie; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
48
|
|
8
|
5
|
|
|
5
|
|
28912
|
use feature qw(say switch); |
|
5
|
|
|
|
|
15
|
|
|
5
|
|
|
|
|
576
|
|
9
|
5
|
|
|
5
|
|
3399
|
use experimental qw(smartmatch); # to suppress warnings about 'when' |
|
5
|
|
|
|
|
6563
|
|
|
5
|
|
|
|
|
30
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# use Smart::Comments '###'; |
12
|
|
|
|
|
|
|
|
13
|
5
|
|
|
5
|
|
392
|
use Carp; |
|
5
|
|
|
|
|
13
|
|
|
5
|
|
|
|
|
293
|
|
14
|
5
|
|
|
5
|
|
35
|
use Const::Fast; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
61
|
|
15
|
5
|
|
|
5
|
|
364
|
use IPC::System::Simple qw(system); |
|
5
|
|
|
|
|
15
|
|
|
5
|
|
|
|
|
288
|
|
16
|
5
|
|
|
5
|
|
44
|
use Module::Runtime qw(use_module); |
|
5
|
|
|
|
|
24
|
|
|
5
|
|
|
|
|
77
|
|
17
|
5
|
|
|
5
|
|
525
|
use Path::Class qw(file); |
|
5
|
|
|
|
|
16
|
|
|
5
|
|
|
|
|
296
|
|
18
|
|
|
|
|
|
|
|
19
|
5
|
|
|
5
|
|
34
|
use Bio::MUST::Core; |
|
5
|
|
|
|
|
13
|
|
|
5
|
|
|
|
|
222
|
|
20
|
5
|
|
|
5
|
|
44
|
use Bio::MUST::Core::Constants qw(:gaps); |
|
5
|
|
|
|
|
13
|
|
|
5
|
|
|
|
|
1105
|
|
21
|
5
|
|
|
5
|
|
48
|
use aliased 'Bio::MUST::Core::Ali'; |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
63
|
|
22
|
5
|
|
|
5
|
|
1375
|
use aliased 'Bio::MUST::Core::Seq'; |
|
5
|
|
|
|
|
19
|
|
|
5
|
|
|
|
|
26
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
has 'dna_seq' => ( |
26
|
|
|
|
|
|
|
is => 'ro', |
27
|
|
|
|
|
|
|
isa => 'Bio::MUST::Core::Seq', |
28
|
|
|
|
|
|
|
required => 1, |
29
|
|
|
|
|
|
|
); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
has 'pep_seq' => ( |
32
|
|
|
|
|
|
|
is => 'ro', |
33
|
|
|
|
|
|
|
isa => 'Bio::MUST::Core::Seq', |
34
|
|
|
|
|
|
|
required => 1, |
35
|
|
|
|
|
|
|
); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# TODO: homogeneize with main class |
38
|
|
|
|
|
|
|
has 'code' => ( |
39
|
|
|
|
|
|
|
is => 'ro', |
40
|
|
|
|
|
|
|
isa => 'Str', |
41
|
|
|
|
|
|
|
default => '1', |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
has 'model' => ( |
45
|
|
|
|
|
|
|
is => 'ro', |
46
|
|
|
|
|
|
|
isa => 'Str', |
47
|
|
|
|
|
|
|
init_arg => undef, |
48
|
|
|
|
|
|
|
writer => '_set_model', |
49
|
|
|
|
|
|
|
); |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
has $_ => ( |
52
|
|
|
|
|
|
|
is => 'ro', |
53
|
|
|
|
|
|
|
isa => 'Num', |
54
|
|
|
|
|
|
|
init_arg => undef, |
55
|
|
|
|
|
|
|
writer => '_set_' . $_, |
56
|
|
|
|
|
|
|
) for qw( |
57
|
|
|
|
|
|
|
query_start query_end |
58
|
|
|
|
|
|
|
target_start target_end |
59
|
|
|
|
|
|
|
score |
60
|
|
|
|
|
|
|
); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
has $_ . '_seq' => ( |
63
|
|
|
|
|
|
|
is => 'ro', |
64
|
|
|
|
|
|
|
isa => 'Bio::MUST::Core::Seq', |
65
|
|
|
|
|
|
|
init_arg => undef, |
66
|
|
|
|
|
|
|
writer => '_set_' . $_, |
67
|
|
|
|
|
|
|
) for qw(query target spliced); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# TODO: subclass this to avoid code repetition with main class |
71
|
|
|
|
|
|
|
# TODO: handle reverse complement |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub BUILD { |
74
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# provision executable |
77
|
0
|
|
|
|
|
|
my $app = use_module('Bio::MUST::Provision::Exonerate')->new; |
78
|
0
|
|
|
|
|
|
$app->meet(); |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# build temp Ali file for input DNA seq |
81
|
0
|
|
|
|
|
|
my $dna = Ali->new( |
82
|
|
|
|
|
|
|
seqs => [ $self->dna_seq ], |
83
|
|
|
|
|
|
|
guessing => 0, |
84
|
|
|
|
|
|
|
); |
85
|
0
|
|
|
|
|
|
my $dnafile = $dna->temp_fasta; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# build temp Ali file for input PEP seq |
88
|
0
|
|
|
|
|
|
my $pep = Ali->new( |
89
|
|
|
|
|
|
|
seqs => [ $self->pep_seq ], |
90
|
|
|
|
|
|
|
guessing => 0, |
91
|
|
|
|
|
|
|
); |
92
|
0
|
|
|
|
|
|
my $pepfile = $pep->temp_fasta; |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# execute exonerate |
95
|
0
|
|
|
|
|
|
my $outfile = $self->_exonerate($dnafile, $pepfile); |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# parse outfile and populate seqs |
98
|
0
|
|
|
|
|
|
$self->_parse_outfile($outfile); |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# check that everything ran fine |
101
|
0
|
0
|
0
|
|
|
|
unless ( $self->query_seq->seq_len ) { |
|
|
0
|
|
|
|
|
|
102
|
0
|
|
|
|
|
|
carp '[BMD] Warning: exonerate could not align seqs;' |
103
|
|
|
|
|
|
|
. ' returning empty seqs!'; |
104
|
|
|
|
|
|
|
### dnafile: join q{}, "\n", file($dnafile)->slurp |
105
|
|
|
|
|
|
|
### pepfile: join q{}, "\n", file($pepfile)->slurp |
106
|
|
|
|
|
|
|
} |
107
|
0
|
|
|
|
|
|
elsif ( $self->query_seq->seq_len != $self->target_seq->seq_len ) { |
108
|
0
|
|
|
|
|
|
carp '[BMD] Warning: query and target seqs not of same length!'; |
109
|
|
|
|
|
|
|
### dnafile: join q{}, "\n", file($dnafile)->slurp |
110
|
|
|
|
|
|
|
### pepfile: join q{}, "\n", file($pepfile)->slurp |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
elsif ( $self->spliced_seq->seq_len != 3 * $self->target_seq->seq_len ) { |
113
|
|
|
|
|
|
|
carp '[BMD] Warning: DNA and protein target seqs not of same length!'; |
114
|
|
|
|
|
|
|
### dnafile: join q{}, "\n", file($dnafile)->slurp |
115
|
|
|
|
|
|
|
### pepfile: join q{}, "\n", file($pepfile)->slurp |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# unlink temp files |
119
|
0
|
|
|
|
|
|
file($_)->remove for ($dnafile, $pepfile, $outfile); |
120
|
|
|
|
|
|
|
|
121
|
0
|
|
|
|
|
|
return; |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
const my %ARGS_FOR => ( |
126
|
|
|
|
|
|
|
'lc' => 'protein2genome', |
127
|
|
|
|
|
|
|
'bf' => 'protein2genome:bestfit --exhaustive', |
128
|
|
|
|
|
|
|
); |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
const my $NEW_HSP => qr{C4 \s Alignment:}xms; |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub _exonerate { |
133
|
0
|
|
|
0
|
|
|
my $self = shift; |
134
|
0
|
|
|
|
|
|
my $dnafile = shift; |
135
|
0
|
|
|
|
|
|
my $pepfile = shift; |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# first try with heuristic 'protein2genome' model |
138
|
|
|
|
|
|
|
# if it yields > 1 HSPs try with exhaustive 'protein2genome:bestfit' model |
139
|
|
|
|
|
|
|
# it this better model fails then return first HSP from lesser model |
140
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
|
my $pgm = 'exonerate'; |
142
|
0
|
|
|
|
|
|
my $code = $self->code; |
143
|
|
|
|
|
|
|
|
144
|
0
|
|
|
|
|
|
my $return = q{}; |
145
|
0
|
|
|
|
|
|
my $remove = q{}; |
146
|
|
|
|
|
|
|
|
147
|
0
|
|
|
|
|
|
my %outfile_for; |
148
|
|
|
|
|
|
|
my $hsp_n; |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
MODEL: |
151
|
0
|
|
|
|
|
|
for my $model ( qw(lc bf) ) { |
152
|
|
|
|
|
|
|
|
153
|
0
|
|
|
|
|
|
$remove = $return; # lesser model failed (if any) |
154
|
0
|
|
|
|
|
|
$return = $model; # try better model |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
# create exonerate command |
157
|
0
|
|
|
|
|
|
my $outfile = $dnafile . ".exo.$model.out"; |
158
|
0
|
|
|
|
|
|
$outfile_for{$model} = $outfile; |
159
|
|
|
|
|
|
|
my $cmd = qq{$pgm --showvulgar no --showalignment yes} |
160
|
|
|
|
|
|
|
. ' --alignmentwidth 100000' # needed for robust splicing |
161
|
0
|
|
|
|
|
|
. " --verbose 0 --geneticcode $code --model " . $ARGS_FOR{$model} |
162
|
|
|
|
|
|
|
. " --target $dnafile --query $pepfile > $outfile 2> /dev/null" |
163
|
|
|
|
|
|
|
; |
164
|
|
|
|
|
|
|
#### $cmd |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# Note: we ask for a single-line alignment to avoid the difficult |
167
|
|
|
|
|
|
|
# parsing of linebreak-interrupted runs of spaces within introns |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# try to robustly execute exonerate |
170
|
0
|
|
|
|
|
|
my $ret_code = system( [ 0, 1, 127, 139 ], $cmd); |
171
|
0
|
0
|
|
|
|
|
if ($ret_code == 127) { |
172
|
0
|
|
|
|
|
|
carp "[BMD] Warning: cannot execute $pgm command;" |
173
|
|
|
|
|
|
|
. ' returning nothing!'; |
174
|
0
|
|
|
|
|
|
return; # This will likely crash calling code but that's OK. |
175
|
|
|
|
|
|
|
} |
176
|
0
|
0
|
|
|
|
|
if ($ret_code == 139) { |
177
|
0
|
|
|
|
|
|
carp "[BMD] Warning: $pgm crashed; skipping model: $model!"; |
178
|
|
|
|
|
|
|
# do nothing more to leave loop with accurate $hsp_n |
179
|
|
|
|
|
|
|
} |
180
|
0
|
0
|
|
|
|
|
if ($ret_code == 1) { |
181
|
0
|
|
|
|
|
|
carp "[BMD] Warning: $pgm crashed because of a bad nt seq!"; |
182
|
0
|
|
|
|
|
|
return $outfile_for{$model}; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
# TODO: try to bypass shell (need for absolute path to executable then) |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
# check number of HSPs in outfile |
187
|
0
|
|
|
|
|
|
my @lines = file($outfile)->slurp; |
188
|
0
|
|
|
|
|
|
$hsp_n = grep { m/$NEW_HSP/xms } @lines; |
|
0
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# stop here if only one; otherwise possibly try better model |
191
|
0
|
0
|
|
|
|
|
last MODEL if $hsp_n == 1; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# if no HSP then better model was tried and finally failed |
195
|
|
|
|
|
|
|
# thus switch back to lesser model (with > 1 HSPs) |
196
|
0
|
0
|
|
|
|
|
if ($hsp_n == 0) { |
197
|
0
|
|
|
|
|
|
carp "[BMD] Warning: cannot get only one HSP from $pgm;" |
198
|
|
|
|
|
|
|
. ' returning first one!'; |
199
|
0
|
|
|
|
|
|
($return, $remove) = ($remove, $return); |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
#### $remove |
203
|
|
|
|
|
|
|
#### $return |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# delete unused file (if any) and return outfile |
206
|
0
|
|
|
|
|
|
file( $outfile_for{ $remove} )->remove; |
207
|
0
|
|
|
|
|
|
$self->_set_model( $return); |
208
|
|
|
|
|
|
|
|
209
|
0
|
|
|
|
|
|
return $outfile_for{$return}; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# Raw score: 1215 |
214
|
|
|
|
|
|
|
# Query range: 0 -> 247 |
215
|
|
|
|
|
|
|
# Target range: 1429 -> 2184 |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# 43 : ysGlyIleValLysGluIleIle<->AspProGlyArgGlyAlaProLeuAlaArgValVal : 61 |
218
|
|
|
|
|
|
|
# ||||||||||||||||||||||| |||||||||||||||||||||||||||||||||||| |
219
|
|
|
|
|
|
|
# ysGlyIleValLysGluIleIleHisAspProGlyArgGlyAlaProLeuAlaArgValVal |
220
|
|
|
|
|
|
|
# 1554 : AGGGCATAGTGAAAGAAATCATACACGACCCAGGAAGAGGCGCTCCCTTAGCCAGAGTTGTT : 1613 |
221
|
|
|
|
|
|
|
# |
222
|
|
|
|
|
|
|
# ... |
223
|
|
|
|
|
|
|
# |
224
|
|
|
|
|
|
|
# 145 : sThrArgValArgLeuProSerGlySerLysLysValLeuSerSerThrAsnArg-AlaVal : 164 |
225
|
|
|
|
|
|
|
# |||||||||||||||||||||||||||||||||||||||||||||||||||||||#.!!||| |
226
|
|
|
|
|
|
|
# sThrArgValArgLeuProSerGlySerLysLysValLeuSerSerThrAsnArg#UnkVal |
227
|
|
|
|
|
|
|
# 1863 : AACGCGCGTGCGTCTTCCGTCAGGATCGAAAAAGGTGCTATCGTCGACGAATCGAGNCCGTG : 1923 |
228
|
|
|
|
|
|
|
# |
229
|
|
|
|
|
|
|
# ... |
230
|
|
|
|
|
|
|
# |
231
|
|
|
|
|
|
|
# 177 : gGlnArgLysAlaHisLeuIleGluIleGlnValAsnGlyGlyThrValAlaGlnLysValAsp : 197 |
232
|
|
|
|
|
|
|
# ||||!:!||||||||||||!!:|||||||||||||||||||||:!!|||||| !!||||||||| |
233
|
|
|
|
|
|
|
# gGlnLysLysAlaHisLeuMetGluIleGlnValAsnGlyGlySerValAla***LysValAsp |
234
|
|
|
|
|
|
|
# 321 : GCAAAAGAAGGCCCACCTTATGGAAATCCAAGTCAATGGTGGATCCGTTGCGTAAAAGGTCGAC : 383 |
235
|
|
|
|
|
|
|
# |
236
|
|
|
|
|
|
|
# ... |
237
|
|
|
|
|
|
|
# |
238
|
|
|
|
|
|
|
# 91 : rgLysHisProTrpHisValIleArgIleAsnLysMetLeuSerCysAlaGlyAlaAspArg{L : 111 |
239
|
|
|
|
|
|
|
# ||:!!|||||||||||||||:!!|||||||||||||||||||||||||||||||||||||||{| |
240
|
|
|
|
|
|
|
# rgGlnHisProTrpHisValLeuArgIleAsnLysMetLeuSerCysAlaGlyAlaAspArg{L |
241
|
|
|
|
|
|
|
# 257 : GACAACATCCATGGCACGTTCTTAGAATCAACAAGATGCTTTCCTGCGCAGGTGCCGATAGA{C : 319 |
242
|
|
|
|
|
|
|
# |
243
|
|
|
|
|
|
|
# 112 : e} >>>> Target Intron 1 >>>> {u}GlnGlnGlyMetArgGlyAlaPheGlyLys : 121 |
244
|
|
|
|
|
|
|
# |} 86 bp {|} ||||||||| !|||||||||||| |
245
|
|
|
|
|
|
|
# e}++ --{u}UnkUnkGlyMetArgHisAlaPheGlyLys |
246
|
|
|
|
|
|
|
# 320 : T}gt.........................nn{N}NNNNNCGGTATGAGACATGCTTTCGGAAAG : 435 |
247
|
|
|
|
|
|
|
# |
248
|
|
|
|
|
|
|
# ... |
249
|
|
|
|
|
|
|
# |
250
|
|
|
|
|
|
|
# 126 : SerGlnAlaLeuAspValIleHisGluTyrPheLys : 137 |
251
|
|
|
|
|
|
|
# !!!! !||||||!!: !!:!! ! !||| |
252
|
|
|
|
|
|
|
# ThrProAlaLeuGluPheValUnkUnkValHisLys |
253
|
|
|
|
|
|
|
# 689 : ACGCCAGCTCTGGAGTTCGTAAKGRGAGTACATAAG : 652 |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
# regexes for parsing outfile |
256
|
|
|
|
|
|
|
const my $LEFT_NUM => qr{-?\d+\s+:}xms; |
257
|
|
|
|
|
|
|
const my $RIGHT_NUM => qr{:\s+-?\d+}xms; |
258
|
|
|
|
|
|
|
const my $SEQ_STR => qr{[A-Za-z0-9\.\#\*\+\-\<\>\{\}\ ]+}xms; |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
sub _parse_outfile { |
261
|
0
|
|
|
0
|
|
|
my $self = shift; |
262
|
0
|
|
|
|
|
|
my $outfile = shift; |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
# read output file (human-readable alignment) |
265
|
0
|
|
|
|
|
|
open my $out, '<', $outfile; |
266
|
|
|
|
|
|
|
#### $outfile |
267
|
|
|
|
|
|
|
|
268
|
0
|
|
|
|
|
|
my $query_seq = q{}; |
269
|
0
|
|
|
|
|
|
my $target_seq = q{}; |
270
|
0
|
|
|
|
|
|
my $spliced_seq = q{}; |
271
|
|
|
|
|
|
|
|
272
|
0
|
|
|
|
|
|
my $hsp_n = 0; |
273
|
0
|
|
|
|
|
|
my $in_block = 0; |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
LINE: |
276
|
0
|
|
|
|
|
|
while (my $line = <$out>) { |
277
|
0
|
|
|
|
|
|
chomp $line; |
278
|
|
|
|
|
|
|
##### $line |
279
|
|
|
|
|
|
|
|
280
|
0
|
|
|
|
|
|
given ($line) { |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# header lines |
283
|
0
|
|
|
|
|
|
when (/$NEW_HSP/xms) { |
284
|
0
|
|
|
|
|
|
$hsp_n++; |
285
|
0
|
0
|
|
|
|
|
last LINE if $hsp_n > 1; # process only first HSP |
286
|
|
|
|
|
|
|
} |
287
|
0
|
|
|
|
|
|
when (/^ \s*Raw \s score:\s (\d+) /xms) { |
288
|
0
|
|
|
|
|
|
$self->_set_score( $1 ); |
289
|
|
|
|
|
|
|
} |
290
|
0
|
|
|
|
|
|
when (/^ \s*Query \s range:\s (\d+) \s->\s (\d+) /xms) { |
291
|
0
|
|
|
|
|
|
$self->_set_query_start( $1 + 1 ); |
292
|
0
|
|
|
|
|
|
$self->_set_query_end( $2); |
293
|
|
|
|
|
|
|
} # Note: coordinates are converted to BLAST/GFF standard |
294
|
0
|
|
|
|
|
|
when (/^ \s*Target \s range:\s (\d+) \s->\s (\d+) /xms) { |
295
|
0
|
|
|
|
|
|
$self->_set_target_start( $1 + 1 ); |
296
|
0
|
|
|
|
|
|
$self->_set_target_end( $2); |
297
|
|
|
|
|
|
|
} # Note: coordinates are converted to BLAST/GFF standard |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# alignment lines |
300
|
0
|
|
|
|
|
|
when (/^ \s* $LEFT_NUM \s ($SEQ_STR) \s $RIGHT_NUM \s* $/xms) { |
301
|
0
|
0
|
|
|
|
|
unless ($in_block) { |
302
|
|
|
|
|
|
|
# this is query line |
303
|
0
|
|
|
|
|
|
$query_seq .= $1; |
304
|
0
|
|
|
|
|
|
$in_block = 1; |
305
|
0
|
|
|
|
|
|
<$out>; # skip midline (may be hard to parse) |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
else { |
308
|
|
|
|
|
|
|
# this is DNA |
309
|
0
|
|
|
|
|
|
$spliced_seq .= $1; |
310
|
0
|
|
|
|
|
|
$in_block = 0; |
311
|
|
|
|
|
|
|
} |
312
|
|
|
|
|
|
|
} |
313
|
0
|
|
|
|
|
|
when (/^ \s+ ($SEQ_STR) \s* $/xms) { |
314
|
|
|
|
|
|
|
# this is likely target line |
315
|
0
|
0
|
|
|
|
|
$target_seq .= $1 if $in_block; |
316
|
|
|
|
|
|
|
} |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# remove introns and frameshifts in all seqs simultaneously |
321
|
0
|
|
|
|
|
|
($query_seq, $target_seq, $spliced_seq) |
322
|
|
|
|
|
|
|
= _splice_seqs($query_seq, $target_seq, $spliced_seq); |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
# store aligned pep seqs |
325
|
0
|
|
|
|
|
|
$self->_set_query( |
326
|
|
|
|
|
|
|
Seq->new( |
327
|
|
|
|
|
|
|
seq_id => $self->pep_seq->full_id, |
328
|
|
|
|
|
|
|
seq => _canonize_seq(1, $query_seq) |
329
|
|
|
|
|
|
|
) |
330
|
|
|
|
|
|
|
); |
331
|
0
|
|
|
|
|
|
$self->_set_target( |
332
|
|
|
|
|
|
|
Seq->new( |
333
|
|
|
|
|
|
|
seq_id => $self->dna_seq->full_id, |
334
|
|
|
|
|
|
|
seq => _canonize_seq(1, $target_seq) |
335
|
|
|
|
|
|
|
) |
336
|
|
|
|
|
|
|
); |
337
|
0
|
|
|
|
|
|
$self->_set_spliced( |
338
|
|
|
|
|
|
|
Seq->new( |
339
|
|
|
|
|
|
|
seq_id => $self->dna_seq->full_id, |
340
|
|
|
|
|
|
|
seq => _canonize_seq(0, $spliced_seq) |
341
|
|
|
|
|
|
|
) |
342
|
|
|
|
|
|
|
); |
343
|
|
|
|
|
|
|
|
344
|
0
|
|
|
|
|
|
return; |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
# regexes for splicing seqs |
349
|
|
|
|
|
|
|
const my $ANGLES => qr{(?:<|>){4}}xms; |
350
|
|
|
|
|
|
|
const my $TARGET => qr{Target \s Intron \s \d+}xms; |
351
|
|
|
|
|
|
|
const my $INTRON => qr{\s{2} $ANGLES \s $TARGET \s $ANGLES \s{2}}xms; |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
sub _splice_seqs { |
354
|
0
|
|
|
0
|
|
|
my $query_seq = shift; |
355
|
0
|
|
|
|
|
|
my $target_seq = shift; |
356
|
0
|
|
|
|
|
|
my $spliced_seq = shift; |
357
|
|
|
|
|
|
|
#### $query_seq |
358
|
|
|
|
|
|
|
#### $target_seq |
359
|
|
|
|
|
|
|
#### $spliced_seq |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# preserve framshifts in target_seq |
362
|
|
|
|
|
|
|
# Note: this is needed to avoid splicing out real frameshifts below |
363
|
0
|
|
|
|
|
|
$target_seq =~ s/\#{2}\-{3}/##Unk/xmsg; # 1-nt deletion frameshifts |
364
|
0
|
|
|
|
|
|
$target_seq =~ s/\#{3} /Unk/xmsg; # ??? |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
# store intron pos and 'lengths' in query_seq |
367
|
0
|
|
|
|
|
|
my @introns; |
368
|
0
|
|
|
|
|
|
while ($query_seq =~ m/($INTRON)/xmsg) { |
369
|
0
|
|
|
|
|
|
my $len = length $1; |
370
|
0
|
|
|
|
|
|
my $pos = pos($query_seq) - $len; |
371
|
0
|
|
|
|
|
|
push @introns, [ $pos, $len ]; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# store frameshift pos and 'lengths' in target_seq |
375
|
0
|
|
|
|
|
|
while ($target_seq =~ m/(\#+)/xmsg) { |
376
|
0
|
|
|
|
|
|
my $len = length $1; |
377
|
0
|
|
|
|
|
|
my $pos = pos($target_seq) - $len; |
378
|
0
|
|
|
|
|
|
push @introns, [ $pos, $len ]; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# splice out introns starting from seq ends |
382
|
|
|
|
|
|
|
# @introns |
383
|
0
|
|
|
|
|
|
for my $intron (sort { $b->[0] <=> $a->[0] } @introns) { |
|
0
|
|
|
|
|
|
|
384
|
0
|
|
|
|
|
|
my ($pos, $len) = @{$intron}; |
|
0
|
|
|
|
|
|
|
385
|
0
|
|
|
|
|
|
my $query_in = substr $query_seq, $pos, $len, q{}; |
386
|
0
|
|
|
|
|
|
my $target_in = substr $target_seq, $pos, $len, q{}; |
387
|
0
|
|
|
|
|
|
my $spliced_in = substr $spliced_seq, $pos, $len, q{}; |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
#### $query_in |
390
|
|
|
|
|
|
|
#### $target_in |
391
|
|
|
|
|
|
|
#### $spliced_in |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
#### $query_seq |
395
|
|
|
|
|
|
|
#### $target_seq |
396
|
|
|
|
|
|
|
#### $spliced_seq |
397
|
|
|
|
|
|
|
|
398
|
0
|
|
|
|
|
|
return ($query_seq, $target_seq, $spliced_seq); |
399
|
|
|
|
|
|
|
} |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
const my %SHORT_FOR => ( |
403
|
|
|
|
|
|
|
Ala => 'A', |
404
|
|
|
|
|
|
|
Asx => 'B', |
405
|
|
|
|
|
|
|
Cys => 'C', |
406
|
|
|
|
|
|
|
Asp => 'D', |
407
|
|
|
|
|
|
|
Glu => 'E', |
408
|
|
|
|
|
|
|
Phe => 'F', |
409
|
|
|
|
|
|
|
Gly => 'G', |
410
|
|
|
|
|
|
|
His => 'H', |
411
|
|
|
|
|
|
|
Ile => 'I', |
412
|
|
|
|
|
|
|
Lys => 'K', |
413
|
|
|
|
|
|
|
Leu => 'L', |
414
|
|
|
|
|
|
|
Met => 'M', |
415
|
|
|
|
|
|
|
Asn => 'N', |
416
|
|
|
|
|
|
|
Pro => 'P', |
417
|
|
|
|
|
|
|
Gln => 'Q', |
418
|
|
|
|
|
|
|
Arg => 'R', |
419
|
|
|
|
|
|
|
Ser => 'S', |
420
|
|
|
|
|
|
|
Thr => 'T', |
421
|
|
|
|
|
|
|
Val => 'V', |
422
|
|
|
|
|
|
|
Trp => 'W', |
423
|
|
|
|
|
|
|
Xaa => 'X', |
424
|
|
|
|
|
|
|
Tyr => 'Y', |
425
|
|
|
|
|
|
|
Glx => 'Z', |
426
|
|
|
|
|
|
|
Unk => $FRAMESHIFT, |
427
|
|
|
|
|
|
|
'---' => '-', |
428
|
|
|
|
|
|
|
); |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
sub _canonize_seq { |
431
|
0
|
|
|
0
|
|
|
my $prot = shift; |
432
|
0
|
|
|
|
|
|
my $seq3 = shift; |
433
|
|
|
|
|
|
|
#### $seq3 |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# example of 1-nt deletion |
436
|
|
|
|
|
|
|
# ArgGlyAsnAla--GlyGlyGlnHisHisHisArgIle...LysPhePheThrArgArgAlaGlu--GluLysIleLys |
437
|
|
|
|
|
|
|
# |
438
|
|
|
|
|
|
|
# ArgGlyAsnAla##---GlyGlnHisHisHisArgIle...LysPhePheThrArgArgAlaGlu##---LysIleLys |
439
|
|
|
|
|
|
|
# CGTGGTAATGCTGT---GGTCAGCACCACCACAGAATT...AAGTTCTTCACACGCCGTGCTGAGGA---AAGATCAAG |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# ensure no residual frameshift |
442
|
0
|
|
|
|
|
|
$seq3 =~ tr/\{\}//d; # spliced codons |
443
|
0
|
|
|
|
|
|
$seq3 =~ s/\<\-\>/---/xmsg; # insertion |
444
|
0
|
|
|
|
|
|
$seq3 =~ s/\*{3} /Unk/xmsg; # STOP |
445
|
|
|
|
|
|
|
#### $seq3 |
446
|
|
|
|
|
|
|
|
447
|
0
|
0
|
|
|
|
|
return $seq3 unless $prot; |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# 'translate' 3-letter AAs to 1-letter AAs |
450
|
0
|
|
|
|
|
|
my $seq1 = q{}; |
451
|
0
|
|
|
|
|
|
for (my $i = 0; $i < length $seq3; $i += 3) { |
452
|
0
|
|
|
|
|
|
my $triplet = substr $seq3, $i, 3; |
453
|
0
|
|
|
|
|
|
$seq1 .= $SHORT_FOR{ $triplet }; |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
#### $seq1 |
456
|
|
|
|
|
|
|
|
457
|
0
|
|
|
|
|
|
return $seq1; |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
461
|
|
|
|
|
|
|
1; |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
__END__ |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=pod |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=head1 NAME |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Bio::MUST::Drivers::Exonerate::Aligned - Bio::MUST driver for running the exonerate alignment program |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
=head1 VERSION |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
version 0.210160 |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=head1 SYNOPSIS |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# TODO |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=head1 DESCRIPTION |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# TODO |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
=head1 AUTHOR |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
Denis BAURAIN <denis.baurain@uliege.be> |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
492
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
=cut |