line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Tools::SeqPattern::Backtranslate; |
2
|
2
|
|
|
2
|
|
633
|
use strict; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
44
|
|
3
|
2
|
|
|
2
|
|
5
|
use warnings; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
41
|
|
4
|
|
|
|
|
|
|
|
5
|
2
|
|
|
2
|
|
5
|
use base qw(Bio::Root::Root); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
463
|
|
6
|
2
|
|
|
2
|
|
6
|
use base qw(Exporter); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
95
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Bio::Tools::SeqPattern::Backtranslate |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 DESCRIPTION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
This module should not be used directly. It provides helper methods to |
15
|
|
|
|
|
|
|
Bio::Tools::SeqPattern to reverse translate protein patterns. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=cut |
18
|
|
|
|
|
|
|
|
19
|
2
|
|
|
2
|
|
862
|
use Bio::Seq; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
62
|
|
20
|
2
|
|
|
2
|
|
9
|
use Bio::Tools::CodonTable; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
75
|
|
21
|
|
|
|
|
|
|
|
22
|
2
|
|
|
2
|
|
6
|
use List::MoreUtils qw(uniq); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
21
|
|
23
|
2
|
|
|
2
|
|
1004
|
use Carp qw(croak); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
2072
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
our @EXPORT_OK = qw(_reverse_translate_motif); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
our @EXPORT = @EXPORT_OK; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub _reverse_translate_motif { |
30
|
|
|
|
|
|
|
# Main subroutine. It takes a Profam-like motif and returns its |
31
|
|
|
|
|
|
|
# reverse translation using degenerate codons. |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Steps: |
34
|
|
|
|
|
|
|
# 1. Tokenize, then parse tokens. |
35
|
|
|
|
|
|
|
# 2. Reverse translate each token type. |
36
|
|
|
|
|
|
|
# 3. Join tokens in original order. Return the resulting string. |
37
|
|
|
|
|
|
|
|
38
|
12
|
|
|
12
|
|
3316
|
my $motif = shift; |
39
|
|
|
|
|
|
|
|
40
|
12
|
|
|
|
|
21
|
$motif =~ s/\./X/g; |
41
|
12
|
|
|
|
|
19
|
$motif = uc $motif; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
### 1. Tokenize, parse the motif. |
44
|
12
|
|
|
|
|
17
|
my ( $ordered, $classified ) = _parse_motif($motif); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
### 2. Reverse translate each token type. |
47
|
|
|
|
|
|
|
# Reverse translate the plain (unambiguous) tokens. |
48
|
11
|
|
|
|
|
30
|
my $ct = Bio::Tools::CodonTable->new; |
49
|
11
|
|
|
|
|
10
|
foreach my $seq ( @{ $classified->{plain} } ) { |
|
11
|
|
|
|
|
15
|
|
50
|
19
|
|
|
|
|
59
|
my $seqO |
51
|
|
|
|
|
|
|
= Bio::Seq->new( -seq => $$seq, -alphabet => 'protein' ); |
52
|
19
|
|
|
|
|
38
|
$$seq = $ct->reverse_translate_all($seqO); |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# Reverse translate the ambiguous tokens. |
56
|
11
|
|
|
|
|
9
|
foreach my $token ( @{ $classified->{ambiguous} } ) { |
|
11
|
|
|
|
|
19
|
|
57
|
8
|
|
|
|
|
31
|
my ($aas) = $$token =~ m(([A-Za-z\.]+)); |
58
|
8
|
|
|
|
|
8
|
my @codons_to_contract; |
59
|
|
|
|
|
|
|
|
60
|
8
|
|
|
|
|
16
|
foreach my $residue ( split '', $aas ) { |
61
|
32
|
|
|
|
|
55
|
push @codons_to_contract, $ct->revtranslate($residue); |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
8
|
|
|
|
|
15
|
my $ambiguous_codon = _contract_codons(@codons_to_contract); |
65
|
8
|
|
|
|
|
13
|
$$token = $ambiguous_codon; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Reverse translate the negated residues. |
69
|
11
|
|
|
|
|
11
|
foreach my $token ( @{ $classified->{negated} } ) { |
|
11
|
|
|
|
|
14
|
|
70
|
6
|
|
|
|
|
18
|
my ($aas) = $$token =~ m(([A-Za-z\.]+)); |
71
|
6
|
|
|
|
|
9
|
my $ambiguous_codon = _negated_aas_to_codon($aas); |
72
|
6
|
|
|
|
|
10
|
$$token = $ambiguous_codon; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
### 3. Join the profile back from its tokens. |
76
|
11
|
|
|
|
|
10
|
return join '', map {$$_} @{$ordered}; |
|
41
|
|
|
|
|
71
|
|
|
11
|
|
|
|
|
14
|
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
sub _parse_motif { |
81
|
|
|
|
|
|
|
# Profam-like motif parser. It takes the pattern as a string, and |
82
|
|
|
|
|
|
|
# returns two data structures that contain the tokens, organized |
83
|
|
|
|
|
|
|
# by order of appearance in the pattern (first return value) and by |
84
|
|
|
|
|
|
|
# category (second return value). |
85
|
|
|
|
|
|
|
|
86
|
12
|
|
|
12
|
|
632
|
my $motif = shift; |
87
|
12
|
|
|
|
|
19
|
my $parser = _tokenize_motif($motif); |
88
|
12
|
|
|
|
|
9
|
my ( %tokens, @tokens ); |
89
|
|
|
|
|
|
|
|
90
|
12
|
|
|
|
|
14
|
while ( my $token = $parser->() ) { |
91
|
42
|
100
|
|
|
|
80
|
croak ("Unknown syntax token: <", $token->[1], ">") |
92
|
|
|
|
|
|
|
if ( $token->[0] eq 'UNKNOWN' ); |
93
|
41
|
|
|
|
|
21
|
push @{ $tokens{ $token->[0] } }, \$token->[1]; |
|
41
|
|
|
|
|
70
|
|
94
|
41
|
|
|
|
|
66
|
push @tokens, \$token->[1]; |
95
|
|
|
|
|
|
|
} |
96
|
11
|
|
|
|
|
41
|
return ( \@tokens, \%tokens ); |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub _tokenize_motif { |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# Return a tokenizer iterator that sequentially recognizes and |
102
|
|
|
|
|
|
|
# returns each token in the input pattern. |
103
|
|
|
|
|
|
|
# Examples of each token type: |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# ambiguous: a position with more than one possible residue. |
106
|
|
|
|
|
|
|
# eg. [ALEP] |
107
|
|
|
|
|
|
|
# negated: a position in which some residues are excluded. |
108
|
|
|
|
|
|
|
# eg. [^WY] |
109
|
|
|
|
|
|
|
# plain: a common sequence of residues. One position, one residue. |
110
|
|
|
|
|
|
|
# eg. MAAEIK |
111
|
|
|
|
|
|
|
# open_par, close_par: tags surrounding a motif that is repeated |
112
|
|
|
|
|
|
|
# a certain number of times. |
113
|
|
|
|
|
|
|
# eg. (...){3} |
114
|
|
|
|
|
|
|
|
115
|
12
|
|
|
12
|
|
12
|
my $target = shift; |
116
|
|
|
|
|
|
|
return sub { |
117
|
53
|
100
|
|
53
|
|
107
|
return [ 'ambiguous', $1 ] |
118
|
|
|
|
|
|
|
if $target =~ /\G (\[[A-Za-z\.]+\]) /gcx; |
119
|
45
|
100
|
|
|
|
70
|
return [ 'negated', $1 ] |
120
|
|
|
|
|
|
|
if $target =~ /\G (\[\^[A-Za-z\.]+\]) /gcx; |
121
|
39
|
100
|
|
|
|
117
|
return [ 'plain', $1 ] |
122
|
|
|
|
|
|
|
if $target =~ /\G ([A-Za-z\.]+) /gcx; |
123
|
20
|
100
|
|
|
|
43
|
return [ 'open_par', $1 ] |
124
|
|
|
|
|
|
|
if $target =~ /\G (\() /gcx; |
125
|
16
|
100
|
|
|
|
33
|
return [ 'close_par', $1 ] |
126
|
|
|
|
|
|
|
if $target =~ /\G (\)[\{\d+[,\d+]*\}]*) /gcx; |
127
|
12
|
100
|
|
|
|
18
|
return [ 'UNKNOWN', $1 ] |
128
|
|
|
|
|
|
|
if $target =~ /\G (.) /gcx; |
129
|
11
|
|
|
|
|
20
|
return; |
130
|
12
|
|
|
|
|
37
|
}; |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _contract_codons { |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Take a list of codons, return an ambiguous codon. |
136
|
8
|
|
|
8
|
|
10
|
my @codons = map { uc $_ } @_; |
|
56
|
|
|
|
|
54
|
|
137
|
|
|
|
|
|
|
|
138
|
8
|
|
|
|
|
13
|
my @by_letter = ( [], [], [], ); |
139
|
8
|
|
|
|
|
8
|
my $ambiguous_codon; |
140
|
8
|
|
|
|
|
8
|
foreach my $codon (@codons) { |
141
|
56
|
|
|
|
|
56
|
my @letters = split '', $codon; |
142
|
56
|
|
|
|
|
44
|
for my $i ( 0 .. 2 ) { |
143
|
168
|
|
|
|
|
89
|
push @{ $by_letter[$i] }, $letters[$i]; |
|
168
|
|
|
|
|
200
|
|
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
} |
146
|
8
|
|
|
|
|
7
|
for my $i ( 0 .. 2 ) { |
147
|
|
|
|
|
|
|
$ambiguous_codon |
148
|
24
|
|
|
|
|
19
|
.= _convert( 'dna', _uniq_string( @{ $by_letter[$i] } ) ); |
|
24
|
|
|
|
|
33
|
|
149
|
|
|
|
|
|
|
} |
150
|
8
|
|
|
|
|
22
|
return $ambiguous_codon; |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
sub _expand_codon { |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Given a degenerate codon, return a list with all its |
156
|
|
|
|
|
|
|
# constituents. Takes a three-letter string (codon) as |
157
|
|
|
|
|
|
|
# input, returns a list with three-letter scalars. |
158
|
|
|
|
|
|
|
|
159
|
300
|
|
|
300
|
|
227
|
my $codon = shift; |
160
|
300
|
50
|
|
|
|
372
|
die "Wrong codon length!\n" if length $codon != 3; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
|
163
|
300
|
|
|
|
|
183
|
my ( @codons, @return_bases ); |
164
|
300
|
|
|
|
|
389
|
my @orig_bases = split '', $codon; |
165
|
|
|
|
|
|
|
|
166
|
300
|
|
|
|
|
300
|
for my $i ( 0 .. 2 ) { |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# from each redundant base, create a list with all their |
169
|
|
|
|
|
|
|
# components (e.g., N -> (A, C, G, T) ); |
170
|
900
|
|
|
|
|
913
|
my @components = split '', _convert('dna', $orig_bases[$i] ); |
171
|
900
|
|
|
|
|
1519
|
$orig_bases[$i] = [@components]; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Combine all the bases of each of the three positions of the |
175
|
|
|
|
|
|
|
# codons, and build the return list. |
176
|
300
|
|
|
|
|
208
|
for my $i ( @{ $orig_bases[0] } ) { |
|
300
|
|
|
|
|
335
|
|
177
|
1050
|
|
|
|
|
608
|
for my $j ( @{ $orig_bases[1] } ) { |
|
1050
|
|
|
|
|
914
|
|
178
|
3558
|
|
|
|
|
1895
|
for my $k ( @{ $orig_bases[2] } ) { |
|
3558
|
|
|
|
|
3021
|
|
179
|
11832
|
|
|
|
|
10954
|
push @return_bases, $i . $j . $k; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
} |
183
|
300
|
|
|
|
|
2390
|
return @return_bases; |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
{ |
187
|
|
|
|
|
|
|
my %convert; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub _convert { |
190
|
|
|
|
|
|
|
# Interconvert between redundant and non-redundant protein and |
191
|
|
|
|
|
|
|
# dna alphabets. Takes an alphabet (protein or dna) and a string |
192
|
|
|
|
|
|
|
# with the letter, and returns its equivalent in |
193
|
|
|
|
|
|
|
# redundant/non-redundant alphabet. Example ACTG -> N. |
194
|
|
|
|
|
|
|
|
195
|
924
|
|
|
924
|
|
668
|
my ($alphabet, $letter) = @_; |
196
|
924
|
50
|
33
|
|
|
4998
|
unless ( |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
197
|
|
|
|
|
|
|
$alphabet and $alphabet =~ /^dna$|^protein$/i |
198
|
|
|
|
|
|
|
and $letter and length $letter <= 4 |
199
|
0
|
|
|
|
|
0
|
) { croak "Wrong arguments!\n"; } |
200
|
|
|
|
|
|
|
|
201
|
924
|
100
|
|
|
|
1694
|
unless (%convert) { |
202
|
2
|
|
|
|
|
19
|
%convert = ( |
203
|
|
|
|
|
|
|
'dna' => { |
204
|
|
|
|
|
|
|
qw(N ACGT B CGT D AGT H ACT V ACG K GT |
205
|
|
|
|
|
|
|
M AC R AG S CG W AT Y CT A A C C T T G G) |
206
|
|
|
|
|
|
|
}, |
207
|
|
|
|
|
|
|
'protein' => { |
208
|
|
|
|
|
|
|
'.' => 'ACDEFGHIJKLMNOPQRSTUVWY', |
209
|
|
|
|
|
|
|
X => 'ACDEFGHIJKLMNOPQRSTUVWY', |
210
|
|
|
|
|
|
|
Z => 'QE', |
211
|
|
|
|
|
|
|
B => 'ND', |
212
|
|
|
|
|
|
|
}, |
213
|
|
|
|
|
|
|
); |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Make %convert hash key/value agnostic. |
216
|
2
|
|
|
|
|
4
|
foreach my $alphabet ( keys %convert ) { |
217
|
38
|
|
|
|
|
51
|
map { $convert{$alphabet}->{ $convert{$alphabet}{$_} } = $_ } |
218
|
4
|
|
|
|
|
4
|
keys %{ $convert{$alphabet} }; |
|
4
|
|
|
|
|
7
|
|
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
924
|
|
|
|
|
1644
|
return $convert{$alphabet}{$letter}; |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub _uniq_string { |
228
|
|
|
|
|
|
|
# Takes a list of letters and returns an alphabetically sorted |
229
|
|
|
|
|
|
|
# list with unique elements. |
230
|
|
|
|
|
|
|
|
231
|
24
|
|
|
24
|
|
37
|
my @letters = @_; |
232
|
24
|
|
|
|
|
89
|
return join '', sort { $a cmp $b } uniq @letters; |
|
48
|
|
|
|
|
77
|
|
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
{ |
236
|
|
|
|
|
|
|
my ( @codon_library, $ct ); |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
sub _negated_aas_to_codon { |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# Given a string of residues, returns a degenerate codon that will |
241
|
|
|
|
|
|
|
# not be translated into any of them, while maximizing degeneracy |
242
|
|
|
|
|
|
|
# (ie, it tries to also translate into as many residues as possible). |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# This functionality is required for reverse translating profiles |
245
|
|
|
|
|
|
|
# that contain negative patterns: [^X]. This means that the current |
246
|
|
|
|
|
|
|
# position should not contain aminoacid X, but can have any of the |
247
|
|
|
|
|
|
|
# others. The reverse translated nucleotide sequence should |
248
|
|
|
|
|
|
|
# reflect this. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Approach: construct a list of all possible codons, incluiding all |
251
|
|
|
|
|
|
|
# degenerate bases. This is an array of 15x15x15 = 3375 elements. |
252
|
|
|
|
|
|
|
# Order them by descendent "degeneracy". |
253
|
|
|
|
|
|
|
# Return the first one whose expansion in 4-lettered codons |
254
|
|
|
|
|
|
|
# doesn't contain a codon that translates into any of the |
255
|
|
|
|
|
|
|
# non-wanted residues. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# * Since this takes some time, I presorted them and saved them. |
258
|
|
|
|
|
|
|
# Reading them from a file takes a fraction of the time that it taes |
259
|
|
|
|
|
|
|
# to re-sort them every time the application is launched. |
260
|
|
|
|
|
|
|
|
261
|
6
|
|
|
6
|
|
7
|
my $aas_to_avoid = shift; |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# Initialize reusable variables if it's the first time the sub |
264
|
|
|
|
|
|
|
# is called. |
265
|
6
|
100
|
|
|
|
14
|
unless (@codon_library) { |
266
|
2
|
|
|
|
|
6
|
while () { chomp; push @codon_library, split ' ', $_ } |
|
356
|
|
|
|
|
214
|
|
|
356
|
|
|
|
|
1863
|
|
267
|
|
|
|
|
|
|
} |
268
|
6
|
100
|
|
|
|
13
|
unless ($ct) { $ct = Bio::Tools::CodonTable->new; } |
|
2
|
|
|
|
|
8
|
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# Reverse translate the unwanted aminoacids to unwanted codons. |
271
|
6
|
|
|
|
|
4
|
my @unwanted_codons; |
272
|
6
|
|
|
|
|
12
|
foreach my $aa ( split '', $aas_to_avoid ) { |
273
|
18
|
|
|
|
|
29
|
push @unwanted_codons, $ct->revtranslate($aa); |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
6
|
|
|
|
|
7
|
foreach my $degenerate_codon (@codon_library) { |
277
|
300
|
|
|
|
|
336
|
my @codons = _expand_codon($degenerate_codon); |
278
|
300
|
|
|
|
|
402
|
my $success = 1; |
279
|
|
|
|
|
|
|
|
280
|
300
|
|
|
|
|
233
|
foreach my $unwanted (@unwanted_codons) { |
281
|
3000
|
100
|
|
|
|
2129
|
if ( grep { uc $unwanted eq $_ } @codons ) { |
|
118320
|
|
|
|
|
91757
|
|
282
|
1788
|
|
|
|
|
1370
|
$success = 0; |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
300
|
100
|
|
|
|
781
|
if ($success) { return $degenerate_codon } |
|
6
|
|
|
|
|
24
|
|
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
1; |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Copyright 2009 Bruno Vecchi, all rights reserved. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
299
|
|
|
|
|
|
|
under the same terms as Perl itself. |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=cut |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
__DATA__ |