line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::Seq::EncodedSeq |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Aaron Mackey |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Aaron Mackey |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::Seq::EncodedSeq - subtype of L to store DNA that encodes a protein |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
$obj = Bio::Seq::EncodedSeq->new( -seq => $dna, |
21
|
|
|
|
|
|
|
-encoding => "CCCCCCCIIIIICCCCC", |
22
|
|
|
|
|
|
|
-start => 1, |
23
|
|
|
|
|
|
|
-strand => 1, |
24
|
|
|
|
|
|
|
-length => 17 ); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# splice out (and possibly revcomp) the coding sequence |
27
|
|
|
|
|
|
|
$cds = obj->cds; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# obtain the protein translation of the sequence |
30
|
|
|
|
|
|
|
$prot = $obj->translate; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# other access/inspection routines as with Bio::LocatableSeq and |
33
|
|
|
|
|
|
|
# Bio::SeqI; note that coordinates are relative only to the DNA |
34
|
|
|
|
|
|
|
# sequence, not it's implicit encoded protein sequence. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 DESCRIPTION |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
Bio::Seq::EncodedSeq is a L |
39
|
|
|
|
|
|
|
object that holds a DNA sequence as well as information about the |
40
|
|
|
|
|
|
|
coding potential of that DNA sequence. It is meant to be useful in an |
41
|
|
|
|
|
|
|
alignment context, where the DNA may contain frameshifts, gaps and/or |
42
|
|
|
|
|
|
|
introns, or in describing the transcript of a gene. An EncodedSeq |
43
|
|
|
|
|
|
|
provides the ability to access the "spliced" coding sequence, meaning |
44
|
|
|
|
|
|
|
that all introns and gaps are removed, and any frameshifts are |
45
|
|
|
|
|
|
|
adjusted to provide a "clean" CDS. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
In order to make simultaneous use of either the DNA or the implicit |
48
|
|
|
|
|
|
|
encoded protein sequence coordinates, please see |
49
|
|
|
|
|
|
|
L. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head1 ENCODING |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
We use the term "encoding" here to refer to the series of symbols that |
54
|
|
|
|
|
|
|
we use to identify which residues of a DNA sequence are protein-coding |
55
|
|
|
|
|
|
|
(i.e. part of a codon), intronic, part of a 5' or 3', frameshift |
56
|
|
|
|
|
|
|
"mutations", etc. From this information, a Bio::Seq::EncodedSeq is |
57
|
|
|
|
|
|
|
able to "figure out" its translational CDS. There are two sets of |
58
|
|
|
|
|
|
|
coding characters, one termed "implicit" and one termed "explicit". |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
The "implicit" encoding is a bit simpler than the "explicit" encoding: |
61
|
|
|
|
|
|
|
'C' is used for any nucleotide that's part of a codon, 'U' for any |
62
|
|
|
|
|
|
|
UTR, etc. The full list is shown below: |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Code Meaning |
65
|
|
|
|
|
|
|
---- ------- |
66
|
|
|
|
|
|
|
C coding |
67
|
|
|
|
|
|
|
I intronic |
68
|
|
|
|
|
|
|
U untranslated |
69
|
|
|
|
|
|
|
G gapped (for use in alignments) |
70
|
|
|
|
|
|
|
F a "forward", +1 frameshift |
71
|
|
|
|
|
|
|
B a "backward", -1 frameshift |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
The "explicit" encoding is just an expansion of the "implicit" |
74
|
|
|
|
|
|
|
encoding, to denote phase: |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
Code Meaning |
77
|
|
|
|
|
|
|
---- ------- |
78
|
|
|
|
|
|
|
C coding, 1st codon position |
79
|
|
|
|
|
|
|
D coding, 2nd codon position |
80
|
|
|
|
|
|
|
E coding, 3rd codon position |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
I intronic, phase 0 (relative to intron begin) |
83
|
|
|
|
|
|
|
J intronic, phase 1 |
84
|
|
|
|
|
|
|
K intronic, phase 2 |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
U untranslated 3'UTR |
87
|
|
|
|
|
|
|
V untranslated 5'UTR |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
G gapped (for use in alignments) |
90
|
|
|
|
|
|
|
F a "forward", +1 frameshift |
91
|
|
|
|
|
|
|
B a "backward", -1 frameshift |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Note that the explicit coding is meant to provide easy access to |
94
|
|
|
|
|
|
|
position/phase specific nucleotides: |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
$obj = Bio::Seq::EncodedSeq->new(-seq => "ACAATCAGACTACG...", |
97
|
|
|
|
|
|
|
-encoding => "CCCCCCIII..." |
98
|
|
|
|
|
|
|
); |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# fetch arrays of nucleotides at each codon position: |
101
|
|
|
|
|
|
|
my @pos1 = $obj->dnaseq(encoding => 'C', explicit => 1); |
102
|
|
|
|
|
|
|
my @pos2 = $obj->dnaseq(encoding => 'D'); |
103
|
|
|
|
|
|
|
my @pos3 = $obj->dnaseq(encoding => 'E'); |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# fetch arrays of "3-1" codon dinucleotides, useful for genomic |
106
|
|
|
|
|
|
|
# signature analyses without compounding influences of codon bias: |
107
|
|
|
|
|
|
|
my @pairs = $obj->dnaseq(encoding => 'EC'); |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head1 FEEDBACK |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 Mailing Lists |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
114
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
115
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
118
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head2 Support |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
I |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
127
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
128
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
129
|
|
|
|
|
|
|
with code and data examples if at all possible. |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=head2 Reporting Bugs |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
134
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
135
|
|
|
|
|
|
|
web: |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=head1 AUTHOR - Aaron Mackey |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Email amackey@virginia.edu |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head1 APPENDIX |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
146
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=cut |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
package Bio::Seq::EncodedSeq; |
153
|
|
|
|
|
|
|
|
154
|
1
|
|
|
1
|
|
435
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
155
|
|
|
|
|
|
|
|
156
|
1
|
|
|
1
|
|
3
|
use base qw(Bio::LocatableSeq); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
305
|
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head2 new |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Title : new |
162
|
|
|
|
|
|
|
Usage : $obj = Bio::Seq::EncodedSeq->new(-seq => "AGTACGTGTCATG", |
163
|
|
|
|
|
|
|
-encoding => "CCCCCCFCCCCCC", |
164
|
|
|
|
|
|
|
-id => "myseq", |
165
|
|
|
|
|
|
|
-start => 1, |
166
|
|
|
|
|
|
|
-end => 13, |
167
|
|
|
|
|
|
|
-strand => 1 |
168
|
|
|
|
|
|
|
); |
169
|
|
|
|
|
|
|
Function: creates a new Bio::Seq::EncodedSeq object from a supplied DNA |
170
|
|
|
|
|
|
|
sequence |
171
|
|
|
|
|
|
|
Returns : a new Bio::Seq::EncodedSeq object |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
Args : seq - primary nucleotide sequence used to encode the |
174
|
|
|
|
|
|
|
protein; note that any positions involved in a |
175
|
|
|
|
|
|
|
gap ('G') or backward frameshift ('B') should |
176
|
|
|
|
|
|
|
have one or more gap characters; if the encoding |
177
|
|
|
|
|
|
|
specifies G or B, but no (or not enough) gap |
178
|
|
|
|
|
|
|
characters exist, *they'll be added*; similary, |
179
|
|
|
|
|
|
|
if there are gap characters without a |
180
|
|
|
|
|
|
|
corresponding G or B encoding, G's will be |
181
|
|
|
|
|
|
|
inserted into the encoding. This allows some |
182
|
|
|
|
|
|
|
flexibility in specifying your sequence and |
183
|
|
|
|
|
|
|
coding without having to calculate a lot of the |
184
|
|
|
|
|
|
|
encoding for yourself. |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
encoding - a string of characters (see Encoding Table) |
187
|
|
|
|
|
|
|
describing backwards frameshifts implied by the |
188
|
|
|
|
|
|
|
encoding but not present in the sequence will be |
189
|
|
|
|
|
|
|
added (as '-'s) to the sequence. If not |
190
|
|
|
|
|
|
|
supplied, it will be assumed that all positions |
191
|
|
|
|
|
|
|
are coding (C). Encoding may include either |
192
|
|
|
|
|
|
|
implicit phase encoding characters (i.e. "CCC") |
193
|
|
|
|
|
|
|
and/or explicit encoding characters (i.e. "CDE"). |
194
|
|
|
|
|
|
|
Additionally, prefixed numbers may be used to |
195
|
|
|
|
|
|
|
denote repetition (i.e. "27C3I28C"). |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Alternatively, encoding may be a hashref |
198
|
|
|
|
|
|
|
datastructure, with encoding characters as keys |
199
|
|
|
|
|
|
|
and Bio::LocationI objects (or arrayrefs of |
200
|
|
|
|
|
|
|
Bio::LocationI objects) as values, e.g.: |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
{ C => [ Bio::Location::Simple->new(1,9), |
203
|
|
|
|
|
|
|
Bio::Location::Simple->new(11,13) ], |
204
|
|
|
|
|
|
|
F => Bio::Location::Simple->new(10,10), |
205
|
|
|
|
|
|
|
} # same as "CCCCCCCCCFCCC" |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
Note that if the location ranges overlap, the |
208
|
|
|
|
|
|
|
behavior of the encoding will be undefined |
209
|
|
|
|
|
|
|
(well, it will be defined, but only according to |
210
|
|
|
|
|
|
|
the order in which the hash keys are read, which |
211
|
|
|
|
|
|
|
is basically undefined ... just don't do that). |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
id, start, end, strand - as with Bio::LocatableSeq; note |
214
|
|
|
|
|
|
|
that the coordinates are relative to the |
215
|
|
|
|
|
|
|
encoding DNA sequence, not the implicit protein |
216
|
|
|
|
|
|
|
sequence. If strand is reversed, then the |
217
|
|
|
|
|
|
|
encoding is assumed to be relative to the |
218
|
|
|
|
|
|
|
reverse strand as well. |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=cut |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub new { |
223
|
4
|
|
|
4
|
1
|
103
|
my ($self, @args) = @_; |
224
|
4
|
|
|
|
|
18
|
$self = $self->SUPER::new(@args, -alphabet => 'dna'); |
225
|
4
|
|
|
|
|
13
|
my ($enc) = $self->_rearrange([qw(ENCODING)], @args); |
226
|
|
|
|
|
|
|
# set the real encoding: |
227
|
4
|
50
|
|
|
|
7
|
if ($enc) { |
228
|
0
|
|
|
|
|
0
|
$self->encoding($enc); |
229
|
|
|
|
|
|
|
} else { |
230
|
4
|
|
|
|
|
9
|
$self->_recheck_encoding; |
231
|
|
|
|
|
|
|
} |
232
|
4
|
|
|
|
|
26
|
return $self; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=head2 encoding |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Title : encoding |
239
|
|
|
|
|
|
|
Usage : $obj->encoding("CCCCCC"); |
240
|
|
|
|
|
|
|
$obj->encoding( -encoding => { I => $location } ); |
241
|
|
|
|
|
|
|
$enc = $obj->encoding(-explicit => 1); |
242
|
|
|
|
|
|
|
$enc = $obj->encoding("CCCCCC", -explicit => 1); |
243
|
|
|
|
|
|
|
$enc = $obj->encoding(-location => $location, |
244
|
|
|
|
|
|
|
-explicit => 1, |
245
|
|
|
|
|
|
|
-absolute => 1 ); |
246
|
|
|
|
|
|
|
Function: get/set the objects encoding, either globally or by location(s). |
247
|
|
|
|
|
|
|
Returns : the (possibly new) encoding string. |
248
|
|
|
|
|
|
|
Args : encoding - see the encoding argument to the new() function. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
explicit - whether or not to return explicit phase |
251
|
|
|
|
|
|
|
information in the coding (i.e. "CCC" becomes |
252
|
|
|
|
|
|
|
"CDE", "III" becomes "IJK", etc); defaults to 0. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
location - optional; location to get/set the encoding. |
255
|
|
|
|
|
|
|
Defaults to the entire sequence. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
absolute - whether or not the locational elements (either |
258
|
|
|
|
|
|
|
in the encoding hashref or the location |
259
|
|
|
|
|
|
|
argument) are relative to the absolute start/end |
260
|
|
|
|
|
|
|
of the Bio::LocatableSeq, or to the internal, |
261
|
|
|
|
|
|
|
relative coordinate system (beginning at 1); |
262
|
|
|
|
|
|
|
defaults to 0 (i.e. relative) |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=cut |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
sub encoding { |
267
|
8
|
|
|
8
|
1
|
15
|
my ($self, @args) = @_; |
268
|
8
|
|
|
|
|
27
|
my ($enc, $loc, $exp, $abs) = $self->_rearrange([qw(ENCODING LOCATION EXPLICIT ABSOLUTE)], @args); |
269
|
|
|
|
|
|
|
|
270
|
8
|
100
|
|
|
|
25
|
if (!$enc) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# do nothing; _recheck_encoding will fix for us, if necessary |
272
|
|
|
|
|
|
|
} elsif (ref $enc eq 'HASH') { |
273
|
0
|
|
|
|
|
0
|
$self->throw( -class => 'Bio::Root::NotImplemented', |
274
|
|
|
|
|
|
|
-text => "Hashref functionality not yet implemented;\nplease email me if you really need this."); |
275
|
|
|
|
|
|
|
#TODO: finish all this |
276
|
0
|
|
|
|
|
0
|
while (my ($char, $locs) = each %$enc) { |
277
|
0
|
0
|
|
|
|
0
|
if (ref $locs eq 'ARRAY') { |
|
|
0
|
|
|
|
|
|
278
|
|
|
|
|
|
|
} elsif (UNIVERSAL::isa($locs, "Bio::LocationI")) { |
279
|
|
|
|
|
|
|
} else { |
280
|
0
|
|
|
|
|
0
|
$self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}"); |
|
0
|
|
|
|
|
0
|
|
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} elsif (! ref $enc) { |
284
|
4
|
|
|
|
|
5
|
$enc = uc $enc; |
285
|
4
|
100
|
66
|
|
|
20
|
$exp = 1 if (!defined $exp && $enc =~ m/[DEJKV]/o); |
286
|
|
|
|
|
|
|
|
287
|
4
|
100
|
|
|
|
9
|
if ($enc =~ m/\d/o) { # numerically "enhanced" encoding |
288
|
1
|
|
|
|
|
1
|
my $numenc = $enc; |
289
|
1
|
|
|
|
|
1
|
$enc = ''; |
290
|
1
|
|
|
|
|
5
|
while ($numenc =~ m/\G(\d*)([CDEIJKUVGFB])/g) { |
291
|
4
|
|
|
|
|
6
|
my ($num, $char) = ($1, $2); |
292
|
4
|
100
|
|
|
|
6
|
$num = 1 unless $num; |
293
|
4
|
|
|
|
|
12
|
$enc .= $char x $num; |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
4
|
50
|
66
|
|
|
26
|
if (defined $exp && $exp == 0 && $enc =~ m/([^CIUGFB])/) { |
|
|
50
|
33
|
|
|
|
|
298
|
0
|
|
|
|
|
0
|
$self->throw("Unrecognized character '$1' in implicit encoding"); |
299
|
|
|
|
|
|
|
} elsif ($enc =~ m/[^CDEIJKUVGFB]/) { |
300
|
0
|
|
|
|
|
0
|
$self->throw("Unrecognized character '$1' in explicit encoding"); |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
|
303
|
4
|
100
|
|
|
|
7
|
if ($loc) { # a global location, over which to apply the specified encoding. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# balk if too many non-gap characters present in encoding: |
306
|
1
|
|
|
|
|
2
|
my ($ct) = $enc =~ tr/GB/GB/; |
307
|
1
|
|
|
|
|
2
|
$ct = length($enc) - $ct; |
308
|
1
|
50
|
33
|
|
|
4
|
$self->throw("Location length doesn't match number of coding chars in encoding!") |
309
|
|
|
|
|
|
|
if ($loc->location_type eq 'EXACT' && $loc->length != $ct); |
310
|
|
|
|
|
|
|
|
311
|
1
|
|
|
|
|
2
|
my $start = $loc->start; |
312
|
1
|
|
|
|
|
14
|
my $end = $loc->end; |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
# strip any encoding that hangs off the ends of the sequence: |
315
|
1
|
50
|
|
|
|
3
|
if ($start < $self->start) { |
316
|
0
|
|
|
|
|
0
|
my $diff = $self->start - $start; |
317
|
0
|
|
|
|
|
0
|
$start = $self->start; |
318
|
0
|
|
|
|
|
0
|
$enc = substr($enc, $diff); |
319
|
|
|
|
|
|
|
} |
320
|
1
|
50
|
|
|
|
2
|
if ($end > $self->end) { |
321
|
0
|
|
|
|
|
0
|
my $diff = $end - $self->end; |
322
|
0
|
|
|
|
|
0
|
$end = $self->end; |
323
|
0
|
|
|
|
|
0
|
$enc = substr($enc, -$diff); |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
1
|
|
|
|
|
2
|
my $currenc = $self->{_encoding}; |
327
|
1
|
|
|
|
|
2
|
my $currseq = $self->seq; |
328
|
|
|
|
|
|
|
|
329
|
1
|
|
|
|
|
2
|
my ($spanstart, $spanend) = ($self->column_from_residue_number($start), |
330
|
|
|
|
|
|
|
$self->column_from_residue_number($end) ); |
331
|
|
|
|
|
|
|
|
332
|
1
|
50
|
|
|
|
4
|
if ($currseq) { |
333
|
|
|
|
|
|
|
# strip any gaps in sequence spanned by this location: |
334
|
1
|
50
|
|
|
|
2
|
($spanstart, $spanend) = ($spanend, $spanstart) if $self->strand < 0; |
335
|
1
|
50
|
|
|
|
2
|
my ($before, $in, $after) = $currseq =~ m/(.{@{[ $spanstart - ($loc->location_type eq 'IN-BETWEEN' ? 0 : 1) ]}}) |
|
1
|
|
|
|
|
2
|
|
336
|
1
|
50
|
|
|
|
2
|
(.{@{[ $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1) ]}}) |
337
|
|
|
|
|
|
|
(.*) |
338
|
|
|
|
|
|
|
/x; |
339
|
1
|
|
50
|
|
|
5
|
$in ||= ''; |
340
|
1
|
|
|
|
|
1
|
$in =~ s/[\.\-]+//g; |
341
|
1
|
|
50
|
|
|
6
|
$currseq = ($before||'') . $in. ($after||''); |
|
|
|
50
|
|
|
|
|
342
|
|
|
|
|
|
|
# change seq without changing the alphabet |
343
|
1
|
|
|
|
|
2
|
$self->seq($currseq,$self->alphabet()); |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
1
|
50
|
|
|
|
2
|
$currenc = reverse $currenc if $self->strand < 0; |
347
|
1
|
50
|
|
|
|
3
|
substr($currenc, $spanstart, $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1), |
|
|
50
|
|
|
|
|
|
348
|
|
|
|
|
|
|
$self->strand >= 0 ? $enc : reverse $enc); |
349
|
1
|
50
|
|
|
|
2
|
$currenc = reverse $currenc if $self->strand < 0; |
350
|
|
|
|
|
|
|
|
351
|
1
|
|
|
|
|
1
|
$self->{_encoding} = $currenc; |
352
|
1
|
|
|
|
|
3
|
$self->_recheck_encoding; |
353
|
|
|
|
|
|
|
|
354
|
1
|
|
|
|
|
2
|
$currenc = $self->{_encoding}; |
355
|
1
|
50
|
|
|
|
2
|
$currenc = reverse $currenc if $self->strand < 0; |
356
|
1
|
|
|
|
|
2
|
$enc = substr($currenc, $spanstart, length $enc); |
357
|
1
|
50
|
|
|
|
2
|
$enc = reverse $enc if $self->strand < 0; |
358
|
|
|
|
|
|
|
|
359
|
1
|
50
|
|
|
|
3
|
return $exp ? $enc: $self->_convert2implicit($enc); |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
} else { |
362
|
|
|
|
|
|
|
# presume a global redefinition; strip any current gap |
363
|
|
|
|
|
|
|
# characters in the sequence so they don't corrupt the |
364
|
|
|
|
|
|
|
# encoding |
365
|
3
|
|
|
|
|
7
|
my $dna = $self->seq; |
366
|
3
|
|
|
|
|
10
|
$dna =~ s/[\.\-]//g; |
367
|
3
|
|
|
|
|
7
|
$self->seq($dna, 'dna'); |
368
|
3
|
|
|
|
|
5
|
$self->{_encoding} = $enc; |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
} else { |
371
|
0
|
|
|
|
|
0
|
$self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}"); |
|
0
|
|
|
|
|
0
|
|
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
|
374
|
7
|
|
|
|
|
12
|
$self->_recheck_encoding(); |
375
|
|
|
|
|
|
|
|
376
|
7
|
100
|
|
|
|
20
|
return $exp ? $self->{_encoding} : $self->_convert2implicit($self->{_encoding}); |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
sub _convert2implicit { |
381
|
8
|
|
|
8
|
|
11
|
my ($self, $enc) = @_; |
382
|
|
|
|
|
|
|
|
383
|
8
|
|
|
|
|
26
|
$enc =~ s/[DE]/C/g; |
384
|
8
|
|
|
|
|
9
|
$enc =~ s/[JK]/I/g; |
385
|
8
|
|
|
|
|
9
|
$enc =~ s/V/U/g; |
386
|
|
|
|
|
|
|
|
387
|
8
|
|
|
|
|
29
|
return $enc; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
sub _recheck_encoding { |
392
|
|
|
|
|
|
|
|
393
|
12
|
|
|
12
|
|
11
|
my $self = shift; |
394
|
|
|
|
|
|
|
|
395
|
12
|
|
100
|
|
|
45
|
my @enc = split //, ($self->{_encoding} || ''); |
396
|
|
|
|
|
|
|
|
397
|
12
|
|
|
|
|
27
|
my @nt = split(//, $self->SUPER::seq); |
398
|
12
|
100
|
66
|
|
|
25
|
@nt = reverse @nt if $self->strand && $self->strand < 0; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# make sure an encoding exists! |
401
|
12
|
100
|
|
|
|
23
|
@enc = ('C') x scalar grep { !/[\.\-]/o } @nt |
|
31
|
|
|
|
|
52
|
|
402
|
|
|
|
|
|
|
unless @enc; |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# check for gaps to be truly present in the sequence |
405
|
|
|
|
|
|
|
# and vice versa |
406
|
12
|
|
|
|
|
11
|
my $i; |
407
|
12
|
|
100
|
|
|
42
|
for ($i = 0 ; $i < @nt && $i < @enc ; $i++) { |
408
|
101
|
100
|
100
|
|
|
558
|
if ($nt[$i] =~ /[\.\-]/o && $enc[$i] !~ m/[GB]/o) { |
|
|
100
|
100
|
|
|
|
|
409
|
5
|
|
|
|
|
19
|
splice(@enc, $i, 0, 'G'); |
410
|
|
|
|
|
|
|
} elsif ($nt[$i] !~ /[\.\-]/o && $enc[$i] =~ m/[GB]/o) { |
411
|
10
|
|
|
|
|
34
|
splice(@nt, $i, 0, '-'); |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
} |
414
|
12
|
50
|
|
|
|
28
|
if ($i < @enc) { |
|
|
100
|
|
|
|
|
|
415
|
|
|
|
|
|
|
# extra encoding; presumably all gaps? |
416
|
0
|
|
|
|
|
0
|
for ( ; $i < @enc ; $i++) { |
417
|
0
|
0
|
|
|
|
0
|
if ($enc[$i] =~ m/[GB]/o) { |
418
|
0
|
|
|
|
|
0
|
push @nt, '-'; |
419
|
|
|
|
|
|
|
} else { |
420
|
0
|
|
|
|
|
0
|
$self->throw("Extraneous encoding info: " . join('', @enc[$i..$#enc])); |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
} |
423
|
|
|
|
|
|
|
} elsif ($i < @nt) { |
424
|
3
|
|
|
|
|
4
|
for ( ; $i < @nt ; $i++) { |
425
|
8
|
100
|
|
|
|
10
|
if ($nt[$i] =~ m/[\.\-]/o) { |
426
|
2
|
|
|
|
|
6
|
push @enc, 'G'; |
427
|
|
|
|
|
|
|
} else { |
428
|
6
|
|
|
|
|
9
|
push @enc, 'C'; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
} |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
12
|
|
|
|
|
15
|
my @cde_array = qw(C D E); |
434
|
12
|
|
|
|
|
13
|
my @ijk_array = qw(I J K); |
435
|
|
|
|
|
|
|
# convert any leftover implicit coding into explicit coding |
436
|
12
|
|
|
|
|
14
|
my ($Cct, $Ict, $Uct, $Vct, $Vwarned) = (0, 0, 0, 0); |
437
|
12
|
|
|
|
|
18
|
for ($i = 0 ; $i < @enc ; $i++) { |
438
|
109
|
100
|
|
|
|
251
|
if ($enc[$i] =~ m/[CDE]/o) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
439
|
68
|
|
|
|
|
61
|
my $temp_index = $Cct %3; |
440
|
68
|
|
|
|
|
50
|
$enc[$i] = $cde_array[$temp_index]; |
441
|
68
|
|
|
|
|
40
|
$Cct++; $Ict = 0; $Uct = 1; |
|
68
|
|
|
|
|
43
|
|
|
68
|
|
|
|
|
41
|
|
442
|
68
|
50
|
33
|
|
|
150
|
$self->warn("3' untranslated encoding (V) seen prior to other coding symbols") |
443
|
|
|
|
|
|
|
if ($Vct && !$Vwarned++); |
444
|
|
|
|
|
|
|
} elsif ($enc[$i] =~ m/[IJK]/o) { |
445
|
0
|
|
|
|
|
0
|
$enc[$i] = $ijk_array[$Ict % 3]; |
446
|
0
|
|
|
|
|
0
|
$Ict++; $Uct = 1; |
|
0
|
|
|
|
|
0
|
|
447
|
0
|
0
|
0
|
|
|
0
|
$self->warn("3' untranslated encoding (V) seen before other coding symbols") |
448
|
|
|
|
|
|
|
if ($Vct && !$Vwarned++); |
449
|
|
|
|
|
|
|
} elsif ($enc[$i] =~ m/[UV]/o) { |
450
|
0
|
0
|
|
|
|
0
|
if ($Uct == 1) { |
451
|
0
|
|
|
|
|
0
|
$enc[$i] = 'V'; |
452
|
0
|
|
|
|
|
0
|
$Vct = 1; |
453
|
|
|
|
|
|
|
} |
454
|
|
|
|
|
|
|
} elsif ($enc[$i] eq 'B') { |
455
|
4
|
|
|
|
|
4
|
$Cct++; $Ict++ |
|
4
|
|
|
|
|
4
|
|
456
|
|
|
|
|
|
|
} elsif ($enc[$i] eq 'G') { |
457
|
|
|
|
|
|
|
# gap; leave alone |
458
|
|
|
|
|
|
|
} |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
12
|
100
|
66
|
|
|
22
|
@nt = reverse @nt if $self->strand && $self->strand < 0; |
462
|
12
|
|
|
|
|
39
|
$self->seq(join('', @nt), 'dna'); |
463
|
|
|
|
|
|
|
|
464
|
12
|
|
|
|
|
43
|
$self->{_encoding} = join '', @enc; |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=head2 cds |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Title : cds |
471
|
|
|
|
|
|
|
Usage : $cds = $obj->cds(-nogaps => 1); |
472
|
|
|
|
|
|
|
Function: obtain the "spliced" DNA sequence, by removing any |
473
|
|
|
|
|
|
|
nucleotides that participate in an UTR, forward frameshift |
474
|
|
|
|
|
|
|
or intron, and replacing any unknown nucleotide implied by |
475
|
|
|
|
|
|
|
a backward frameshift or gap with N's. |
476
|
|
|
|
|
|
|
Returns : a Bio::Seq::EncodedSeq object, with an encoding consisting only |
477
|
|
|
|
|
|
|
of "CCCC..". |
478
|
|
|
|
|
|
|
Args : nogaps - strip any gap characters (resulting from 'G' or 'B' |
479
|
|
|
|
|
|
|
encodings), rather than replacing them with N's. |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=cut |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
sub cds { |
484
|
2
|
|
|
2
|
1
|
5
|
my ($self, @args) = @_; |
485
|
|
|
|
|
|
|
|
486
|
2
|
|
|
|
|
9
|
my ($nogaps, $loc) = $self->_rearrange([qw(NOGAPS LOCATION)], @args); |
487
|
2
|
50
|
|
|
|
7
|
$nogaps = 0 unless defined $nogaps; |
488
|
|
|
|
|
|
|
|
489
|
2
|
50
|
|
|
|
5
|
my @nt = split //, $self->strand < 0 ? $self->revcom->seq : $self->seq; |
490
|
2
|
|
|
|
|
4
|
my @enc = split //, $self->_convert2implicit($self->{_encoding}); |
491
|
|
|
|
|
|
|
|
492
|
2
|
|
|
|
|
3
|
my ($start, $end) = (0, scalar @nt); |
493
|
|
|
|
|
|
|
|
494
|
2
|
50
|
|
|
|
4
|
if ($loc) { |
495
|
0
|
|
|
|
|
0
|
$start = $loc->start; |
496
|
0
|
0
|
|
|
|
0
|
$start++ if $loc->location_type eq 'IN-BETWEEN'; |
497
|
0
|
|
|
|
|
0
|
$start = $self->column_from_residue_number($start); |
498
|
0
|
|
|
|
|
0
|
$start--; |
499
|
|
|
|
|
|
|
|
500
|
0
|
|
|
|
|
0
|
$end = $loc->end; |
501
|
0
|
|
|
|
|
0
|
$end = $self->column_from_residue_number($end); |
502
|
|
|
|
|
|
|
|
503
|
0
|
0
|
|
|
|
0
|
($start, $end) = ($end, $start) if $self->strand < 0; |
504
|
0
|
|
|
|
|
0
|
$start--; |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
|
507
|
2
|
|
|
|
|
5
|
for (my $i = $start ; $i < $end ; $i++) { |
508
|
20
|
50
|
33
|
|
|
114
|
if ($enc[$i] eq 'I' || $enc[$i] eq 'U' || $enc[$i] eq 'F') { |
|
|
100
|
33
|
|
|
|
|
|
|
|
100
|
|
|
|
|
509
|
|
|
|
|
|
|
# remove introns, untranslated and forward frameshift nucleotides |
510
|
0
|
|
|
|
|
0
|
$nt[$i] = undef; |
511
|
|
|
|
|
|
|
} elsif ($enc[$i] eq 'G' || $enc[$i] eq 'B') { |
512
|
|
|
|
|
|
|
# replace gaps and backward frameshifts with N's, unless asked not to. |
513
|
8
|
50
|
|
|
|
16
|
$nt[$i] = $nogaps ? undef : 'N'; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
return ($self->can_call_new ? ref($self) : __PACKAGE__)->new( |
518
|
2
|
50
|
|
|
|
8
|
-seq => join('', grep { defined } @nt[$start..--$end]), |
|
20
|
|
|
|
|
21
|
|
519
|
|
|
|
|
|
|
-start => $self->start, |
520
|
|
|
|
|
|
|
-end => $self->end, |
521
|
|
|
|
|
|
|
-strand => 1, |
522
|
|
|
|
|
|
|
-alphabet => 'dna' ); |
523
|
|
|
|
|
|
|
} |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=head2 translate |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
Title : translate |
529
|
|
|
|
|
|
|
Usage : $prot = $obj->translate(@args); |
530
|
|
|
|
|
|
|
Function: obtain the protein sequence encoded by the underlying DNA |
531
|
|
|
|
|
|
|
sequence; same as $obj->cds()->translate(@args). |
532
|
|
|
|
|
|
|
Returns : a Bio::PrimarySeq object. |
533
|
|
|
|
|
|
|
Args : same as the translate() function of Bio::PrimarySeqI |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=cut |
536
|
|
|
|
|
|
|
|
537
|
1
|
|
|
1
|
1
|
5
|
sub translate { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_) }; |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
=head2 protseq |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
Title : seq |
543
|
|
|
|
|
|
|
Usage : $protseq = $obj->protseq(); |
544
|
|
|
|
|
|
|
Function: obtain the raw protein sequence encoded by the underlying |
545
|
|
|
|
|
|
|
DNA sequence; This is the same as calling |
546
|
|
|
|
|
|
|
$obj->translate()->seq(); |
547
|
|
|
|
|
|
|
Returns : a string of single-letter amino acid codes |
548
|
|
|
|
|
|
|
Args : same as the seq() function of Bio::PrimarySeq; note that this |
549
|
|
|
|
|
|
|
function may not be used to set the protein sequence; see |
550
|
|
|
|
|
|
|
the dnaseq() function for that. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
=cut |
553
|
|
|
|
|
|
|
|
554
|
0
|
|
|
0
|
1
|
0
|
sub protseq { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_)->seq }; |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
=head2 dnaseq |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
Title : dnaseq |
560
|
|
|
|
|
|
|
Usage : $dnaseq = $obj->dnaseq(); |
561
|
|
|
|
|
|
|
$obj->dnaseq("ACGTGTCGT", "CCCCCCCCC"); |
562
|
|
|
|
|
|
|
$obj->dnaseq(-seq => "ATG", |
563
|
|
|
|
|
|
|
-encoding => "CCC", |
564
|
|
|
|
|
|
|
-location => $loc ); |
565
|
|
|
|
|
|
|
@introns = $obj->$dnaseq(-encoding => 'I') |
566
|
|
|
|
|
|
|
Function: get/set the underlying DNA sequence; will overwrite any |
567
|
|
|
|
|
|
|
current DNA and/or encoding information present. |
568
|
|
|
|
|
|
|
Returns : a string of single-letter nucleotide codes, including any |
569
|
|
|
|
|
|
|
gaps implied by the encoding. |
570
|
|
|
|
|
|
|
Args : seq - the DNA sequence to be used as a replacement |
571
|
|
|
|
|
|
|
encoding - the encoding of the DNA sequence (see the new() |
572
|
|
|
|
|
|
|
constructor); defaults to all 'C' if setting a |
573
|
|
|
|
|
|
|
new DNA sequence. If no new DNA sequence is |
574
|
|
|
|
|
|
|
being provided, then the encoding is used as a |
575
|
|
|
|
|
|
|
"filter" for which to return fragments of |
576
|
|
|
|
|
|
|
non-overlapping DNA that match the encoding. |
577
|
|
|
|
|
|
|
location - optional, the location of the DNA sequence to |
578
|
|
|
|
|
|
|
get/set; defaults to the entire sequence. |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
=cut |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
sub dnaseq { |
583
|
0
|
|
|
0
|
1
|
0
|
my ($self, @args) = @_; |
584
|
0
|
|
|
|
|
0
|
my ($seq, $enc, $loc) = $self->_rearrange([qw(DNASEQ ENCODING LOCATION)], @args); |
585
|
0
|
|
|
|
|
0
|
return $self; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
# need to overload this so that we truncate both the seq and the encoding! |
590
|
|
|
|
|
|
|
sub trunc { |
591
|
1
|
|
|
1
|
1
|
1
|
my ($self, $start, $end) = @_; |
592
|
1
|
|
|
|
|
6
|
my $new = $self->SUPER::trunc($start, $end); |
593
|
1
|
|
|
|
|
1
|
$start--; |
594
|
1
|
|
|
|
|
2
|
my $enc = $self->{_encoding}; |
595
|
1
|
50
|
|
|
|
3
|
$enc = reverse $enc if $self->strand < 0; |
596
|
1
|
|
|
|
|
2
|
$enc = substr($enc, $start, $end - $start); |
597
|
1
|
50
|
|
|
|
3
|
$enc = reverse $enc if $self->strand < 0; |
598
|
1
|
|
|
|
|
3
|
$new->encoding($enc); |
599
|
1
|
|
|
|
|
4
|
return $new; |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
1; |