line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::PrimarySeqI |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Ewan Birney |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Ewan Birney |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 NAME |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# Bio::PrimarySeqI is the interface class for sequences. |
22
|
|
|
|
|
|
|
# If you are a newcomer to bioperl, you might want to start with |
23
|
|
|
|
|
|
|
# Bio::Seq documentation. |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Test if this is a seq object |
26
|
|
|
|
|
|
|
$obj->isa("Bio::PrimarySeqI") || |
27
|
|
|
|
|
|
|
$obj->throw("$obj does not implement the Bio::PrimarySeqI interface"); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# Accessors |
30
|
|
|
|
|
|
|
$string = $obj->seq(); |
31
|
|
|
|
|
|
|
$substring = $obj->subseq(12,50); |
32
|
|
|
|
|
|
|
$display = $obj->display_id(); # for human display |
33
|
|
|
|
|
|
|
$id = $obj->primary_id(); # unique id for this object, |
34
|
|
|
|
|
|
|
# implementation defined |
35
|
|
|
|
|
|
|
$unique_key= $obj->accession_number(); # unique biological id |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# Object manipulation |
39
|
|
|
|
|
|
|
eval { |
40
|
|
|
|
|
|
|
$rev = $obj->revcom(); |
41
|
|
|
|
|
|
|
}; |
42
|
|
|
|
|
|
|
if( $@ ) { |
43
|
|
|
|
|
|
|
$obj->throw( "Could not reverse complement. ". |
44
|
|
|
|
|
|
|
"Probably not DNA. Actual exception\n$@\n" ); |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
$trunc = $obj->trunc(12,50); |
48
|
|
|
|
|
|
|
# $rev and $trunc are Bio::PrimarySeqI compliant objects |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head1 DESCRIPTION |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
This object defines an abstract interface to basic sequence |
54
|
|
|
|
|
|
|
information - for most users of the package the documentation (and |
55
|
|
|
|
|
|
|
methods) in this class are not useful - this is a developers-only |
56
|
|
|
|
|
|
|
class which defines what methods have to be implemented by other Perl |
57
|
|
|
|
|
|
|
objects to comply to the Bio::PrimarySeqI interface. Go "perldoc |
58
|
|
|
|
|
|
|
Bio::Seq" or "man Bio::Seq" for more information on the main class for |
59
|
|
|
|
|
|
|
sequences. |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
PrimarySeq is an object just for the sequence and its name(s), nothing |
62
|
|
|
|
|
|
|
more. Seq is the larger object complete with features. There is a pure |
63
|
|
|
|
|
|
|
perl implementation of this in L. If you just want to |
64
|
|
|
|
|
|
|
use L objects, then please read that module first. This |
65
|
|
|
|
|
|
|
module defines the interface, and is of more interest to people who |
66
|
|
|
|
|
|
|
want to wrap their own Perl Objects/RDBs/FileSystems etc in way that |
67
|
|
|
|
|
|
|
they "are" bioperl sequence objects, even though it is not using Perl |
68
|
|
|
|
|
|
|
to store the sequence etc. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
This interface defines what bioperl considers necessary to "be" a |
71
|
|
|
|
|
|
|
sequence, without providing an implementation of this, an |
72
|
|
|
|
|
|
|
implementation is provided in L. If you want to provide |
73
|
|
|
|
|
|
|
a Bio::PrimarySeq-compliant object which in fact wraps another |
74
|
|
|
|
|
|
|
object/database/out-of-perl experience, then this is the correct thing |
75
|
|
|
|
|
|
|
to wrap, generally by providing a wrapper class which would inherit |
76
|
|
|
|
|
|
|
from your object and this Bio::PrimarySeqI interface. The wrapper class |
77
|
|
|
|
|
|
|
then would have methods lists in the "Implementation Specific |
78
|
|
|
|
|
|
|
Functions" which would provide these methods for your object. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 FEEDBACK |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=head2 Mailing Lists |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
85
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
86
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
89
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 Support |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
I |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
98
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
99
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
100
|
|
|
|
|
|
|
with code and data examples if at all possible. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=head2 Reporting Bugs |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
105
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
106
|
|
|
|
|
|
|
web: |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head1 AUTHOR - Ewan Birney |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Email birney@ebi.ac.uk |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 APPENDIX |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
117
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
package Bio::PrimarySeqI; |
123
|
203
|
|
|
203
|
|
1329
|
use strict; |
|
203
|
|
|
|
|
384
|
|
|
203
|
|
|
|
|
5180
|
|
124
|
203
|
|
|
203
|
|
62691
|
use Bio::Tools::CodonTable; |
|
203
|
|
|
|
|
467
|
|
|
203
|
|
|
|
|
5722
|
|
125
|
|
|
|
|
|
|
|
126
|
203
|
|
|
203
|
|
1308
|
use base qw(Bio::Root::RootI); |
|
203
|
|
|
|
|
363
|
|
|
203
|
|
|
|
|
420893
|
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head1 Implementation-specific Functions |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
These functions are the ones that a specific implementation must |
132
|
|
|
|
|
|
|
define. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 seq |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Title : seq |
137
|
|
|
|
|
|
|
Usage : $string = $obj->seq() |
138
|
|
|
|
|
|
|
Function: Returns the sequence as a string of letters. The |
139
|
|
|
|
|
|
|
case of the letters is left up to the implementer. |
140
|
|
|
|
|
|
|
Suggested cases are upper case for proteins and lower case for |
141
|
|
|
|
|
|
|
DNA sequence (IUPAC standard), but implementations are suggested to |
142
|
|
|
|
|
|
|
keep an open mind about case (some users... want mixed case!) |
143
|
|
|
|
|
|
|
Returns : A scalar |
144
|
|
|
|
|
|
|
Status : Virtual |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=cut |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub seq { |
149
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
150
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented(); |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=head2 subseq |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Title : subseq |
157
|
|
|
|
|
|
|
Usage : $substring = $obj->subseq(10,40); |
158
|
|
|
|
|
|
|
Function: Returns the subseq from start to end, where the first base |
159
|
|
|
|
|
|
|
is 1 and the number is inclusive, i.e. 1-2 are the first two |
160
|
|
|
|
|
|
|
bases of the sequence. |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Start cannot be larger than end but can be equal. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
Returns : A string |
165
|
|
|
|
|
|
|
Args : |
166
|
|
|
|
|
|
|
Status : Virtual |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=cut |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub subseq { |
171
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
172
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented(); |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head2 display_id |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Title : display_id |
179
|
|
|
|
|
|
|
Usage : $id_string = $obj->display_id(); |
180
|
|
|
|
|
|
|
Function: Returns the display id, also known as the common name of the Sequence |
181
|
|
|
|
|
|
|
object. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
The semantics of this is that it is the most likely string |
184
|
|
|
|
|
|
|
to be used as an identifier of the sequence, and likely to |
185
|
|
|
|
|
|
|
have "human" readability. The id is equivalent to the ID |
186
|
|
|
|
|
|
|
field of the GenBank/EMBL databanks and the id field of the |
187
|
|
|
|
|
|
|
Swissprot/sptrembl database. In fasta format, the >(\S+) is |
188
|
|
|
|
|
|
|
presumed to be the id, though some people overload the id |
189
|
|
|
|
|
|
|
to embed other information. Bioperl does not use any |
190
|
|
|
|
|
|
|
embedded information in the ID field, and people are |
191
|
|
|
|
|
|
|
encouraged to use other mechanisms (accession field for |
192
|
|
|
|
|
|
|
example, or extending the sequence object) to solve this. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
Notice that $seq->id() maps to this function, mainly for |
195
|
|
|
|
|
|
|
legacy/convenience reasons. |
196
|
|
|
|
|
|
|
Returns : A string |
197
|
|
|
|
|
|
|
Args : None |
198
|
|
|
|
|
|
|
Status : Virtual |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=cut |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
sub display_id { |
203
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
204
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented(); |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=head2 accession_number |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
Title : accession_number |
211
|
|
|
|
|
|
|
Usage : $unique_biological_key = $obj->accession_number; |
212
|
|
|
|
|
|
|
Function: Returns the unique biological id for a sequence, commonly |
213
|
|
|
|
|
|
|
called the accession_number. For sequences from established |
214
|
|
|
|
|
|
|
databases, the implementors should try to use the correct |
215
|
|
|
|
|
|
|
accession number. Notice that primary_id() provides the |
216
|
|
|
|
|
|
|
unique id for the implementation, allowing multiple objects |
217
|
|
|
|
|
|
|
to have the same accession number in a particular implementation. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
For sequences with no accession number, this method should return |
220
|
|
|
|
|
|
|
"unknown". |
221
|
|
|
|
|
|
|
Returns : A string |
222
|
|
|
|
|
|
|
Args : None |
223
|
|
|
|
|
|
|
Status : Virtual |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=cut |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub accession_number { |
228
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
229
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented(); |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=head2 primary_id |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
Title : primary_id |
236
|
|
|
|
|
|
|
Usage : $unique_implementation_key = $obj->primary_id; |
237
|
|
|
|
|
|
|
Function: Returns the unique id for this object in this |
238
|
|
|
|
|
|
|
implementation. This allows implementations to manage their |
239
|
|
|
|
|
|
|
own object ids in a way the implementation can control |
240
|
|
|
|
|
|
|
clients can expect one id to map to one object. |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
For sequences with no accession number, this method should |
243
|
|
|
|
|
|
|
return a stringified memory location. |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
Returns : A string |
246
|
|
|
|
|
|
|
Args : None |
247
|
|
|
|
|
|
|
Status : Virtual |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=cut |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
sub primary_id { |
252
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
253
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented(); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head2 can_call_new |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Title : can_call_new |
260
|
|
|
|
|
|
|
Usage : if( $obj->can_call_new ) { |
261
|
|
|
|
|
|
|
$newobj = $obj->new( %param ); |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
Function: Can_call_new returns 1 or 0 depending |
264
|
|
|
|
|
|
|
on whether an implementation allows new |
265
|
|
|
|
|
|
|
constructor to be called. If a new constructor |
266
|
|
|
|
|
|
|
is allowed, then it should take the followed hashed |
267
|
|
|
|
|
|
|
constructor list. |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
$myobject->new( -seq => $sequence_as_string, |
270
|
|
|
|
|
|
|
-display_id => $id |
271
|
|
|
|
|
|
|
-accession_number => $accession |
272
|
|
|
|
|
|
|
-alphabet => 'dna', |
273
|
|
|
|
|
|
|
); |
274
|
|
|
|
|
|
|
Returns : 1 or 0 |
275
|
|
|
|
|
|
|
Args : |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=cut |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
sub can_call_new { |
281
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
282
|
|
|
|
|
|
|
# we default to 0 here |
283
|
0
|
|
|
|
|
0
|
return 0; |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
=head2 alphabet |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
Title : alphabet |
290
|
|
|
|
|
|
|
Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } |
291
|
|
|
|
|
|
|
Function: Returns the type of sequence being one of |
292
|
|
|
|
|
|
|
'dna', 'rna' or 'protein'. This is case sensitive. |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
This is not called "type" because this would cause |
295
|
|
|
|
|
|
|
upgrade problems from the 0.5 and earlier Seq objects. |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Returns : A string either 'dna','rna','protein'. NB - the object must |
298
|
|
|
|
|
|
|
make a call of the alphabet, if there is no alphabet specified it |
299
|
|
|
|
|
|
|
has to guess. |
300
|
|
|
|
|
|
|
Args : None |
301
|
|
|
|
|
|
|
Status : Virtual |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
=cut |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
sub alphabet { |
306
|
0
|
|
|
0
|
1
|
0
|
my ( $self ) = @_; |
307
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented(); |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=head2 moltype |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
Title : moltype |
314
|
|
|
|
|
|
|
Usage : Deprecated. Use alphabet() instead. |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
=cut |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
sub moltype { |
319
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
320
|
0
|
|
|
|
|
0
|
$self->warn("moltype: pre v1.0 method. Calling alphabet() instead..."); |
321
|
0
|
|
|
|
|
0
|
return $self->alphabet(@args); |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head1 Implementation-optional Functions |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
The following functions rely on the above functions. An |
328
|
|
|
|
|
|
|
implementing class does not need to provide these functions, as they |
329
|
|
|
|
|
|
|
will be provided by this class, but is free to override these |
330
|
|
|
|
|
|
|
functions. |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
The revcom(), trunc(), and translate() methods create new sequence |
333
|
|
|
|
|
|
|
objects. They will call new() on the class of the sequence object |
334
|
|
|
|
|
|
|
instance passed as argument, unless can_call_new() returns FALSE. In |
335
|
|
|
|
|
|
|
the latter case a Bio::PrimarySeq object will be created. Implementors |
336
|
|
|
|
|
|
|
which really want to control how objects are created (eg, for object |
337
|
|
|
|
|
|
|
persistence over a database, or objects in a CORBA framework), they |
338
|
|
|
|
|
|
|
are encouraged to override these methods |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
=head2 revcom |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
Title : revcom |
343
|
|
|
|
|
|
|
Usage : $rev = $seq->revcom() |
344
|
|
|
|
|
|
|
Function: Produces a new Bio::PrimarySeqI implementing object which |
345
|
|
|
|
|
|
|
is the reversed complement of the sequence. For protein |
346
|
|
|
|
|
|
|
sequences this throws an exception of "Sequence is a |
347
|
|
|
|
|
|
|
protein. Cannot revcom". |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
The id is the same id as the original sequence, and the |
350
|
|
|
|
|
|
|
accession number is also identical. If someone wants to |
351
|
|
|
|
|
|
|
track that this sequence has be reversed, it needs to |
352
|
|
|
|
|
|
|
define its own extensions. |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
To do an inplace edit of an object you can go: |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
$seq = $seq->revcom(); |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
This of course, causes Perl to handle the garbage |
359
|
|
|
|
|
|
|
collection of the old object, but it is roughly speaking as |
360
|
|
|
|
|
|
|
efficient as an inplace edit. |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
Returns : A new (fresh) Bio::PrimarySeqI object |
363
|
|
|
|
|
|
|
Args : None |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=cut |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
sub revcom { |
369
|
11271
|
|
|
11271
|
1
|
16428
|
my ($self) = @_; |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' |
372
|
|
|
|
|
|
|
# or 'Bio::Seq::LargeSeq', if not take advantage of |
373
|
|
|
|
|
|
|
# Bio::Root::clone to get an object copy |
374
|
11271
|
|
|
|
|
13935
|
my $out; |
375
|
11271
|
50
|
33
|
|
|
75361
|
if ( $self->isa('Bio::Seq::LargePrimarySeq') |
376
|
|
|
|
|
|
|
or $self->isa('Bio::Seq::LargeSeq') |
377
|
|
|
|
|
|
|
) { |
378
|
0
|
|
|
|
|
0
|
my ($seqclass, $opts) = $self->_setup_class; |
379
|
0
|
|
|
|
|
0
|
$out = $seqclass->new( |
380
|
|
|
|
|
|
|
-seq => $self->_revcom_from_string($self->seq, $self->alphabet), |
381
|
|
|
|
|
|
|
-is_circular => $self->is_circular, |
382
|
|
|
|
|
|
|
-display_id => $self->display_id, |
383
|
|
|
|
|
|
|
-accession_number => $self->accession_number, |
384
|
|
|
|
|
|
|
-alphabet => $self->alphabet, |
385
|
|
|
|
|
|
|
-desc => $self->desc, |
386
|
|
|
|
|
|
|
-verbose => $self->verbose, |
387
|
|
|
|
|
|
|
%$opts, |
388
|
|
|
|
|
|
|
); |
389
|
|
|
|
|
|
|
} else { |
390
|
11271
|
|
|
|
|
25905
|
$out = $self->clone; |
391
|
11271
|
|
|
|
|
22724
|
$out->seq( $out->_revcom_from_string($out->seq, $out->alphabet) ); |
392
|
|
|
|
|
|
|
} |
393
|
11271
|
|
|
|
|
24517
|
return $out; |
394
|
|
|
|
|
|
|
} |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
sub _revcom_from_string { |
398
|
11337
|
|
|
11337
|
|
18888
|
my ($self, $string, $alphabet) = @_; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# Check that reverse-complementing makes sense |
401
|
11337
|
50
|
|
|
|
19610
|
if( $alphabet eq 'protein' ) { |
402
|
0
|
|
|
|
|
0
|
$self->throw("Sequence is a protein. Cannot revcom."); |
403
|
|
|
|
|
|
|
} |
404
|
11337
|
50
|
66
|
|
|
19400
|
if( $alphabet ne 'dna' && $alphabet ne 'rna' ) { |
405
|
0
|
|
|
|
|
0
|
my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ". |
406
|
|
|
|
|
|
|
"but unsure if this is right."; |
407
|
0
|
0
|
|
|
|
0
|
if( $self->can('warn') ) { |
408
|
0
|
|
|
|
|
0
|
$self->warn($msg); |
409
|
|
|
|
|
|
|
} else { |
410
|
0
|
|
|
|
|
0
|
warn("[$self] $msg"); |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# If sequence is RNA, map to DNA (then map back later) |
415
|
11337
|
100
|
|
|
|
16784
|
if( $alphabet eq 'rna' ) { |
416
|
1
|
|
|
|
|
4
|
$string =~ tr/uU/tT/; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# Reverse-complement now |
420
|
11337
|
|
|
|
|
16325
|
$string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; |
421
|
11337
|
|
|
|
|
17617
|
$string = CORE::reverse $string; |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
# Map back RNA to DNA |
424
|
11337
|
100
|
|
|
|
17149
|
if( $alphabet eq 'rna' ) { |
425
|
1
|
|
|
|
|
2
|
$string =~ tr/tT/uU/; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
11337
|
|
|
|
|
21064
|
return $string; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=head2 trunc |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Title : trunc |
435
|
|
|
|
|
|
|
Usage : $subseq = $myseq->trunc(10,100); |
436
|
|
|
|
|
|
|
Function: Provides a truncation of a sequence. |
437
|
|
|
|
|
|
|
Returns : A fresh Bio::PrimarySeqI implementing object. |
438
|
|
|
|
|
|
|
Args : Two integers denoting first and last base of the sub-sequence. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=cut |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
sub trunc { |
444
|
324
|
|
|
324
|
1
|
570
|
my ($self,$start,$end) = @_; |
445
|
|
|
|
|
|
|
|
446
|
324
|
|
|
|
|
368
|
my $str; |
447
|
324
|
100
|
66
|
|
|
1638
|
if( defined $start && ref($start) && |
|
|
50
|
66
|
|
|
|
|
|
|
50
|
|
|
|
|
|
448
|
|
|
|
|
|
|
$start->isa('Bio::LocationI') ) { |
449
|
2
|
|
|
|
|
7
|
$str = $self->subseq($start); # start is a location actually |
450
|
|
|
|
|
|
|
} elsif( !$end ) { |
451
|
0
|
|
|
|
|
0
|
$self->throw("trunc start,end -- there was no end for $start"); |
452
|
|
|
|
|
|
|
} elsif( $end < $start ) { |
453
|
0
|
|
|
|
|
0
|
my $msg = "start [$start] is greater than end [$end]. \n". |
454
|
|
|
|
|
|
|
"If you want to truncated and reverse complement, \n". |
455
|
|
|
|
|
|
|
"you must call trunc followed by revcom. Sorry."; |
456
|
0
|
|
|
|
|
0
|
$self->throw($msg); |
457
|
|
|
|
|
|
|
} else { |
458
|
322
|
|
|
|
|
829
|
$str = $self->subseq($start,$end); |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' |
462
|
|
|
|
|
|
|
# or 'Bio::Seq::LargeSeq', if not take advantage of |
463
|
|
|
|
|
|
|
# Bio::Root::clone to get an object copy |
464
|
324
|
|
|
|
|
398
|
my $out; |
465
|
324
|
100
|
66
|
|
|
3481
|
if ( $self->isa('Bio::Seq::LargePrimarySeq') |
|
|
|
100
|
|
|
|
|
466
|
|
|
|
|
|
|
or $self->isa('Bio::Seq::LargeSeq') |
467
|
|
|
|
|
|
|
or $self->isa('Bio::Seq::RichSeq') |
468
|
|
|
|
|
|
|
) { |
469
|
8
|
|
|
|
|
33
|
my ($seqclass, $opts) = $self->_setup_class; |
470
|
8
|
|
|
|
|
37
|
$out = $seqclass->new( |
471
|
|
|
|
|
|
|
-seq => $str, |
472
|
|
|
|
|
|
|
-is_circular => $self->is_circular, |
473
|
|
|
|
|
|
|
-display_id => $self->display_id, |
474
|
|
|
|
|
|
|
-accession_number => $self->accession_number, |
475
|
|
|
|
|
|
|
-alphabet => $self->alphabet, |
476
|
|
|
|
|
|
|
-desc => $self->desc, |
477
|
|
|
|
|
|
|
-verbose => $self->verbose, |
478
|
|
|
|
|
|
|
%$opts, |
479
|
|
|
|
|
|
|
); |
480
|
|
|
|
|
|
|
} else { |
481
|
316
|
|
|
|
|
900
|
$out = $self->clone; |
482
|
316
|
|
|
|
|
778
|
$out->seq($str); |
483
|
|
|
|
|
|
|
} |
484
|
324
|
|
|
|
|
834
|
return $out; |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
=head2 translate |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
Title : translate |
491
|
|
|
|
|
|
|
Usage : $protein_seq_obj = $dna_seq_obj->translate |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
Or if you expect a complete coding sequence (CDS) translation, |
494
|
|
|
|
|
|
|
with initiator at the beginning and terminator at the end: |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
$protein_seq_obj = $cds_seq_obj->translate(-complete => 1); |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
Or if you want translate() to find the first initiation |
499
|
|
|
|
|
|
|
codon and return the corresponding protein: |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
$protein_seq_obj = $cds_seq_obj->translate(-orf => 1); |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
Function: Provides the translation of the DNA sequence using full |
504
|
|
|
|
|
|
|
IUPAC ambiguities in DNA/RNA and amino acid codes. |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
The complete CDS translation is identical to EMBL/TREMBL |
507
|
|
|
|
|
|
|
database translation. Note that the trailing terminator |
508
|
|
|
|
|
|
|
character is removed before returning the translated protein |
509
|
|
|
|
|
|
|
object. |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
Note: if you set $dna_seq_obj->verbose(1) you will get a |
512
|
|
|
|
|
|
|
warning if the first codon is not a valid initiator. |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
Returns : A Bio::PrimarySeqI implementing object |
515
|
|
|
|
|
|
|
Args : -terminator |
516
|
|
|
|
|
|
|
character for terminator, default '*' |
517
|
|
|
|
|
|
|
-unknown |
518
|
|
|
|
|
|
|
character for unknown, default 'X' |
519
|
|
|
|
|
|
|
-frame |
520
|
|
|
|
|
|
|
positive integer frame shift (in bases), default 0 |
521
|
|
|
|
|
|
|
-codontable_id |
522
|
|
|
|
|
|
|
integer codon table id, default 1 |
523
|
|
|
|
|
|
|
-complete |
524
|
|
|
|
|
|
|
boolean, if true, complete CDS is expected. default false |
525
|
|
|
|
|
|
|
-complete_codons |
526
|
|
|
|
|
|
|
boolean, if true, codons which are incomplete are translated if a |
527
|
|
|
|
|
|
|
suitable amino acid is found. For instance, if the incomplete |
528
|
|
|
|
|
|
|
codon is 'GG', the completed codon is 'GGN', which is glycine |
529
|
|
|
|
|
|
|
(G). Defaults to 'false'; setting '-complete' also makes this |
530
|
|
|
|
|
|
|
true. |
531
|
|
|
|
|
|
|
-throw |
532
|
|
|
|
|
|
|
boolean, throw exception if ORF not complete, default false |
533
|
|
|
|
|
|
|
-orf |
534
|
|
|
|
|
|
|
if 'longest', find longest ORF. other true value, find |
535
|
|
|
|
|
|
|
first ORF. default 0 |
536
|
|
|
|
|
|
|
-codontable |
537
|
|
|
|
|
|
|
optional L object to use for |
538
|
|
|
|
|
|
|
translation |
539
|
|
|
|
|
|
|
-start |
540
|
|
|
|
|
|
|
optional three-character string to force as initiation |
541
|
|
|
|
|
|
|
codon (e.g. 'atg'). If unset, start codons are |
542
|
|
|
|
|
|
|
determined by the CodonTable. Case insensitive. |
543
|
|
|
|
|
|
|
-offset |
544
|
|
|
|
|
|
|
optional positive integer offset for fuzzy locations. |
545
|
|
|
|
|
|
|
if set, must be either 1, 2, or 3 |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=head3 Notes |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
The -start argument only applies when -orf is set to 1. By default all |
550
|
|
|
|
|
|
|
initiation codons found in the given codon table are used but when |
551
|
|
|
|
|
|
|
"start" is set to some codon this codon will be used exclusively as |
552
|
|
|
|
|
|
|
the initiation codon. Note that the default codon table (NCBI |
553
|
|
|
|
|
|
|
"Standard") has 3 initiation codons! |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
By default translate() translates termination codons to the some |
556
|
|
|
|
|
|
|
character (default is *), both internal and trailing codons. Setting |
557
|
|
|
|
|
|
|
"-complete" to 1 tells translate() to remove the trailing character. |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
-offset is used for seqfeatures which contain the the \codon_start tag |
560
|
|
|
|
|
|
|
and can be set to 1, 2, or 3. This is the offset by which the |
561
|
|
|
|
|
|
|
sequence translation starts relative to the first base of the feature |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
For details on codon tables used by translate() see L. |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an |
566
|
|
|
|
|
|
|
element in an array: |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
1: character for terminator (optional), defaults to '*'. |
569
|
|
|
|
|
|
|
2: character for unknown amino acid (optional), defaults to 'X'. |
570
|
|
|
|
|
|
|
3: frame (optional), valid values are 0, 1, 2, defaults to 0. |
571
|
|
|
|
|
|
|
4: codon table id (optional), defaults to 1. |
572
|
|
|
|
|
|
|
5: complete coding sequence expected, defaults to 0 (false). |
573
|
|
|
|
|
|
|
6: boolean, throw exception if not complete coding sequence |
574
|
|
|
|
|
|
|
(true), defaults to warning (false) |
575
|
|
|
|
|
|
|
7: codontable, a custom Bio::Tools::CodonTable object (optional). |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=cut |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
sub translate { |
580
|
287
|
|
|
287
|
1
|
2273
|
my ($self,@args) = @_; |
581
|
287
|
|
|
|
|
529
|
my ($terminator, $unknown, $frame, $codonTableId, $complete, |
582
|
|
|
|
|
|
|
$complete_codons, $throw, $codonTable, $orf, $start_codon, $offset); |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
## new API with named parameters, post 1.5.1 |
585
|
287
|
100
|
100
|
|
|
774
|
if ($args[0] && $args[0] =~ /^-[A-Z]+/i) { |
586
|
24
|
|
|
|
|
109
|
($terminator, $unknown, $frame, $codonTableId, $complete, |
587
|
|
|
|
|
|
|
$complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) = |
588
|
|
|
|
|
|
|
$self->_rearrange([qw(TERMINATOR |
589
|
|
|
|
|
|
|
UNKNOWN |
590
|
|
|
|
|
|
|
FRAME |
591
|
|
|
|
|
|
|
CODONTABLE_ID |
592
|
|
|
|
|
|
|
COMPLETE |
593
|
|
|
|
|
|
|
COMPLETE_CODONS |
594
|
|
|
|
|
|
|
THROW |
595
|
|
|
|
|
|
|
CODONTABLE |
596
|
|
|
|
|
|
|
ORF |
597
|
|
|
|
|
|
|
START |
598
|
|
|
|
|
|
|
OFFSET)], @args); |
599
|
|
|
|
|
|
|
## old API, 1.5.1 and preceding versions |
600
|
|
|
|
|
|
|
} else { |
601
|
263
|
|
|
|
|
501
|
($terminator, $unknown, $frame, $codonTableId, |
602
|
|
|
|
|
|
|
$complete, $throw, $codonTable, $offset) = @args; |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
## Initialize termination codon, unknown codon, codon table id, frame |
606
|
287
|
100
|
66
|
|
|
776
|
$terminator = '*' unless (defined($terminator) and $terminator ne ''); |
607
|
287
|
100
|
66
|
|
|
652
|
$unknown = "X" unless (defined($unknown) and $unknown ne ''); |
608
|
287
|
100
|
66
|
|
|
635
|
$frame = 0 unless (defined($frame) and $frame ne ''); |
609
|
287
|
100
|
66
|
|
|
649
|
$codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne ''); |
610
|
287
|
|
100
|
|
|
1208
|
$complete_codons ||= $complete || 0; |
|
|
|
100
|
|
|
|
|
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
## Get a CodonTable, error if custom CodonTable is invalid |
613
|
287
|
100
|
|
|
|
429
|
if ($codonTable) { |
614
|
1
|
50
|
|
|
|
5
|
$self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable) |
615
|
|
|
|
|
|
|
unless $codonTable->isa('Bio::Tools::CodonTable'); |
616
|
|
|
|
|
|
|
} else { |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
# shouldn't this be cached? Seems wasteful to have a new instance |
619
|
|
|
|
|
|
|
# every time... |
620
|
286
|
|
|
|
|
1187
|
$codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId); |
621
|
|
|
|
|
|
|
} |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
## Error if alphabet is "protein" |
624
|
287
|
50
|
|
|
|
719
|
$self->throw("Can't translate an amino acid sequence.") if |
625
|
|
|
|
|
|
|
($self->alphabet =~ /protein/i); |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
## Error if -start parameter isn't a valid codon |
628
|
287
|
100
|
|
|
|
606
|
if ($start_codon) { |
629
|
1
|
50
|
|
|
|
5
|
$self->throw("Invalid start codon: $start_codon.") if |
630
|
|
|
|
|
|
|
( $start_codon !~ /^[A-Z]{3}$/i ); |
631
|
|
|
|
|
|
|
} |
632
|
|
|
|
|
|
|
|
633
|
287
|
|
|
|
|
338
|
my $seq; |
634
|
287
|
50
|
|
|
|
452
|
if ($offset) { |
635
|
0
|
0
|
|
|
|
0
|
$self->throw("Offset must be 1, 2, or 3.") if |
636
|
|
|
|
|
|
|
( $offset !~ /^[123]$/ ); |
637
|
0
|
|
|
|
|
0
|
my ($start, $end) = ($offset, $self->length); |
638
|
0
|
|
|
|
|
0
|
($seq) = $self->subseq($start, $end); |
639
|
|
|
|
|
|
|
} else { |
640
|
287
|
|
|
|
|
575
|
($seq) = $self->seq(); |
641
|
|
|
|
|
|
|
} |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
## ignore frame if an ORF is supposed to be found |
644
|
287
|
100
|
|
|
|
572
|
if ( $orf ) { |
645
|
12
|
100
|
|
|
|
40
|
my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' ); |
646
|
11
|
|
|
|
|
30
|
$seq = $self->_orf_sequence( $seq, $orf_region ); |
647
|
|
|
|
|
|
|
} else { |
648
|
|
|
|
|
|
|
## use frame, error if frame is not 0, 1 or 2 |
649
|
275
|
50
|
100
|
|
|
629
|
$self->throw("Valid values for frame are 0, 1, or 2, not $frame.") |
|
|
|
66
|
|
|
|
|
650
|
|
|
|
|
|
|
unless ($frame == 0 or $frame == 1 or $frame == 2); |
651
|
275
|
|
|
|
|
804
|
$seq = substr($seq,$frame); |
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
## Translate it |
655
|
286
|
|
|
|
|
830
|
my $output = $codonTable->translate($seq, $complete_codons); |
656
|
|
|
|
|
|
|
# Use user-input terminator/unknown |
657
|
286
|
|
|
|
|
1414
|
$output =~ s/\*/$terminator/g; |
658
|
286
|
|
|
|
|
540
|
$output =~ s/X/$unknown/g; |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
## Only if we are expecting to translate a complete coding region |
661
|
286
|
100
|
|
|
|
601
|
if ($complete) { |
662
|
9
|
|
|
|
|
21
|
my $id = $self->display_id; |
663
|
|
|
|
|
|
|
# remove the terminator character |
664
|
9
|
50
|
|
|
|
30
|
if( substr($output,-1,1) eq $terminator ) { |
665
|
9
|
|
|
|
|
17
|
chop $output; |
666
|
|
|
|
|
|
|
} else { |
667
|
0
|
0
|
|
|
|
0
|
$throw && $self->throw("Seq [$id]: Not using a valid terminator codon!"); |
668
|
0
|
|
|
|
|
0
|
$self->warn("Seq [$id]: Not using a valid terminator codon!"); |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
# test if there are terminator characters inside the protein sequence! |
671
|
9
|
100
|
|
|
|
71
|
if ($output =~ /\Q$terminator\E/) { |
672
|
2
|
|
100
|
|
|
12
|
$id ||= ''; |
673
|
2
|
50
|
|
|
|
22
|
$throw && $self->throw("Seq [$id]: Terminator codon inside CDS!"); |
674
|
0
|
|
|
|
|
0
|
$self->warn("Seq [$id]: Terminator codon inside CDS!"); |
675
|
|
|
|
|
|
|
} |
676
|
|
|
|
|
|
|
# if the initiator codon is not ATG, the amino acid needs to be changed to M |
677
|
7
|
100
|
|
|
|
18
|
if ( substr($output,0,1) ne 'M' ) { |
678
|
3
|
50
|
|
|
|
9
|
if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) { |
|
|
0
|
|
|
|
|
|
679
|
3
|
|
|
|
|
6
|
$output = 'M'. substr($output,1); |
680
|
|
|
|
|
|
|
} elsif ($throw) { |
681
|
0
|
|
|
|
|
0
|
$self->throw("Seq [$id]: Not using a valid initiator codon!"); |
682
|
|
|
|
|
|
|
} else { |
683
|
0
|
|
|
|
|
0
|
$self->warn("Seq [$id]: Not using a valid initiator codon!"); |
684
|
|
|
|
|
|
|
} |
685
|
|
|
|
|
|
|
} |
686
|
|
|
|
|
|
|
} |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' |
689
|
|
|
|
|
|
|
# or 'Bio::Seq::LargeSeq', if not take advantage of |
690
|
|
|
|
|
|
|
# Bio::Root::clone to get an object copy |
691
|
284
|
|
|
|
|
348
|
my $out; |
692
|
284
|
100
|
100
|
|
|
3112
|
if ( $self->isa('Bio::Seq::LargePrimarySeq') |
693
|
|
|
|
|
|
|
or $self->isa('Bio::Seq::LargeSeq') |
694
|
|
|
|
|
|
|
) { |
695
|
3
|
|
|
|
|
13
|
my ($seqclass, $opts) = $self->_setup_class; |
696
|
3
|
|
|
|
|
12
|
$out = $seqclass->new( |
697
|
|
|
|
|
|
|
-seq => $output, |
698
|
|
|
|
|
|
|
-is_circular => $self->is_circular, |
699
|
|
|
|
|
|
|
-display_id => $self->display_id, |
700
|
|
|
|
|
|
|
-accession_number => $self->accession_number, |
701
|
|
|
|
|
|
|
-alphabet => 'protein', |
702
|
|
|
|
|
|
|
-desc => $self->desc, |
703
|
|
|
|
|
|
|
-verbose => $self->verbose, |
704
|
|
|
|
|
|
|
%$opts, |
705
|
|
|
|
|
|
|
); |
706
|
|
|
|
|
|
|
} else { |
707
|
281
|
|
|
|
|
986
|
$out = $self->clone; |
708
|
281
|
|
|
|
|
939
|
$out->seq($output); |
709
|
281
|
|
|
|
|
601
|
$out->alphabet('protein'); |
710
|
|
|
|
|
|
|
} |
711
|
284
|
|
|
|
|
1263
|
return $out; |
712
|
|
|
|
|
|
|
} |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
=head2 transcribe() |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
Title : transcribe |
718
|
|
|
|
|
|
|
Usage : $xseq = $seq->transcribe |
719
|
|
|
|
|
|
|
Function: Convert base T to base U |
720
|
|
|
|
|
|
|
Returns : PrimarySeqI object of alphabet 'rna' or |
721
|
|
|
|
|
|
|
undef if $seq->alphabet ne 'dna' |
722
|
|
|
|
|
|
|
Args : |
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
=cut |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
sub transcribe { |
727
|
2
|
|
|
2
|
1
|
34
|
my $self = shift; |
728
|
2
|
100
|
|
|
|
6
|
return unless $self->alphabet eq 'dna'; |
729
|
1
|
|
|
|
|
5
|
my $s = $self->seq; |
730
|
1
|
|
|
|
|
3
|
$s =~ tr/tT/uU/; |
731
|
1
|
|
50
|
|
|
3
|
my $desc = $self->desc || ''; |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' |
734
|
|
|
|
|
|
|
# or 'Bio::Seq::LargeSeq', if not take advantage of |
735
|
|
|
|
|
|
|
# Bio::Root::clone to get an object copy |
736
|
1
|
|
|
|
|
1
|
my $out; |
737
|
1
|
50
|
33
|
|
|
12
|
if ( $self->isa('Bio::Seq::LargePrimarySeq') |
738
|
|
|
|
|
|
|
or $self->isa('Bio::Seq::LargeSeq') |
739
|
|
|
|
|
|
|
) { |
740
|
0
|
|
|
|
|
0
|
my ($seqclass, $opts) = $self->_setup_class; |
741
|
0
|
|
|
|
|
0
|
$out = $seqclass->new( |
742
|
|
|
|
|
|
|
-seq => $s, |
743
|
|
|
|
|
|
|
-is_circular => $self->is_circular, |
744
|
|
|
|
|
|
|
-display_id => $self->display_id, |
745
|
|
|
|
|
|
|
-accession_number => $self->accession_number, |
746
|
|
|
|
|
|
|
-alphabet => 'rna', |
747
|
|
|
|
|
|
|
-desc => "${desc}[TRANSCRIBED]", |
748
|
|
|
|
|
|
|
-verbose => $self->verbose, |
749
|
|
|
|
|
|
|
%$opts, |
750
|
|
|
|
|
|
|
); |
751
|
|
|
|
|
|
|
} else { |
752
|
1
|
|
|
|
|
5
|
$out = $self->clone; |
753
|
1
|
|
|
|
|
4
|
$out->seq($s); |
754
|
1
|
|
|
|
|
4
|
$out->alphabet('rna'); |
755
|
1
|
|
|
|
|
5
|
$out->desc($desc . "[TRANSCRIBED]"); |
756
|
|
|
|
|
|
|
} |
757
|
1
|
|
|
|
|
6
|
return $out; |
758
|
|
|
|
|
|
|
} |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=head2 rev_transcribe() |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
Title : rev_transcribe |
764
|
|
|
|
|
|
|
Usage : $rtseq = $seq->rev_transcribe |
765
|
|
|
|
|
|
|
Function: Convert base U to base T |
766
|
|
|
|
|
|
|
Returns : PrimarySeqI object of alphabet 'dna' or |
767
|
|
|
|
|
|
|
undef if $seq->alphabet ne 'rna' |
768
|
|
|
|
|
|
|
Args : |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
=cut |
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
sub rev_transcribe { |
773
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
774
|
1
|
50
|
|
|
|
4
|
return unless $self->alphabet eq 'rna'; |
775
|
1
|
|
|
|
|
5
|
my $s = $self->seq; |
776
|
1
|
|
|
|
|
4
|
$s =~ tr/uU/tT/; |
777
|
1
|
|
50
|
|
|
4
|
my $desc = $self->desc || ''; |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
# Create a new fresh object if $self is 'Bio::Seq::LargePrimarySeq' |
780
|
|
|
|
|
|
|
# or 'Bio::Seq::LargeSeq', if not take advantage of |
781
|
|
|
|
|
|
|
# Bio::Root::clone to get an object copy |
782
|
1
|
|
|
|
|
3
|
my $out; |
783
|
1
|
50
|
33
|
|
|
18
|
if ( $self->isa('Bio::Seq::LargePrimarySeq') |
784
|
|
|
|
|
|
|
or $self->isa('Bio::Seq::LargeSeq') |
785
|
|
|
|
|
|
|
) { |
786
|
0
|
|
|
|
|
0
|
my ($seqclass, $opts) = $self->_setup_class; |
787
|
0
|
|
|
|
|
0
|
$out = $seqclass->new( |
788
|
|
|
|
|
|
|
-seq => $s, |
789
|
|
|
|
|
|
|
-is_circular => $self->is_circular, |
790
|
|
|
|
|
|
|
-display_id => $self->display_id, |
791
|
|
|
|
|
|
|
-accession_number => $self->accession_number, |
792
|
|
|
|
|
|
|
-alphabet => 'dna', |
793
|
|
|
|
|
|
|
-desc => $self->desc . "[REVERSE TRANSCRIBED]", |
794
|
|
|
|
|
|
|
-verbose => $self->verbose, |
795
|
|
|
|
|
|
|
%$opts, |
796
|
|
|
|
|
|
|
); |
797
|
|
|
|
|
|
|
} else { |
798
|
1
|
|
|
|
|
8
|
$out = $self->clone; |
799
|
1
|
|
|
|
|
3
|
$out->seq($s); |
800
|
1
|
|
|
|
|
3
|
$out->alphabet('dna'); |
801
|
1
|
|
|
|
|
4
|
$out->desc($desc . "[REVERSE TRANSCRIBED]"); |
802
|
|
|
|
|
|
|
} |
803
|
1
|
|
|
|
|
5
|
return $out; |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
=head2 id |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
Title : id |
810
|
|
|
|
|
|
|
Usage : $id = $seq->id() |
811
|
|
|
|
|
|
|
Function: ID of the sequence. This should normally be (and actually is in |
812
|
|
|
|
|
|
|
the implementation provided here) just a synonym for display_id(). |
813
|
|
|
|
|
|
|
Returns : A string. |
814
|
|
|
|
|
|
|
Args : |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
=cut |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
sub id { |
819
|
1
|
|
|
1
|
1
|
2
|
my ($self)= @_; |
820
|
1
|
|
|
|
|
4
|
return $self->display_id(); |
821
|
|
|
|
|
|
|
} |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
=head2 length |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
Title : length |
827
|
|
|
|
|
|
|
Usage : $len = $seq->length() |
828
|
|
|
|
|
|
|
Function: |
829
|
|
|
|
|
|
|
Returns : Integer representing the length of the sequence. |
830
|
|
|
|
|
|
|
Args : |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
=cut |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
sub length { |
835
|
0
|
|
|
0
|
1
|
0
|
my ($self)= @_; |
836
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented(); |
837
|
|
|
|
|
|
|
} |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=head2 desc |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
Title : desc |
843
|
|
|
|
|
|
|
Usage : $seq->desc($newval); |
844
|
|
|
|
|
|
|
$description = $seq->desc(); |
845
|
|
|
|
|
|
|
Function: Get/set description text for a seq object |
846
|
|
|
|
|
|
|
Returns : Value of desc |
847
|
|
|
|
|
|
|
Args : newvalue (optional) |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=cut |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
sub desc { |
852
|
0
|
|
|
0
|
1
|
0
|
shift->throw_not_implemented(); |
853
|
|
|
|
|
|
|
} |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
=head2 is_circular |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
Title : is_circular |
859
|
|
|
|
|
|
|
Usage : if( $obj->is_circular) { # Do something } |
860
|
|
|
|
|
|
|
Function: Returns true if the molecule is circular |
861
|
|
|
|
|
|
|
Returns : Boolean value |
862
|
|
|
|
|
|
|
Args : none |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
=cut |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
sub is_circular { |
867
|
0
|
|
|
0
|
1
|
0
|
shift->throw_not_implemented; |
868
|
|
|
|
|
|
|
} |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
=head1 Private functions |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
These are some private functions for the PrimarySeqI interface. You do not |
874
|
|
|
|
|
|
|
need to implement these functions |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
=head2 _find_orfs_nucleotide |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
Title : _find_orfs_nucleotide |
879
|
|
|
|
|
|
|
Usage : |
880
|
|
|
|
|
|
|
Function: Finds ORF starting at 1st initiation codon in nucleotide sequence. |
881
|
|
|
|
|
|
|
The ORF is not required to have a termination codon. |
882
|
|
|
|
|
|
|
Example : |
883
|
|
|
|
|
|
|
Returns : a list of string coordinates of ORF locations (0-based half-open), |
884
|
|
|
|
|
|
|
sorted descending by length (so that the longest is first) |
885
|
|
|
|
|
|
|
as: [ start, end, frame, length ], [ start, end, frame, length ], ... |
886
|
|
|
|
|
|
|
Args : Nucleotide sequence, |
887
|
|
|
|
|
|
|
CodonTable object, |
888
|
|
|
|
|
|
|
(optional) alternative initiation codon (e.g. 'ATA'), |
889
|
|
|
|
|
|
|
(optional) boolean that, if true, stops after finding the |
890
|
|
|
|
|
|
|
first available ORF |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
=cut |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
sub _find_orfs_nucleotide { |
895
|
14
|
|
|
14
|
|
25
|
my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_; |
896
|
14
|
|
|
|
|
24
|
$sequence = uc $sequence; |
897
|
14
|
100
|
|
|
|
24
|
$start_codon = uc $start_codon if $start_codon; |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
my $is_start = $start_codon |
900
|
12
|
|
|
12
|
|
24
|
? sub { shift eq $start_codon } |
901
|
14
|
100
|
|
697
|
|
54
|
: sub { $codon_table->is_start_codon( shift ) }; |
|
697
|
|
|
|
|
1007
|
|
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
# stores the begin index of the currently-running ORF in each |
904
|
|
|
|
|
|
|
# reading frame |
905
|
14
|
|
|
|
|
26
|
my @current_orf_start = (-1,-1,-1); |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
#< stores coordinates of longest observed orf (so far) in each |
908
|
|
|
|
|
|
|
# reading frame |
909
|
14
|
|
|
|
|
15
|
my @orfs; |
910
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
# go through each base of the sequence, and each reading frame for each base |
912
|
14
|
|
|
|
|
21
|
my $seqlen = CORE::length $sequence; |
913
|
14
|
|
|
|
|
11
|
my @start_frame_order; |
914
|
14
|
|
|
|
|
32
|
for( my $j = 0; $j <= $seqlen-3; $j++ ) { |
915
|
1239
|
|
|
|
|
1387
|
my $frame = $j % 3; |
916
|
|
|
|
|
|
|
|
917
|
1239
|
|
|
|
|
1388
|
my $this_codon = substr( $sequence, $j, 3 ); |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
# if in an orf and this is either a stop codon or the last in-frame codon in the string |
920
|
1239
|
100
|
|
|
|
1832
|
if ( $current_orf_start[$frame] >= 0 ) { |
|
|
100
|
|
|
|
|
|
921
|
530
|
100
|
100
|
|
|
762
|
if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) { |
922
|
|
|
|
|
|
|
# record ORF start, end (half-open), length, and frame |
923
|
44
|
|
|
|
|
86
|
my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame ); |
924
|
44
|
|
|
|
|
64
|
my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] ); |
925
|
|
|
|
|
|
|
|
926
|
44
|
100
|
100
|
|
|
83
|
if ($first_only && $frame == $start_frame_order[0]) { |
927
|
10
|
100
|
|
|
|
23
|
$self->warn( "Translating partial ORF " |
928
|
|
|
|
|
|
|
.$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf )) |
929
|
|
|
|
|
|
|
.' from end of nucleotide sequence' |
930
|
|
|
|
|
|
|
) if $is_last_codon_in_frame; |
931
|
9
|
|
|
|
|
40
|
return \@this_orf; |
932
|
|
|
|
|
|
|
} |
933
|
34
|
|
|
|
|
48
|
push @orfs, \@this_orf; |
934
|
34
|
|
|
|
|
71
|
$current_orf_start[$frame] = -1; |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
# if this is a start codon |
938
|
|
|
|
|
|
|
elsif ( $is_start->($this_codon) ) { |
939
|
44
|
|
|
|
|
50
|
$current_orf_start[$frame] = $j; |
940
|
44
|
|
|
|
|
81
|
push @start_frame_order, $frame; |
941
|
|
|
|
|
|
|
} |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
|
944
|
4
|
|
|
|
|
22
|
return sort { $b->[2] <=> $a->[2] } @orfs; |
|
88
|
|
|
|
|
95
|
|
945
|
|
|
|
|
|
|
} |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
sub _truncate_seq { |
949
|
4
|
|
|
4
|
|
6
|
my ($self, $seq) = @_; |
950
|
4
|
50
|
|
|
|
19
|
return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq; |
951
|
|
|
|
|
|
|
} |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
sub _orf_sequence { |
955
|
15
|
|
|
15
|
|
24
|
my ($self, $seq, $orf ) = @_; |
956
|
15
|
50
|
|
|
|
25
|
return '' unless $orf; |
957
|
15
|
|
|
|
|
40
|
return substr( $seq, $orf->[0], $orf->[2] ) |
958
|
|
|
|
|
|
|
} |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
=head2 _attempt_to_load_Seq |
962
|
|
|
|
|
|
|
|
963
|
|
|
|
|
|
|
Title : _attempt_to_load_Seq |
964
|
|
|
|
|
|
|
Usage : |
965
|
|
|
|
|
|
|
Function: |
966
|
|
|
|
|
|
|
Example : |
967
|
|
|
|
|
|
|
Returns : |
968
|
|
|
|
|
|
|
Args : |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
=cut |
971
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
sub _attempt_to_load_Seq { |
973
|
0
|
|
|
0
|
|
0
|
my ($self) = @_; |
974
|
|
|
|
|
|
|
|
975
|
0
|
0
|
|
|
|
0
|
if( $main::{'Bio::PrimarySeq'} ) { |
976
|
0
|
|
|
|
|
0
|
return 1; |
977
|
|
|
|
|
|
|
} else { |
978
|
0
|
|
|
|
|
0
|
eval { |
979
|
0
|
|
|
|
|
0
|
require Bio::PrimarySeq; |
980
|
|
|
|
|
|
|
}; |
981
|
0
|
0
|
|
|
|
0
|
if( $@ ) { |
982
|
0
|
|
|
|
|
0
|
my $text = "Bio::PrimarySeq could not be loaded for [$self]\n". |
983
|
|
|
|
|
|
|
"This indicates that you are using Bio::PrimarySeqI ". |
984
|
|
|
|
|
|
|
"without Bio::PrimarySeq loaded or without providing a ". |
985
|
|
|
|
|
|
|
"complete implementation.\nThe most likely problem is that there ". |
986
|
|
|
|
|
|
|
"has been a misconfiguration of the bioperl environment\n". |
987
|
|
|
|
|
|
|
"Actual exception:\n\n"; |
988
|
0
|
|
|
|
|
0
|
$self->throw("$text$@\n"); |
989
|
0
|
|
|
|
|
0
|
return 0; |
990
|
|
|
|
|
|
|
} |
991
|
0
|
|
|
|
|
0
|
return 1; |
992
|
|
|
|
|
|
|
} |
993
|
|
|
|
|
|
|
} |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
sub _setup_class { |
997
|
|
|
|
|
|
|
# Return name of class and setup some default parameters |
998
|
11
|
|
|
11
|
|
17
|
my ($self) = @_; |
999
|
11
|
|
|
|
|
14
|
my $seqclass; |
1000
|
11
|
50
|
|
|
|
32
|
if ($self->can_call_new()) { |
1001
|
11
|
|
|
|
|
19
|
$seqclass = ref($self); |
1002
|
|
|
|
|
|
|
} else { |
1003
|
0
|
|
|
|
|
0
|
$seqclass = 'Bio::PrimarySeq'; |
1004
|
0
|
|
|
|
|
0
|
$self->_attempt_to_load_Seq(); |
1005
|
|
|
|
|
|
|
} |
1006
|
11
|
|
|
|
|
17
|
my %opts; |
1007
|
11
|
50
|
|
|
|
73
|
if ($seqclass eq 'Bio::PrimarySeq') { |
1008
|
|
|
|
|
|
|
# Since sequence is in a Seq object, it has already been validated. |
1009
|
|
|
|
|
|
|
# We do not need to validate its trunc(), revcom(), etc |
1010
|
0
|
|
|
|
|
0
|
$opts{ -direct } = 1; |
1011
|
|
|
|
|
|
|
} |
1012
|
11
|
|
|
|
|
31
|
return $seqclass, \%opts; |
1013
|
|
|
|
|
|
|
} |
1014
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
1; |