| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# bioperl module for Bio::PrimarySeq |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Cared for by Ewan Birney |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Copyright Ewan Birney |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::PrimarySeq - Bioperl lightweight sequence object |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Bio::SeqIO for file reading, Bio::DB::GenBank for |
|
21
|
|
|
|
|
|
|
# database reading |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Bio::Seq; |
|
24
|
|
|
|
|
|
|
use Bio::SeqIO; |
|
25
|
|
|
|
|
|
|
use Bio::DB::GenBank; |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# make from memory |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$seqobj = Bio::PrimarySeq->new ( |
|
30
|
|
|
|
|
|
|
-seq => 'ATGGGGTGGGCGGTGGGTGGTTTG', |
|
31
|
|
|
|
|
|
|
-id => 'GeneFragment-12', |
|
32
|
|
|
|
|
|
|
-accession_number => 'X78121', |
|
33
|
|
|
|
|
|
|
-alphabet => 'dna', |
|
34
|
|
|
|
|
|
|
-is_circular => 1, |
|
35
|
|
|
|
|
|
|
); |
|
36
|
|
|
|
|
|
|
print "Sequence ", $seqobj->id(), " with accession ", |
|
37
|
|
|
|
|
|
|
$seqobj->accession_number, "\n"; |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# read from file |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
$inputstream = Bio::SeqIO->new( |
|
42
|
|
|
|
|
|
|
-file => "myseq.fa", |
|
43
|
|
|
|
|
|
|
-format => 'Fasta', |
|
44
|
|
|
|
|
|
|
); |
|
45
|
|
|
|
|
|
|
$seqobj = $inputstream->next_seq(); |
|
46
|
|
|
|
|
|
|
print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n"; |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# to get out parts of the sequence. |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
print "Sequence ", $seqobj->id(), " with accession ", |
|
51
|
|
|
|
|
|
|
$seqobj->accession_number, " and desc ", $seqobj->desc, "\n"; |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
$string = $seqobj->seq(); |
|
54
|
|
|
|
|
|
|
$string2 = $seqobj->subseq(1,40); |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
PrimarySeq is a lightweight sequence object, storing the sequence, its |
|
59
|
|
|
|
|
|
|
name, a computer-useful unique name, and other fundamental attributes. |
|
60
|
|
|
|
|
|
|
It does not contain sequence features or other information. To have a |
|
61
|
|
|
|
|
|
|
sequence with sequence features you should use the Seq object which uses |
|
62
|
|
|
|
|
|
|
this object. |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Although new users will use Bio::PrimarySeq a lot, in general you will |
|
65
|
|
|
|
|
|
|
be using it from the Bio::Seq object. For more information on Bio::Seq |
|
66
|
|
|
|
|
|
|
see L. For interest you might like to know that |
|
67
|
|
|
|
|
|
|
Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls |
|
68
|
|
|
|
|
|
|
to do with sequence to it (the has-a relationship lets us get out of a |
|
69
|
|
|
|
|
|
|
otherwise nasty cyclical reference in Perl which would leak memory). |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Sequence objects are defined by the Bio::PrimarySeqI interface, and this |
|
72
|
|
|
|
|
|
|
object is a pure Perl implementation of the interface. If that's |
|
73
|
|
|
|
|
|
|
gibberish to you, don't worry. The take home message is that this |
|
74
|
|
|
|
|
|
|
object is the bioperl default sequence object, but other people can |
|
75
|
|
|
|
|
|
|
use their own objects as sequences if they so wish. If you are |
|
76
|
|
|
|
|
|
|
interested in wrapping your own objects as compliant Bioperl sequence |
|
77
|
|
|
|
|
|
|
objects, then you should read the Bio::PrimarySeqI documentation |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
The documentation of this object is a merge of the Bio::PrimarySeq and |
|
80
|
|
|
|
|
|
|
Bio::PrimarySeqI documentation. This allows all the methods which you can |
|
81
|
|
|
|
|
|
|
call on sequence objects here. |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=head2 Mailing Lists |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
|
88
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
|
89
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
|
92
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=head2 Support |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
I |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
|
101
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
|
102
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
|
103
|
|
|
|
|
|
|
with code and data examples if at all possible. |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
108
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
|
109
|
|
|
|
|
|
|
web: |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 AUTHOR - Ewan Birney |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Email birney@ebi.ac.uk |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head1 APPENDIX |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
|
120
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=cut |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
package Bio::PrimarySeq; |
|
127
|
|
|
|
|
|
|
|
|
128
|
203
|
|
|
203
|
|
9205
|
use strict; |
|
|
203
|
|
|
|
|
407
|
|
|
|
203
|
|
|
|
|
9701
|
|
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~'; |
|
131
|
|
|
|
|
|
|
our $GAP_SYMBOLS = '-~'; |
|
132
|
|
|
|
|
|
|
|
|
133
|
203
|
|
|
|
|
69635
|
use base qw(Bio::Root::Root Bio::PrimarySeqI |
|
134
|
203
|
|
|
203
|
|
984
|
Bio::IdentifiableI Bio::DescribableI); |
|
|
203
|
|
|
|
|
339
|
|
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# Setup the allowed values for alphabet() |
|
138
|
|
|
|
|
|
|
my %valid_type = map {$_, 1} qw( dna rna protein ); |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head2 new |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Title : new |
|
144
|
|
|
|
|
|
|
Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT', |
|
145
|
|
|
|
|
|
|
-id => 'human_id', |
|
146
|
|
|
|
|
|
|
-accession_number => 'AL000012', |
|
147
|
|
|
|
|
|
|
); |
|
148
|
|
|
|
|
|
|
Function: Returns a new primary seq object from |
|
149
|
|
|
|
|
|
|
basic constructors, being a string for the sequence |
|
150
|
|
|
|
|
|
|
and strings for id and accession_number. |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Note that you can provide an empty sequence string. However, in |
|
153
|
|
|
|
|
|
|
this case you MUST specify the type of sequence you wish to |
|
154
|
|
|
|
|
|
|
initialize by the parameter -alphabet. See alphabet() for possible |
|
155
|
|
|
|
|
|
|
values. |
|
156
|
|
|
|
|
|
|
Returns : a new Bio::PrimarySeq object |
|
157
|
|
|
|
|
|
|
Args : -seq => sequence string |
|
158
|
|
|
|
|
|
|
-ref_to_seq => ... or reference to a sequence string |
|
159
|
|
|
|
|
|
|
-display_id => display id of the sequence (locus name) |
|
160
|
|
|
|
|
|
|
-accession_number => accession number |
|
161
|
|
|
|
|
|
|
-primary_id => primary id (Genbank id) |
|
162
|
|
|
|
|
|
|
-version => version number |
|
163
|
|
|
|
|
|
|
-namespace => the namespace for the accession |
|
164
|
|
|
|
|
|
|
-authority => the authority for the namespace |
|
165
|
|
|
|
|
|
|
-description => description text |
|
166
|
|
|
|
|
|
|
-desc => alias for description |
|
167
|
|
|
|
|
|
|
-alphabet => skip alphabet guess and set it to dna, rna or protein |
|
168
|
|
|
|
|
|
|
-id => alias for display id |
|
169
|
|
|
|
|
|
|
-is_circular => boolean to indicate that sequence is circular |
|
170
|
|
|
|
|
|
|
-direct => boolean to directly set sequences. The next time -seq, |
|
171
|
|
|
|
|
|
|
seq() or -ref_to_seq is use, the sequence will not be |
|
172
|
|
|
|
|
|
|
validated. Be careful with this... |
|
173
|
|
|
|
|
|
|
-nowarnonempty => boolean to avoid warning when sequence is empty |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=cut |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub new { |
|
178
|
14902
|
|
|
14902
|
1
|
52355
|
my ($class, @args) = @_; |
|
179
|
14902
|
|
|
|
|
35553
|
my $self = $class->SUPER::new(@args); |
|
180
|
14902
|
|
|
|
|
63847
|
my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description, |
|
181
|
|
|
|
|
|
|
$alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len, |
|
182
|
|
|
|
|
|
|
$nowarnonempty) = |
|
183
|
|
|
|
|
|
|
$self->_rearrange([qw(SEQ |
|
184
|
|
|
|
|
|
|
DISPLAY_ID |
|
185
|
|
|
|
|
|
|
ACCESSION_NUMBER |
|
186
|
|
|
|
|
|
|
PRIMARY_ID |
|
187
|
|
|
|
|
|
|
NAMESPACE |
|
188
|
|
|
|
|
|
|
AUTHORITY |
|
189
|
|
|
|
|
|
|
VERSION |
|
190
|
|
|
|
|
|
|
OBJECT_ID |
|
191
|
|
|
|
|
|
|
DESC |
|
192
|
|
|
|
|
|
|
DESCRIPTION |
|
193
|
|
|
|
|
|
|
ALPHABET |
|
194
|
|
|
|
|
|
|
ID |
|
195
|
|
|
|
|
|
|
IS_CIRCULAR |
|
196
|
|
|
|
|
|
|
DIRECT |
|
197
|
|
|
|
|
|
|
REF_TO_SEQ |
|
198
|
|
|
|
|
|
|
LENGTH |
|
199
|
|
|
|
|
|
|
NOWARNONEMPTY |
|
200
|
|
|
|
|
|
|
)], |
|
201
|
|
|
|
|
|
|
@args); |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Private var _nowarnonempty, needs to be set before calling _guess_alphabet |
|
204
|
14902
|
|
|
|
|
40571
|
$self->{'_nowarnonempty'} = $nowarnonempty; |
|
205
|
14902
|
|
|
|
|
21600
|
$self->{'_direct'} = $direct; |
|
206
|
|
|
|
|
|
|
|
|
207
|
14902
|
100
|
100
|
|
|
31238
|
if( defined $id && defined $given_id ) { |
|
208
|
6
|
50
|
|
|
|
24
|
if( $id ne $given_id ) { |
|
209
|
0
|
|
|
|
|
0
|
$self->throw("Provided both id and display_id constructors: [$id] [$given_id]"); |
|
210
|
|
|
|
|
|
|
} |
|
211
|
|
|
|
|
|
|
} |
|
212
|
14902
|
100
|
|
|
|
24895
|
if( defined $given_id ) { $id = $given_id; } |
|
|
11743
|
|
|
|
|
16264
|
|
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# Bernd's idea: set ids now for more informative invalid sequence messages |
|
215
|
14902
|
100
|
|
|
|
38606
|
defined $id && $self->display_id($id); |
|
216
|
14902
|
100
|
|
|
|
23366
|
$acc && $self->accession_number($acc); |
|
217
|
14902
|
100
|
|
|
|
21643
|
defined $pid && $self->primary_id($pid); |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Set alphabet now to avoid guessing it later, when sequence is set |
|
220
|
14902
|
100
|
|
|
|
33192
|
$alphabet && $self->alphabet($alphabet); |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# Set the length before the seq. If there is a seq, length will be updated later |
|
223
|
14902
|
|
100
|
|
|
38898
|
$self->{'length'} = $len || 0; |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# Set the sequence (but also alphabet and length) |
|
226
|
14902
|
100
|
|
|
|
23413
|
if ($ref_to_seq) { |
|
227
|
1
|
|
|
|
|
3
|
$self->_set_seq_by_ref($ref_to_seq, $alphabet); |
|
228
|
|
|
|
|
|
|
} else { |
|
229
|
14901
|
100
|
|
|
|
22472
|
if (defined $seq) { |
|
230
|
|
|
|
|
|
|
# Note: the sequence string may be empty |
|
231
|
14189
|
|
|
|
|
22432
|
$self->seq($seq); |
|
232
|
|
|
|
|
|
|
} |
|
233
|
|
|
|
|
|
|
} |
|
234
|
|
|
|
|
|
|
|
|
235
|
14900
|
100
|
|
|
|
28823
|
$desc && $self->desc($desc); |
|
236
|
14900
|
100
|
|
|
|
21908
|
$description && $self->description($description); |
|
237
|
14900
|
100
|
|
|
|
21629
|
$ns && $self->namespace($ns); |
|
238
|
14900
|
100
|
|
|
|
21017
|
$auth && $self->authority($auth); |
|
239
|
|
|
|
|
|
|
# Any variable that can have a value "0" must be tested with defined |
|
240
|
|
|
|
|
|
|
# or it will fail to be added to the new object |
|
241
|
14900
|
100
|
|
|
|
22094
|
defined($v) && $self->version($v); |
|
242
|
14900
|
50
|
|
|
|
23177
|
defined($oid) && $self->object_id($oid); |
|
243
|
14900
|
100
|
|
|
|
21161
|
defined($is_circular) && $self->is_circular($is_circular); |
|
244
|
|
|
|
|
|
|
|
|
245
|
14900
|
|
|
|
|
46894
|
return $self; |
|
246
|
|
|
|
|
|
|
} |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 seq |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
Title : seq |
|
252
|
|
|
|
|
|
|
Usage : $string = $seqobj->seq(); |
|
253
|
|
|
|
|
|
|
Function: Get or set the sequence as a string of letters. The case of |
|
254
|
|
|
|
|
|
|
the letters is left up to the implementer. Suggested cases are |
|
255
|
|
|
|
|
|
|
upper case for proteins and lower case for DNA sequence (IUPAC |
|
256
|
|
|
|
|
|
|
standard), but you should not rely on this. An error is thrown if |
|
257
|
|
|
|
|
|
|
the sequence contains invalid characters: see validate_seq(). |
|
258
|
|
|
|
|
|
|
Returns : A scalar |
|
259
|
|
|
|
|
|
|
Args : - Optional new sequence value (a string) to set |
|
260
|
|
|
|
|
|
|
- Optional alphabet (it is guessed by default) |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=cut |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
sub seq { |
|
265
|
168331
|
|
|
168331
|
1
|
253516
|
my ($self, @args) = @_; |
|
266
|
|
|
|
|
|
|
|
|
267
|
168331
|
100
|
|
|
|
233023
|
if( scalar @args == 0 ) { |
|
268
|
140950
|
|
|
|
|
342302
|
return $self->{'seq'}; |
|
269
|
|
|
|
|
|
|
} |
|
270
|
|
|
|
|
|
|
|
|
271
|
27381
|
|
|
|
|
40587
|
my ($seq_str, $alphabet) = @args; |
|
272
|
27381
|
50
|
|
|
|
41124
|
if (@args) { |
|
273
|
27381
|
|
|
|
|
44455
|
$self->_set_seq_by_ref(\$seq_str, $alphabet); |
|
274
|
|
|
|
|
|
|
} |
|
275
|
|
|
|
|
|
|
|
|
276
|
27378
|
|
|
|
|
44649
|
return $self->{'seq'}; |
|
277
|
|
|
|
|
|
|
} |
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
sub _set_seq_by_ref { |
|
281
|
|
|
|
|
|
|
# Set a sequence by reference. A reference is used to avoid the cost of |
|
282
|
|
|
|
|
|
|
# copying the sequence (which can be very large) between functions. |
|
283
|
27382
|
|
|
27382
|
|
37124
|
my ($self, $seq_str_ref, $alphabet) = @_; |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Validate sequence if sequence is not empty and we are not in direct mode |
|
286
|
27382
|
100
|
100
|
|
|
82370
|
if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) { |
|
287
|
27210
|
|
|
|
|
45989
|
$self->validate_seq($$seq_str_ref, 1); |
|
288
|
|
|
|
|
|
|
} |
|
289
|
27379
|
|
|
|
|
36974
|
delete $self->{'_direct'}; # next sequence will have to be validated |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# Record sequence length |
|
292
|
27379
|
|
100
|
|
|
50196
|
my $len = CORE::length($$seq_str_ref || ''); |
|
293
|
27379
|
|
100
|
|
|
59229
|
my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0); |
|
294
|
|
|
|
|
|
|
# Note: if the new seq is empty or undef, this is not considered a change |
|
295
|
27379
|
100
|
|
|
|
43842
|
delete $self->{'_freeze_length'} if $is_changed_seq; |
|
296
|
27379
|
100
|
|
|
|
46339
|
$self->{'length'} = $len if not exists $self->{'_freeze_length'}; |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# Set sequence |
|
299
|
27379
|
|
|
|
|
38134
|
$self->{'seq'} = $$seq_str_ref; |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Set or guess alphabet |
|
302
|
27379
|
100
|
100
|
|
|
65173
|
if ($alphabet) { |
|
|
|
100
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# Alphabet specified, set it no matter what |
|
304
|
18
|
|
|
|
|
31
|
$self->alphabet($alphabet); |
|
305
|
|
|
|
|
|
|
} elsif ($is_changed_seq || (! defined($self->alphabet()))) { |
|
306
|
|
|
|
|
|
|
# If we changed a previous sequence to a new one or if there is no |
|
307
|
|
|
|
|
|
|
# alphabet yet at all, we need to guess the (possibly new) alphabet |
|
308
|
15706
|
|
|
|
|
25638
|
$self->_guess_alphabet(); |
|
309
|
|
|
|
|
|
|
} # else (seq not changed and alphabet was defined) do nothing |
|
310
|
|
|
|
|
|
|
|
|
311
|
27379
|
|
|
|
|
37368
|
return 1; |
|
312
|
|
|
|
|
|
|
} |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=head2 validate_seq |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Title : validate_seq |
|
318
|
|
|
|
|
|
|
Usage : if(! $seqobj->validate_seq($seq_str) ) { |
|
319
|
|
|
|
|
|
|
print "sequence $seq_str is not valid for an object of |
|
320
|
|
|
|
|
|
|
alphabet ",$seqobj->alphabet, "\n"; |
|
321
|
|
|
|
|
|
|
} |
|
322
|
|
|
|
|
|
|
Function: Test that the given sequence is valid, i.e. contains only valid |
|
323
|
|
|
|
|
|
|
characters. The allowed characters are all letters (A-Z) and '-','.', |
|
324
|
|
|
|
|
|
|
'*','?','=' and '~'. Spaces are not valid. Note that this |
|
325
|
|
|
|
|
|
|
implementation does not take alphabet() into account and that empty |
|
326
|
|
|
|
|
|
|
sequences are considered valid. |
|
327
|
|
|
|
|
|
|
Returns : 1 if the supplied sequence string is valid, 0 otherwise. |
|
328
|
|
|
|
|
|
|
Args : - Sequence string to be validated |
|
329
|
|
|
|
|
|
|
- Boolean to optionally throw an error if the sequence is invalid |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=cut |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub validate_seq { |
|
334
|
23109
|
|
|
23109
|
1
|
33264
|
my ($self, $seqstr, $throw) = @_; |
|
335
|
23109
|
100
|
100
|
|
|
134477
|
if ( (defined $seqstr ) && |
|
336
|
|
|
|
|
|
|
($seqstr !~ /^[$MATCHPATTERN]*$/) ) { |
|
337
|
8
|
100
|
|
|
|
18
|
if ($throw) { |
|
338
|
3
|
|
50
|
|
|
8
|
$self->throw("Failed validation of sequence '".(defined($self->id) || |
|
339
|
|
|
|
|
|
|
'[unidentified sequence]')."'. Invalid characters were: " . |
|
340
|
|
|
|
|
|
|
join('',($seqstr =~ /[^$MATCHPATTERN]/g))); |
|
341
|
|
|
|
|
|
|
} |
|
342
|
5
|
|
|
|
|
25
|
return 0; |
|
343
|
|
|
|
|
|
|
} |
|
344
|
23101
|
|
|
|
|
37534
|
return 1; |
|
345
|
|
|
|
|
|
|
} |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=head2 subseq |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Title : subseq |
|
351
|
|
|
|
|
|
|
Usage : $substring = $seqobj->subseq(10,40); |
|
352
|
|
|
|
|
|
|
$substring = $seqobj->subseq(10,40,'nogap'); |
|
353
|
|
|
|
|
|
|
$substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga'); |
|
354
|
|
|
|
|
|
|
$substring = $seqobj->subseq($location_obj); |
|
355
|
|
|
|
|
|
|
$substring = $seqobj->subseq($location_obj, -nogap => 1); |
|
356
|
|
|
|
|
|
|
Function: Return the subseq from start to end, where the first sequence |
|
357
|
|
|
|
|
|
|
character has coordinate 1 number is inclusive, ie 1-2 are the |
|
358
|
|
|
|
|
|
|
first two characters of the sequence. The given start coordinate |
|
359
|
|
|
|
|
|
|
has to be larger than the end, even if the sequence is circular. |
|
360
|
|
|
|
|
|
|
Returns : a string |
|
361
|
|
|
|
|
|
|
Args : integer for start position |
|
362
|
|
|
|
|
|
|
integer for end position |
|
363
|
|
|
|
|
|
|
OR |
|
364
|
|
|
|
|
|
|
Bio::LocationI location for subseq (strand honored) |
|
365
|
|
|
|
|
|
|
Specify -NOGAP=>1 to return subseq with gap characters removed |
|
366
|
|
|
|
|
|
|
Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned |
|
367
|
|
|
|
|
|
|
with $new_subseq in the sequence object |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=cut |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub subseq { |
|
372
|
10043
|
|
|
10043
|
1
|
10712
|
my $self = shift; |
|
373
|
10043
|
|
|
|
|
12891
|
my @args = @_; |
|
374
|
10043
|
|
|
|
|
24111
|
my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START |
|
375
|
|
|
|
|
|
|
END |
|
376
|
|
|
|
|
|
|
NOGAP |
|
377
|
|
|
|
|
|
|
REPLACE_WITH)], @args); |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# If -replace_with is specified, validate the replacement sequence |
|
380
|
10043
|
100
|
|
|
|
18441
|
if (defined $replace) { |
|
381
|
2
|
100
|
|
|
|
4
|
$self->validate_seq( $replace ) || |
|
382
|
|
|
|
|
|
|
$self->throw("Replacement sequence does not look valid"); |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
|
|
385
|
10042
|
100
|
66
|
|
|
29026
|
if( ref($start) && $start->isa('Bio::LocationI') ) { |
|
|
|
50
|
33
|
|
|
|
|
|
386
|
52
|
|
|
|
|
60
|
my $loc = $start; |
|
387
|
52
|
|
|
|
|
75
|
my $seq = ''; |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# For Split objects if Guide Strand is negative, |
|
390
|
|
|
|
|
|
|
# pass the sublocations in reverse |
|
391
|
52
|
|
|
|
|
59
|
my $order = 0; |
|
392
|
52
|
100
|
|
|
|
114
|
if ($loc->isa('Bio::Location::SplitLocationI')) { |
|
393
|
|
|
|
|
|
|
# guide_strand can return undef, so don't compare directly |
|
394
|
|
|
|
|
|
|
# to avoid 'uninitialized value' warning |
|
395
|
44
|
100
|
|
|
|
103
|
my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0; |
|
396
|
44
|
100
|
|
|
|
82
|
$order = ($guide_strand == -1) ? -1 : 0; |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
# Reversing order using ->each_Location(-1) does not work well for |
|
399
|
|
|
|
|
|
|
# cut by origin-splits (like "complement(join(1900..END,START..50))"), |
|
400
|
|
|
|
|
|
|
# so use "reverse" instead |
|
401
|
52
|
100
|
|
|
|
128
|
my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location; |
|
402
|
52
|
|
|
|
|
70
|
foreach my $subloc (@sublocs) { |
|
403
|
120
|
|
|
|
|
237
|
my $piece = $self->subseq(-start => $subloc->start(), |
|
404
|
|
|
|
|
|
|
-end => $subloc->end(), |
|
405
|
|
|
|
|
|
|
-replace_with => $replace, |
|
406
|
|
|
|
|
|
|
-nogap => $nogap); |
|
407
|
120
|
100
|
|
|
|
210
|
$piece =~ s/[$GAP_SYMBOLS]//g if $nogap; |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# strand can return undef, so don't compare directly |
|
410
|
|
|
|
|
|
|
# to avoid 'uninitialized value' warning |
|
411
|
120
|
100
|
|
|
|
236
|
my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0; |
|
412
|
120
|
100
|
|
|
|
219
|
if ($strand < 0) { |
|
413
|
59
|
|
|
|
|
98
|
$piece = $self->_revcom_from_string($piece, $self->alphabet); |
|
414
|
|
|
|
|
|
|
} |
|
415
|
120
|
|
|
|
|
227
|
$seq .= $piece; |
|
416
|
|
|
|
|
|
|
} |
|
417
|
52
|
|
|
|
|
224
|
return $seq; |
|
418
|
|
|
|
|
|
|
} elsif( defined $start && defined $end ) { |
|
419
|
9990
|
50
|
|
|
|
13464
|
if( $start > $end ){ |
|
420
|
0
|
|
|
|
|
0
|
$self->throw("Bad start,end parameters. Start [$start] has to be ". |
|
421
|
|
|
|
|
|
|
"less than end [$end]"); |
|
422
|
|
|
|
|
|
|
} |
|
423
|
9990
|
50
|
|
|
|
12541
|
if( $start <= 0 ) { |
|
424
|
0
|
|
|
|
|
0
|
$self->throw("Bad start parameter ($start). Start must be positive."); |
|
425
|
|
|
|
|
|
|
} |
|
426
|
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# Remove one from start, and then length is end-start |
|
428
|
9990
|
|
|
|
|
9376
|
$start--; |
|
429
|
|
|
|
|
|
|
|
|
430
|
9990
|
|
|
|
|
8557
|
my $seqstr; |
|
431
|
9990
|
100
|
|
|
|
11248
|
if (defined $replace) { |
|
432
|
1
|
|
|
|
|
4
|
$seqstr = substr $self->{seq}, $start, $end-$start, $replace; |
|
433
|
|
|
|
|
|
|
} else { |
|
434
|
9989
|
|
|
|
|
17443
|
$seqstr = substr $self->{seq}, $start, $end-$start; |
|
435
|
|
|
|
|
|
|
} |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
|
|
438
|
9990
|
100
|
|
|
|
14489
|
if ($end > $self->length) { |
|
439
|
1
|
50
|
|
|
|
2
|
if ($self->is_circular) { |
|
440
|
1
|
|
|
|
|
2
|
my $start = 0; |
|
441
|
1
|
|
|
|
|
3
|
my $end = $end - $self->length; |
|
442
|
|
|
|
|
|
|
|
|
443
|
1
|
|
|
|
|
2
|
my $appendstr; |
|
444
|
1
|
50
|
|
|
|
2
|
if (defined $replace) { |
|
445
|
0
|
|
|
|
|
0
|
$appendstr = substr $self->{seq}, $start, $end-$start, $replace; |
|
446
|
|
|
|
|
|
|
} else { |
|
447
|
1
|
|
|
|
|
2
|
$appendstr = substr $self->{seq}, $start, $end-$start; |
|
448
|
|
|
|
|
|
|
} |
|
449
|
|
|
|
|
|
|
|
|
450
|
1
|
|
|
|
|
2
|
$seqstr .= $appendstr; |
|
451
|
|
|
|
|
|
|
} else { |
|
452
|
0
|
|
|
|
|
0
|
$self->throw("Bad end parameter ($end). End must be less than ". |
|
453
|
|
|
|
|
|
|
"the total length of sequence (total=".$self->length.")") |
|
454
|
|
|
|
|
|
|
} |
|
455
|
|
|
|
|
|
|
} |
|
456
|
|
|
|
|
|
|
|
|
457
|
9990
|
100
|
|
|
|
13400
|
$seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap); |
|
458
|
9990
|
|
|
|
|
21032
|
return $seqstr; |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
} else { |
|
461
|
0
|
|
|
|
|
0
|
$self->warn("Incorrect parameters to subseq - must be two integers or ". |
|
462
|
|
|
|
|
|
|
"a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap); |
|
463
|
0
|
|
|
|
|
0
|
return; |
|
464
|
|
|
|
|
|
|
} |
|
465
|
|
|
|
|
|
|
} |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=head2 length |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Title : length |
|
471
|
|
|
|
|
|
|
Usage : $len = $seqobj->length(); |
|
472
|
|
|
|
|
|
|
Function: Get the stored length of the sequence in number of symbols (bases |
|
473
|
|
|
|
|
|
|
or amino acids). In some circumstances, you can also set this attribute: |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
1. For empty sequences, you can set the length to anything you want: |
|
476
|
|
|
|
|
|
|
my $seqobj = Bio::PrimarySeq->new( -length => 123 ); |
|
477
|
|
|
|
|
|
|
my $len = $seqobj->len; # 123 |
|
478
|
|
|
|
|
|
|
2. To save memory when using very long sequences, you can set the |
|
479
|
|
|
|
|
|
|
length of the sequence to the length of the sequence (and nothing |
|
480
|
|
|
|
|
|
|
else): |
|
481
|
|
|
|
|
|
|
my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence |
|
482
|
|
|
|
|
|
|
# process $seqobj... then after you're done with it |
|
483
|
|
|
|
|
|
|
$seqobj->length($seqobj->length); |
|
484
|
|
|
|
|
|
|
$seqobj->seq(undef); # free memory! |
|
485
|
|
|
|
|
|
|
my $len = $seqobj->len; # 1 Mbp |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
Note that if you set seq() to a value other than undef at any time, |
|
488
|
|
|
|
|
|
|
the length attribute will be reset. |
|
489
|
|
|
|
|
|
|
Returns : integer representing the length of the sequence. |
|
490
|
|
|
|
|
|
|
Args : Optionally, the value on set |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=cut |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
sub length { |
|
495
|
30706
|
|
|
30706
|
1
|
43613
|
my ($self, $val) = @_; |
|
496
|
30706
|
100
|
|
|
|
39800
|
if (defined $val) { |
|
497
|
5
|
|
|
|
|
10
|
my $len = $self->{'length'}; |
|
498
|
5
|
100
|
100
|
|
|
17
|
if ($len && ($len != $val)) { |
|
499
|
1
|
|
|
|
|
6
|
$self->throw("Can not set the length to $val, current length value is $len"); |
|
500
|
|
|
|
|
|
|
} |
|
501
|
4
|
|
|
|
|
7
|
$self->{'length'} = $val; |
|
502
|
4
|
|
|
|
|
8
|
$self->{'_freeze_length'} = undef; |
|
503
|
|
|
|
|
|
|
} |
|
504
|
30705
|
|
|
|
|
49670
|
return $self->{'length'}; |
|
505
|
|
|
|
|
|
|
} |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head2 display_id |
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
Title : display_id or display_name |
|
511
|
|
|
|
|
|
|
Usage : $id_string = $seqobj->display_id(); |
|
512
|
|
|
|
|
|
|
Function: Get or set the display id, aka the common name of the sequence object. |
|
513
|
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
The semantics of this is that it is the most likely string to |
|
515
|
|
|
|
|
|
|
be used as an identifier of the sequence, and likely to have |
|
516
|
|
|
|
|
|
|
"human" readability. The id is equivalent to the ID field of |
|
517
|
|
|
|
|
|
|
the GenBank/EMBL databanks and the id field of the |
|
518
|
|
|
|
|
|
|
Swissprot/sptrembl database. In fasta format, the >(\S+) is |
|
519
|
|
|
|
|
|
|
presumed to be the id, though some people overload the id to |
|
520
|
|
|
|
|
|
|
embed other information. Bioperl does not use any embedded |
|
521
|
|
|
|
|
|
|
information in the ID field, and people are encouraged to use |
|
522
|
|
|
|
|
|
|
other mechanisms (accession field for example, or extending |
|
523
|
|
|
|
|
|
|
the sequence object) to solve this. |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
With the new Bio::DescribeableI interface, display_name aliases |
|
526
|
|
|
|
|
|
|
to this method. |
|
527
|
|
|
|
|
|
|
Returns : A string for the display ID |
|
528
|
|
|
|
|
|
|
Args : Optional string for the display ID to set |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=cut |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
sub display_id { |
|
533
|
23738
|
|
|
23738
|
1
|
37244
|
my ($self, $value) = @_; |
|
534
|
23738
|
100
|
|
|
|
34722
|
if( defined $value) { |
|
535
|
13993
|
|
|
|
|
21316
|
$self->{'display_id'} = $value; |
|
536
|
|
|
|
|
|
|
} |
|
537
|
23738
|
|
|
|
|
42199
|
return $self->{'display_id'}; |
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=head2 accession_number |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
Title : accession_number or object_id |
|
544
|
|
|
|
|
|
|
Usage : $unique_key = $seqobj->accession_number; |
|
545
|
|
|
|
|
|
|
Function: Returns the unique biological id for a sequence, commonly |
|
546
|
|
|
|
|
|
|
called the accession_number. For sequences from established |
|
547
|
|
|
|
|
|
|
databases, the implementors should try to use the correct |
|
548
|
|
|
|
|
|
|
accession number. Notice that primary_id() provides the |
|
549
|
|
|
|
|
|
|
unique id for the implementation, allowing multiple objects |
|
550
|
|
|
|
|
|
|
to have the same accession number in a particular implementation. |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
For sequences with no accession number, this method should |
|
553
|
|
|
|
|
|
|
return "unknown". |
|
554
|
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
[Note this method name is likely to change in 1.3] |
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
With the new Bio::IdentifiableI interface, this is aliased |
|
558
|
|
|
|
|
|
|
to object_id |
|
559
|
|
|
|
|
|
|
Returns : A string |
|
560
|
|
|
|
|
|
|
Args : A string (optional) for setting |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=cut |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
sub accession_number { |
|
565
|
900
|
|
|
900
|
1
|
1935
|
my( $self, $acc ) = @_; |
|
566
|
900
|
100
|
|
|
|
1794
|
if (defined $acc) { |
|
567
|
654
|
|
|
|
|
1127
|
$self->{'accession_number'} = $acc; |
|
568
|
|
|
|
|
|
|
} else { |
|
569
|
246
|
|
|
|
|
417
|
$acc = $self->{'accession_number'}; |
|
570
|
246
|
100
|
|
|
|
557
|
$acc = 'unknown' unless defined $acc; |
|
571
|
|
|
|
|
|
|
} |
|
572
|
900
|
|
|
|
|
1772
|
return $acc; |
|
573
|
|
|
|
|
|
|
} |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=head2 primary_id |
|
577
|
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
Title : primary_id |
|
579
|
|
|
|
|
|
|
Usage : $unique_key = $seqobj->primary_id; |
|
580
|
|
|
|
|
|
|
Function: Returns the unique id for this object in this |
|
581
|
|
|
|
|
|
|
implementation. This allows implementations to manage their |
|
582
|
|
|
|
|
|
|
own object ids in a way the implementation can control |
|
583
|
|
|
|
|
|
|
clients can expect one id to map to one object. |
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
For sequences with no natural primary id, this method |
|
586
|
|
|
|
|
|
|
should return a stringified memory location. |
|
587
|
|
|
|
|
|
|
Returns : A string |
|
588
|
|
|
|
|
|
|
Args : A string (optional, for setting) |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=cut |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
sub primary_id { |
|
593
|
511
|
|
|
511
|
1
|
913
|
my $self = shift; |
|
594
|
|
|
|
|
|
|
|
|
595
|
511
|
100
|
|
|
|
1004
|
if(@_) { |
|
596
|
453
|
|
|
|
|
717
|
$self->{'primary_id'} = shift; |
|
597
|
|
|
|
|
|
|
} |
|
598
|
511
|
100
|
|
|
|
1059
|
if( ! defined($self->{'primary_id'}) ) { |
|
599
|
21
|
|
|
|
|
250
|
return "$self"; |
|
600
|
|
|
|
|
|
|
} |
|
601
|
490
|
|
|
|
|
667
|
return $self->{'primary_id'}; |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head2 alphabet |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Title : alphabet |
|
608
|
|
|
|
|
|
|
Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something } |
|
609
|
|
|
|
|
|
|
Function: Get/set the alphabet of sequence, one of |
|
610
|
|
|
|
|
|
|
'dna', 'rna' or 'protein'. This is case sensitive. |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
This is not called because this would cause |
|
613
|
|
|
|
|
|
|
upgrade problems from the 0.5 and earlier Seq objects. |
|
614
|
|
|
|
|
|
|
Returns : a string either 'dna','rna','protein'. NB - the object must |
|
615
|
|
|
|
|
|
|
make a call of the type - if there is no alphabet specified it |
|
616
|
|
|
|
|
|
|
has to guess. |
|
617
|
|
|
|
|
|
|
Args : optional string to set : 'dna' | 'rna' | 'protein' |
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
=cut |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
sub alphabet { |
|
623
|
54742
|
|
|
54742
|
1
|
83427
|
my ($self,$value) = @_; |
|
624
|
54742
|
100
|
|
|
|
78401
|
if (defined $value) { |
|
625
|
27849
|
|
|
|
|
40535
|
$value = lc $value; |
|
626
|
27849
|
50
|
|
|
|
49051
|
unless ( $valid_type{$value} ) { |
|
627
|
0
|
|
|
|
|
0
|
$self->throw("Alphabet '$value' is not a valid alphabet (". |
|
628
|
|
|
|
|
|
|
join(',', map "'$_'", sort keys %valid_type) .") lowercase"); |
|
629
|
|
|
|
|
|
|
} |
|
630
|
27849
|
|
|
|
|
39547
|
$self->{'alphabet'} = $value; |
|
631
|
|
|
|
|
|
|
} |
|
632
|
54742
|
|
|
|
|
98214
|
return $self->{'alphabet'}; |
|
633
|
|
|
|
|
|
|
} |
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=head2 desc |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
Title : desc or description |
|
639
|
|
|
|
|
|
|
Usage : $seqobj->desc($newval); |
|
640
|
|
|
|
|
|
|
Function: Get/set description of the sequence. |
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
'description' is an alias for this for compliance with the |
|
643
|
|
|
|
|
|
|
Bio::DescribeableI interface. |
|
644
|
|
|
|
|
|
|
Returns : value of desc (a string) |
|
645
|
|
|
|
|
|
|
Args : newvalue (a string or undef, optional) |
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=cut |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
sub desc{ |
|
651
|
1711
|
|
|
1711
|
1
|
8288
|
my $self = shift; |
|
652
|
|
|
|
|
|
|
|
|
653
|
1711
|
100
|
|
|
|
4325
|
return $self->{'desc'} = shift if @_; |
|
654
|
454
|
|
|
|
|
1637
|
return $self->{'desc'}; |
|
655
|
|
|
|
|
|
|
} |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=head2 can_call_new |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
Title : can_call_new |
|
661
|
|
|
|
|
|
|
Usage : |
|
662
|
|
|
|
|
|
|
Function: |
|
663
|
|
|
|
|
|
|
Example : |
|
664
|
|
|
|
|
|
|
Returns : true |
|
665
|
|
|
|
|
|
|
Args : |
|
666
|
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
=cut |
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
sub can_call_new { |
|
670
|
10
|
|
|
10
|
1
|
98
|
my ($self) = @_; |
|
671
|
|
|
|
|
|
|
|
|
672
|
10
|
|
|
|
|
26
|
return 1; |
|
673
|
|
|
|
|
|
|
} |
|
674
|
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=head2 id |
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
Title : id |
|
679
|
|
|
|
|
|
|
Usage : $id = $seqobj->id(); |
|
680
|
|
|
|
|
|
|
Function: This is mapped on display_id |
|
681
|
|
|
|
|
|
|
Example : |
|
682
|
|
|
|
|
|
|
Returns : |
|
683
|
|
|
|
|
|
|
Args : |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
=cut |
|
686
|
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
sub id { |
|
688
|
9010
|
|
|
9010
|
1
|
19459
|
return shift->display_id(@_); |
|
689
|
|
|
|
|
|
|
} |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
=head2 is_circular |
|
693
|
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
Title : is_circular |
|
695
|
|
|
|
|
|
|
Usage : if( $seqobj->is_circular) { # Do something } |
|
696
|
|
|
|
|
|
|
Function: Returns true if the molecule is circular |
|
697
|
|
|
|
|
|
|
Returns : Boolean value |
|
698
|
|
|
|
|
|
|
Args : none |
|
699
|
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=cut |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
sub is_circular{ |
|
703
|
143408
|
|
|
143408
|
1
|
140853
|
my $self = shift; |
|
704
|
143408
|
100
|
|
|
|
189359
|
return $self->{'is_circular'} = shift if @_; |
|
705
|
143382
|
|
|
|
|
272558
|
return $self->{'is_circular'}; |
|
706
|
|
|
|
|
|
|
} |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
=head1 Methods for Bio::IdentifiableI compliance |
|
710
|
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
=head2 object_id |
|
712
|
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
Title : object_id |
|
714
|
|
|
|
|
|
|
Usage : $string = $seqobj->object_id(); |
|
715
|
|
|
|
|
|
|
Function: Get or set a string which represents the stable primary identifier |
|
716
|
|
|
|
|
|
|
in this namespace of this object. For DNA sequences this |
|
717
|
|
|
|
|
|
|
is its accession_number, similarly for protein sequences. |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
This is aliased to accession_number(). |
|
720
|
|
|
|
|
|
|
Returns : A scalar |
|
721
|
|
|
|
|
|
|
Args : Optional object ID to set. |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
=cut |
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
sub object_id { |
|
726
|
4
|
|
|
4
|
1
|
10
|
return shift->accession_number(@_); |
|
727
|
|
|
|
|
|
|
} |
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=head2 version |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
Title : version |
|
733
|
|
|
|
|
|
|
Usage : $version = $seqobj->version(); |
|
734
|
|
|
|
|
|
|
Function: Get or set a number which differentiates between versions of |
|
735
|
|
|
|
|
|
|
the same object. Higher numbers are considered to be |
|
736
|
|
|
|
|
|
|
later and more relevant, but a single object described |
|
737
|
|
|
|
|
|
|
the same identifier should represent the same concept. |
|
738
|
|
|
|
|
|
|
Returns : A number |
|
739
|
|
|
|
|
|
|
Args : Optional version to set. |
|
740
|
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=cut |
|
742
|
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
sub version{ |
|
744
|
3582
|
|
|
3582
|
1
|
4940
|
my ($self,$value) = @_; |
|
745
|
3582
|
100
|
|
|
|
5374
|
if( defined $value) { |
|
746
|
292
|
|
|
|
|
761
|
$self->{'_version'} = $value; |
|
747
|
|
|
|
|
|
|
} |
|
748
|
3582
|
|
|
|
|
6760
|
return $self->{'_version'}; |
|
749
|
|
|
|
|
|
|
} |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
=head2 authority |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
Title : authority |
|
755
|
|
|
|
|
|
|
Usage : $authority = $seqobj->authority(); |
|
756
|
|
|
|
|
|
|
Function: Get or set a string which represents the organisation which |
|
757
|
|
|
|
|
|
|
granted the namespace, written as the DNS name of the |
|
758
|
|
|
|
|
|
|
organisation (eg, wormbase.org). |
|
759
|
|
|
|
|
|
|
Returns : A scalar |
|
760
|
|
|
|
|
|
|
Args : Optional authority to set. |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
=cut |
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
sub authority { |
|
765
|
91
|
|
|
91
|
1
|
111
|
my ($self, $value) = @_; |
|
766
|
91
|
100
|
|
|
|
117
|
if( defined $value) { |
|
767
|
86
|
|
|
|
|
110
|
$self->{'authority'} = $value; |
|
768
|
|
|
|
|
|
|
} |
|
769
|
91
|
|
|
|
|
118
|
return $self->{'authority'}; |
|
770
|
|
|
|
|
|
|
} |
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=head2 namespace |
|
774
|
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
Title : namespace |
|
776
|
|
|
|
|
|
|
Usage : $string = $seqobj->namespace(); |
|
777
|
|
|
|
|
|
|
Function: Get or set a string representing the name space this identifier |
|
778
|
|
|
|
|
|
|
is valid in, often the database name or the name describing the |
|
779
|
|
|
|
|
|
|
collection. |
|
780
|
|
|
|
|
|
|
Returns : A scalar |
|
781
|
|
|
|
|
|
|
Args : Optional namespace to set. |
|
782
|
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=cut |
|
784
|
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
sub namespace{ |
|
786
|
527
|
|
|
527
|
1
|
847
|
my ($self,$value) = @_; |
|
787
|
527
|
100
|
|
|
|
837
|
if( defined $value) { |
|
788
|
493
|
|
|
|
|
937
|
$self->{'namespace'} = $value; |
|
789
|
|
|
|
|
|
|
} |
|
790
|
527
|
|
100
|
|
|
1154
|
return $self->{'namespace'} || ""; |
|
791
|
|
|
|
|
|
|
} |
|
792
|
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
=head1 Methods for Bio::DescribableI compliance |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
This comprises of display_name and description. |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=head2 display_name |
|
799
|
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
Title : display_name |
|
801
|
|
|
|
|
|
|
Usage : $string = $seqobj->display_name(); |
|
802
|
|
|
|
|
|
|
Function: Get or set a string which is what should be displayed to the user. |
|
803
|
|
|
|
|
|
|
The string should have no spaces (ideally, though a cautious |
|
804
|
|
|
|
|
|
|
user of this interface would not assume this) and should be |
|
805
|
|
|
|
|
|
|
less than thirty characters (though again, double checking |
|
806
|
|
|
|
|
|
|
this is a good idea). |
|
807
|
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
This is aliased to display_id(). |
|
809
|
|
|
|
|
|
|
Returns : A string for the display name |
|
810
|
|
|
|
|
|
|
Args : Optional string for the display name to set. |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=cut |
|
813
|
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
sub display_name { |
|
815
|
2
|
|
|
2
|
1
|
7
|
return shift->display_id(@_); |
|
816
|
|
|
|
|
|
|
} |
|
817
|
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head2 description |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Title : description |
|
822
|
|
|
|
|
|
|
Usage : $string = $seqobj->description(); |
|
823
|
|
|
|
|
|
|
Function: Get or set a text string suitable for displaying to the user a |
|
824
|
|
|
|
|
|
|
description. This string is likely to have spaces, but |
|
825
|
|
|
|
|
|
|
should not have any newlines or formatting - just plain |
|
826
|
|
|
|
|
|
|
text. The string should not be greater than 255 characters |
|
827
|
|
|
|
|
|
|
and clients can feel justified at truncating strings at 255 |
|
828
|
|
|
|
|
|
|
characters for the purposes of display. |
|
829
|
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
This is aliased to desc(). |
|
831
|
|
|
|
|
|
|
Returns : A string for the description |
|
832
|
|
|
|
|
|
|
Args : Optional string for the description to set. |
|
833
|
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
=cut |
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
sub description { |
|
837
|
91
|
|
|
91
|
1
|
202
|
return shift->desc(@_); |
|
838
|
|
|
|
|
|
|
} |
|
839
|
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=head1 Methods Inherited from Bio::PrimarySeqI |
|
842
|
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
These methods are available on Bio::PrimarySeq, although they are |
|
844
|
|
|
|
|
|
|
actually implemented on Bio::PrimarySeqI |
|
845
|
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=head2 revcom |
|
847
|
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
Title : revcom |
|
849
|
|
|
|
|
|
|
Usage : $rev = $seqobj->revcom(); |
|
850
|
|
|
|
|
|
|
Function: Produces a new Bio::SeqI implementing object which |
|
851
|
|
|
|
|
|
|
is the reversed complement of the sequence. For protein |
|
852
|
|
|
|
|
|
|
sequences this throws an exception of |
|
853
|
|
|
|
|
|
|
"Sequence is a protein. Cannot revcom". |
|
854
|
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
The id is the same id as the original sequence, and the |
|
856
|
|
|
|
|
|
|
accession number is also identical. If someone wants to |
|
857
|
|
|
|
|
|
|
track that this sequence has be reversed, it needs to |
|
858
|
|
|
|
|
|
|
define its own extensions. |
|
859
|
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
To do an inplace edit of an object you can go: |
|
861
|
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
$seqobj = $seqobj->revcom(); |
|
863
|
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
This of course, causes Perl to handle the garbage |
|
865
|
|
|
|
|
|
|
collection of the old object, but it is roughly speaking as |
|
866
|
|
|
|
|
|
|
efficient as an inplace edit. |
|
867
|
|
|
|
|
|
|
Returns : A new (fresh) Bio::SeqI object |
|
868
|
|
|
|
|
|
|
Args : none |
|
869
|
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
=head2 trunc |
|
871
|
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
Title : trunc |
|
873
|
|
|
|
|
|
|
Usage : $subseq = $myseq->trunc(10,100); |
|
874
|
|
|
|
|
|
|
Function: Provides a truncation of a sequence, |
|
875
|
|
|
|
|
|
|
Returns : A fresh Bio::SeqI implementing object. |
|
876
|
|
|
|
|
|
|
Args : Numbers for the start and end positions |
|
877
|
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=head1 Internal methods |
|
879
|
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
These are internal methods to PrimarySeq |
|
881
|
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
=head2 _guess_alphabet |
|
883
|
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
Title : _guess_alphabet |
|
885
|
|
|
|
|
|
|
Usage : |
|
886
|
|
|
|
|
|
|
Function: Automatically guess and set the type of sequence: dna, rna, protein |
|
887
|
|
|
|
|
|
|
or '' if the sequence was empty. This method first removes dots (.), |
|
888
|
|
|
|
|
|
|
dashes (-) and question marks (?) before guessing the alphabet |
|
889
|
|
|
|
|
|
|
using the IUPAC conventions for ambiguous residues. Since the DNA and |
|
890
|
|
|
|
|
|
|
RNA characters are also valid characters for proteins, there is |
|
891
|
|
|
|
|
|
|
no foolproof way of determining the right alphabet. This is our best |
|
892
|
|
|
|
|
|
|
guess only! |
|
893
|
|
|
|
|
|
|
Returns : string 'dna', 'rna', 'protein' or ''. |
|
894
|
|
|
|
|
|
|
Args : none |
|
895
|
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=cut |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
sub _guess_alphabet { |
|
899
|
15781
|
|
|
15781
|
|
20920
|
my ($self) = @_; |
|
900
|
|
|
|
|
|
|
# Guess alphabet |
|
901
|
15781
|
|
|
|
|
25791
|
my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'}); |
|
902
|
|
|
|
|
|
|
# Set alphabet unless it is unknown |
|
903
|
15781
|
100
|
|
|
|
39836
|
$self->alphabet($alphabet) if $alphabet; |
|
904
|
15781
|
|
|
|
|
20107
|
return $alphabet; |
|
905
|
|
|
|
|
|
|
} |
|
906
|
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
sub _guess_alphabet_from_string { |
|
909
|
|
|
|
|
|
|
# Get the alphabet from a sequence string |
|
910
|
18778
|
|
|
18778
|
|
26845
|
my ($self, $str, $nowarnonempty) = @_; |
|
911
|
|
|
|
|
|
|
|
|
912
|
18778
|
100
|
|
|
|
29371
|
$nowarnonempty = 0 if not defined $nowarnonempty; |
|
913
|
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
# Remove chars that clearly don't denote nucleic or amino acids |
|
915
|
18778
|
|
|
|
|
59226
|
$str =~ s/[-.?]//gi; |
|
916
|
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
# Check for sequences without valid letters |
|
918
|
18778
|
|
|
|
|
19706
|
my $alphabet; |
|
919
|
18778
|
|
|
|
|
21524
|
my $total = CORE::length($str); |
|
920
|
18778
|
100
|
|
|
|
28552
|
if( $total == 0 ) { |
|
921
|
2
|
100
|
|
|
|
5
|
if (not $nowarnonempty) { |
|
922
|
1
|
|
|
|
|
7
|
$self->warn("Got a sequence without letters. Could not guess alphabet"); |
|
923
|
|
|
|
|
|
|
} |
|
924
|
2
|
|
|
|
|
5
|
$alphabet = ''; |
|
925
|
|
|
|
|
|
|
} |
|
926
|
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
# Determine alphabet now |
|
928
|
18778
|
100
|
|
|
|
28130
|
if (not defined $alphabet) { |
|
929
|
18776
|
100
|
|
|
|
35567
|
if ($str =~ m/[EFIJLOPQXZ]/i) { |
|
930
|
|
|
|
|
|
|
# Start with a safe method to find proteins. |
|
931
|
|
|
|
|
|
|
# Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z |
|
932
|
1758
|
|
|
|
|
2464
|
$alphabet = 'protein'; |
|
933
|
|
|
|
|
|
|
} else { |
|
934
|
|
|
|
|
|
|
# Alphabet is unsure, could still be DNA, RNA or protein |
|
935
|
|
|
|
|
|
|
# DNA and RNA contain mostly A, T, U, G, C and N, but the other |
|
936
|
|
|
|
|
|
|
# letters they use are also among the 15 valid letters that a |
|
937
|
|
|
|
|
|
|
# protein sequence can contain at this stage. Make our best guess |
|
938
|
|
|
|
|
|
|
# based on sequence composition. If it contains over 70% of ACGTUN, |
|
939
|
|
|
|
|
|
|
# it is likely nucleic. |
|
940
|
17018
|
100
|
|
|
|
40208
|
if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) { |
|
941
|
16011
|
100
|
|
|
|
29402
|
if ( $str =~ m/U/i ) { |
|
942
|
53
|
|
|
|
|
92
|
$alphabet = 'rna'; |
|
943
|
|
|
|
|
|
|
} else { |
|
944
|
15958
|
|
|
|
|
19898
|
$alphabet = 'dna'; |
|
945
|
|
|
|
|
|
|
} |
|
946
|
|
|
|
|
|
|
} else { |
|
947
|
1007
|
|
|
|
|
1435
|
$alphabet = 'protein'; |
|
948
|
|
|
|
|
|
|
} |
|
949
|
|
|
|
|
|
|
} |
|
950
|
|
|
|
|
|
|
} |
|
951
|
|
|
|
|
|
|
|
|
952
|
18778
|
|
|
|
|
31618
|
return $alphabet; |
|
953
|
|
|
|
|
|
|
} |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
############################################################################ |
|
957
|
|
|
|
|
|
|
# aliases due to name changes or to compensate for our lack of consistency # |
|
958
|
|
|
|
|
|
|
############################################################################ |
|
959
|
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
sub accession { |
|
961
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
|
962
|
|
|
|
|
|
|
|
|
963
|
0
|
|
|
|
|
|
$self->warn(ref($self)."::accession is deprecated, ". |
|
964
|
|
|
|
|
|
|
"use accession_number() instead"); |
|
965
|
0
|
|
|
|
|
|
return $self->accession_number(@_); |
|
966
|
|
|
|
|
|
|
} |
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
1; |