line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# bioperl module for Bio::PrimarySeq |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to <bioperl-l@bioperl.org> |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Ewan Birney |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Ewan Birney |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::PrimarySeq - Bioperl lightweight sequence object |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Bio::SeqIO for file reading, Bio::DB::GenBank for |
21
|
|
|
|
|
|
|
# database reading |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Bio::Seq; |
24
|
|
|
|
|
|
|
use Bio::SeqIO; |
25
|
|
|
|
|
|
|
use Bio::DB::GenBank; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# make from memory |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
$seqobj = Bio::PrimarySeq->new ( |
30
|
|
|
|
|
|
|
-seq => 'ATGGGGTGGGCGGTGGGTGGTTTG', |
31
|
|
|
|
|
|
|
-id => 'GeneFragment-12', |
32
|
|
|
|
|
|
|
-accession_number => 'X78121', |
33
|
|
|
|
|
|
|
-alphabet => 'dna', |
34
|
|
|
|
|
|
|
-is_circular => 1, |
35
|
|
|
|
|
|
|
); |
36
|
|
|
|
|
|
|
print "Sequence ", $seqobj->id(), " with accession ", |
37
|
|
|
|
|
|
|
$seqobj->accession_number, "\n"; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# read from file |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
$inputstream = Bio::SeqIO->new( |
42
|
|
|
|
|
|
|
-file => "myseq.fa", |
43
|
|
|
|
|
|
|
-format => 'Fasta', |
44
|
|
|
|
|
|
|
); |
45
|
|
|
|
|
|
|
$seqobj = $inputstream->next_seq(); |
46
|
|
|
|
|
|
|
print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n"; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# to get out parts of the sequence. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
print "Sequence ", $seqobj->id(), " with accession ", |
51
|
|
|
|
|
|
|
$seqobj->accession_number, " and desc ", $seqobj->desc, "\n"; |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
$string = $seqobj->seq(); |
54
|
|
|
|
|
|
|
$string2 = $seqobj->subseq(1,40); |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 DESCRIPTION |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
PrimarySeq is a lightweight sequence object, storing the sequence, its |
59
|
|
|
|
|
|
|
name, a computer-useful unique name, and other fundamental attributes. |
60
|
|
|
|
|
|
|
It does not contain sequence features or other information. To have a |
61
|
|
|
|
|
|
|
sequence with sequence features you should use the Seq object which uses |
62
|
|
|
|
|
|
|
this object. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Although new users will use Bio::PrimarySeq a lot, in general you will |
65
|
|
|
|
|
|
|
be using it from the Bio::Seq object. For more information on Bio::Seq |
66
|
|
|
|
|
|
|
see L<Bio::Seq>. For interest you might like to know that |
67
|
|
|
|
|
|
|
Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls |
68
|
|
|
|
|
|
|
to do with sequence to it (the has-a relationship lets us get out of a |
69
|
|
|
|
|
|
|
otherwise nasty cyclical reference in Perl which would leak memory). |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Sequence objects are defined by the Bio::PrimarySeqI interface, and this |
72
|
|
|
|
|
|
|
object is a pure Perl implementation of the interface. If that's |
73
|
|
|
|
|
|
|
gibberish to you, don't worry. The take home message is that this |
74
|
|
|
|
|
|
|
object is the bioperl default sequence object, but other people can |
75
|
|
|
|
|
|
|
use their own objects as sequences if they so wish. If you are |
76
|
|
|
|
|
|
|
interested in wrapping your own objects as compliant Bioperl sequence |
77
|
|
|
|
|
|
|
objects, then you should read the Bio::PrimarySeqI documentation. |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
The documentation of this object is a merge of the Bio::PrimarySeq and |
80
|
|
|
|
|
|
|
Bio::PrimarySeqI documentation. This allows all the methods which you can |
81
|
|
|
|
|
|
|
call on sequence objects here. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head1 FEEDBACK |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=head2 Mailing Lists |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
88
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
89
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
92
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=head2 Support |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
I<bioperl-l@bioperl.org> |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
101
|
|
|
|
|
|
|
responsive experts will be able to look at the problem and quickly |
102
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
103
|
|
|
|
|
|
|
with code and data examples if at all possible. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head2 Reporting Bugs |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
108
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via the |
109
|
|
|
|
|
|
|
web: |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head1 AUTHOR - Ewan Birney |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Email birney@ebi.ac.uk |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
=head1 APPENDIX |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
120
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=cut |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
package Bio::PrimarySeq; |
127
|
|
|
|
|
|
|
|
128
|
203
|
|
|
203
|
|
8216
|
use strict; |
|
203
|
|
|
|
|
316
|
|
|
203
|
|
|
|
|
8952
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~'; |
131
|
|
|
|
|
|
|
our $GAP_SYMBOLS = '-~'; |
132
|
|
|
|
|
|
|
|
133
|
203
|
|
|
|
|
76876
|
use base qw(Bio::Root::Root Bio::PrimarySeqI |
134
|
203
|
|
|
203
|
|
657
|
Bio::IdentifiableI Bio::DescribableI); |
|
203
|
|
|
|
|
282
|
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# Setup the allowed values for alphabet() |
138
|
|
|
|
|
|
|
my %valid_type = map {$_, 1} qw( dna rna protein ); |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head2 new |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Title : new |
144
|
|
|
|
|
|
|
Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT', |
145
|
|
|
|
|
|
|
-id => 'human_id', |
146
|
|
|
|
|
|
|
-accession_number => 'AL000012', |
147
|
|
|
|
|
|
|
); |
148
|
|
|
|
|
|
|
Function: Returns a new primary seq object from |
149
|
|
|
|
|
|
|
basic constructors, being a string for the sequence |
150
|
|
|
|
|
|
|
and strings for id and accession_number. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Note that you can provide an empty sequence string. However, in |
153
|
|
|
|
|
|
|
this case you MUST specify the type of sequence you wish to |
154
|
|
|
|
|
|
|
initialize by the parameter -alphabet. See alphabet() for possible |
155
|
|
|
|
|
|
|
values. |
156
|
|
|
|
|
|
|
Returns : a new Bio::PrimarySeq object |
157
|
|
|
|
|
|
|
Args : -seq => sequence string |
158
|
|
|
|
|
|
|
-ref_to_seq => ... or reference to a sequence string |
159
|
|
|
|
|
|
|
-display_id => display id of the sequence (locus name) |
160
|
|
|
|
|
|
|
-accession_number => accession number |
161
|
|
|
|
|
|
|
-primary_id => primary id (Genbank id) |
162
|
|
|
|
|
|
|
-version => version number |
163
|
|
|
|
|
|
|
-namespace => the namespace for the accession |
164
|
|
|
|
|
|
|
-authority => the authority for the namespace |
165
|
|
|
|
|
|
|
-description => description text |
166
|
|
|
|
|
|
|
-desc => alias for description |
167
|
|
|
|
|
|
|
-alphabet => skip alphabet guess and set it to dna, rna or protein |
168
|
|
|
|
|
|
|
-id => alias for display id |
169
|
|
|
|
|
|
|
-is_circular => boolean to indicate that sequence is circular |
170
|
|
|
|
|
|
|
-direct => boolean to directly set sequences. The next time -seq, |
171
|
|
|
|
|
|
|
seq() or -ref_to_seq is used, the sequence will not be |
172
|
|
|
|
|
|
|
validated. Be careful with this... |
173
|
|
|
|
|
|
|
-nowarnonempty => boolean to avoid warning when sequence is empty |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=cut |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub new { |
178
|
14995
|
|
|
14995
|
1
|
38180
|
my ($class, @args) = @_; |
179
|
14995
|
|
|
|
|
28824
|
my $self = $class->SUPER::new(@args); |
180
|
14995
|
|
|
|
|
55120
|
my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description, |
181
|
|
|
|
|
|
|
$alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len, |
182
|
|
|
|
|
|
|
$nowarnonempty) = |
183
|
|
|
|
|
|
|
$self->_rearrange([qw(SEQ |
184
|
|
|
|
|
|
|
DISPLAY_ID |
185
|
|
|
|
|
|
|
ACCESSION_NUMBER |
186
|
|
|
|
|
|
|
PRIMARY_ID |
187
|
|
|
|
|
|
|
NAMESPACE |
188
|
|
|
|
|
|
|
AUTHORITY |
189
|
|
|
|
|
|
|
VERSION |
190
|
|
|
|
|
|
|
OBJECT_ID |
191
|
|
|
|
|
|
|
DESC |
192
|
|
|
|
|
|
|
DESCRIPTION |
193
|
|
|
|
|
|
|
ALPHABET |
194
|
|
|
|
|
|
|
ID |
195
|
|
|
|
|
|
|
IS_CIRCULAR |
196
|
|
|
|
|
|
|
DIRECT |
197
|
|
|
|
|
|
|
REF_TO_SEQ |
198
|
|
|
|
|
|
|
LENGTH |
199
|
|
|
|
|
|
|
NOWARNONEMPTY |
200
|
|
|
|
|
|
|
)], |
201
|
|
|
|
|
|
|
@args); |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Private var _nowarnonempty, needs to be set before calling _guess_alphabet |
204
|
14995
|
|
|
|
|
44412
|
$self->{'_nowarnonempty'} = $nowarnonempty; |
205
|
14995
|
|
|
|
|
13599
|
$self->{'_direct'} = $direct; |
206
|
|
|
|
|
|
|
|
207
|
14995
|
100
|
100
|
|
|
30508
|
if( defined $id && defined $given_id ) { |
208
|
6
|
50
|
|
|
|
12
|
if( $id ne $given_id ) { |
209
|
0
|
|
|
|
|
0
|
$self->throw("Provided both id and display_id constructors: [$id] [$given_id]"); |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
14995
|
100
|
|
|
|
19768
|
if( defined $given_id ) { $id = $given_id; } |
|
11740
|
|
|
|
|
9756
|
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# Bernd's idea: set ids now for more informative invalid sequence messages |
215
|
14995
|
100
|
|
|
|
30132
|
defined $id && $self->display_id($id); |
216
|
14995
|
100
|
|
|
|
20689
|
$acc && $self->accession_number($acc); |
217
|
14995
|
100
|
|
|
|
19517
|
defined $pid && $self->primary_id($pid); |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# Set alphabet now to avoid guessing it later, when sequence is set |
220
|
14995
|
100
|
|
|
|
26207
|
$alphabet && $self->alphabet($alphabet); |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# Set the length before the seq. If there is a seq, length will be updated later |
223
|
14995
|
|
100
|
|
|
38849
|
$self->{'length'} = $len || 0; |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# Set the sequence (but also alphabet and length) |
226
|
14995
|
100
|
|
|
|
18557
|
if ($ref_to_seq) { |
227
|
1
|
|
|
|
|
2
|
$self->_set_seq_by_ref($ref_to_seq, $alphabet); |
228
|
|
|
|
|
|
|
} else { |
229
|
14994
|
100
|
|
|
|
20259
|
if (defined $seq) { |
230
|
|
|
|
|
|
|
# Note: the sequence string may be empty |
231
|
14268
|
|
|
|
|
19504
|
$self->seq($seq); |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
14993
|
100
|
|
|
|
20389
|
$desc && $self->desc($desc); |
236
|
14993
|
100
|
|
|
|
17842
|
$description && $self->description($description); |
237
|
14993
|
100
|
|
|
|
17912
|
$ns && $self->namespace($ns); |
238
|
14993
|
100
|
|
|
|
18510
|
$auth && $self->authority($auth); |
239
|
|
|
|
|
|
|
# Any variable that can have a value "0" must be tested with defined |
240
|
|
|
|
|
|
|
# or it will fail to be added to the new object |
241
|
14993
|
100
|
|
|
|
18812
|
defined($v) && $self->version($v); |
242
|
14993
|
50
|
|
|
|
19852
|
defined($oid) && $self->object_id($oid); |
243
|
14993
|
100
|
|
|
|
17984
|
defined($is_circular) && $self->is_circular($is_circular); |
244
|
|
|
|
|
|
|
|
245
|
14993
|
|
|
|
|
39727
|
return $self; |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 seq |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
Title : seq |
252
|
|
|
|
|
|
|
Usage : $string = $seqobj->seq(); |
253
|
|
|
|
|
|
|
Function: Get or set the sequence as a string of letters. The case of |
254
|
|
|
|
|
|
|
the letters is left up to the implementer. Suggested cases are |
255
|
|
|
|
|
|
|
upper case for proteins and lower case for DNA sequence (IUPAC |
256
|
|
|
|
|
|
|
standard), but you should not rely on this. An error is thrown if |
257
|
|
|
|
|
|
|
the sequence contains invalid characters: see validate_seq(). |
258
|
|
|
|
|
|
|
Returns : A scalar |
259
|
|
|
|
|
|
|
Args : - Optional new sequence value (a string) to set |
260
|
|
|
|
|
|
|
- Optional alphabet (it is guessed by default) |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
=cut |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
sub seq { |
265
|
168456
|
|
|
168456
|
1
|
175702
|
my ($self, @args) = @_; |
266
|
|
|
|
|
|
|
|
267
|
168456
|
100
|
|
|
|
208552
|
if( scalar @args == 0 ) { |
268
|
141004
|
|
|
|
|
300461
|
return $self->{'seq'}; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
27452
|
|
|
|
|
25481
|
my ($seq_str, $alphabet) = @args; |
272
|
27452
|
50
|
|
|
|
35374
|
if (@args) { |
273
|
27452
|
|
|
|
|
35435
|
$self->_set_seq_by_ref(\$seq_str, $alphabet); |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
|
276
|
27449
|
|
|
|
|
37834
|
return $self->{'seq'}; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
sub _set_seq_by_ref { |
281
|
|
|
|
|
|
|
# Set a sequence by reference. A reference is used to avoid the cost of |
282
|
|
|
|
|
|
|
# copying the sequence (which can be very large) between functions. |
283
|
27453
|
|
|
27453
|
|
22933
|
my ($self, $seq_str_ref, $alphabet) = @_; |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Validate sequence if sequence is not empty and we are not in direct mode |
286
|
27453
|
100
|
100
|
|
|
83254
|
if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) { |
287
|
27281
|
|
|
|
|
38720
|
$self->validate_seq($$seq_str_ref, 1); |
288
|
|
|
|
|
|
|
} |
289
|
27450
|
|
|
|
|
28453
|
delete $self->{'_direct'}; # next sequence will have to be validated |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# Record sequence length |
292
|
27450
|
|
100
|
|
|
39888
|
my $len = CORE::length($$seq_str_ref || ''); |
293
|
27450
|
|
100
|
|
|
53983
|
my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0); |
294
|
|
|
|
|
|
|
# Note: if the new seq is empty or undef, this is not considered a change |
295
|
27450
|
100
|
|
|
|
37237
|
delete $self->{'_freeze_length'} if $is_changed_seq; |
296
|
27450
|
100
|
|
|
|
39525
|
$self->{'length'} = $len if not exists $self->{'_freeze_length'}; |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# Set sequence |
299
|
27450
|
|
|
|
|
27382
|
$self->{'seq'} = $$seq_str_ref; |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Set or guess alphabet |
302
|
27450
|
100
|
100
|
|
|
66844
|
if ($alphabet) { |
|
|
100
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# Alphabet specified, set it no matter what |
304
|
18
|
|
|
|
|
24
|
$self->alphabet($alphabet); |
305
|
|
|
|
|
|
|
} elsif ($is_changed_seq || (! defined($self->alphabet()))) { |
306
|
|
|
|
|
|
|
# If we changed a previous sequence to a new one or if there is no |
307
|
|
|
|
|
|
|
# alphabet yet at all, we need to guess the (possibly new) alphabet |
308
|
15780
|
|
|
|
|
21838
|
$self->_guess_alphabet(); |
309
|
|
|
|
|
|
|
} # else (seq not changed and alphabet was defined) do nothing |
310
|
|
|
|
|
|
|
|
311
|
27450
|
|
|
|
|
28287
|
return 1; |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=head2 validate_seq |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Title : validate_seq |
318
|
|
|
|
|
|
|
Usage : if(! $seqobj->validate_seq($seq_str) ) { |
319
|
|
|
|
|
|
|
print "sequence $seq_str is not valid for an object of |
320
|
|
|
|
|
|
|
alphabet ",$seqobj->alphabet, "\n"; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
Function: Test that the given sequence is valid, i.e. contains only valid |
323
|
|
|
|
|
|
|
characters. The allowed characters are all letters (A-Z) and '-','.', |
324
|
|
|
|
|
|
|
'*','?','=' and '~'. Spaces are not valid. Note that this |
325
|
|
|
|
|
|
|
implementation does not take alphabet() into account and that empty |
326
|
|
|
|
|
|
|
sequences are considered valid. |
327
|
|
|
|
|
|
|
Returns : 1 if the supplied sequence string is valid, 0 otherwise. |
328
|
|
|
|
|
|
|
Args : - Sequence string to be validated |
329
|
|
|
|
|
|
|
- Boolean to optionally throw an error if the sequence is invalid |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=cut |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub validate_seq { |
334
|
23098
|
|
|
23098
|
1
|
20666
|
my ($self, $seqstr, $throw) = @_; |
335
|
23098
|
100
|
100
|
|
|
134851
|
if ( (defined $seqstr ) && |
336
|
|
|
|
|
|
|
($seqstr !~ /^[$MATCHPATTERN]*$/) ) { |
337
|
8
|
100
|
|
|
|
14
|
if ($throw) { |
338
|
3
|
|
50
|
|
|
7
|
$self->throw("Failed validation of sequence '".(defined($self->id) || |
339
|
|
|
|
|
|
|
'[unidentified sequence]')."'. Invalid characters were: " . |
340
|
|
|
|
|
|
|
join('',($seqstr =~ /[^$MATCHPATTERN]/g))); |
341
|
|
|
|
|
|
|
} |
342
|
5
|
|
|
|
|
19
|
return 0; |
343
|
|
|
|
|
|
|
} |
344
|
23090
|
|
|
|
|
26157
|
return 1; |
345
|
|
|
|
|
|
|
} |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=head2 subseq |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
Title : subseq |
351
|
|
|
|
|
|
|
Usage : $substring = $seqobj->subseq(10,40); |
352
|
|
|
|
|
|
|
$substring = $seqobj->subseq(10,40,'nogap'); |
353
|
|
|
|
|
|
|
$substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga'); |
354
|
|
|
|
|
|
|
$substring = $seqobj->subseq($location_obj); |
355
|
|
|
|
|
|
|
$substring = $seqobj->subseq($location_obj, -nogap => 1); |
356
|
|
|
|
|
|
|
Function: Return the subseq from start to end, where the first sequence |
357
|
|
|
|
|
|
|
character has coordinate 1 and the range is inclusive, i.e. 1-2 are the |
358
|
|
|
|
|
|
|
first two characters of the sequence. The given start coordinate |
359
|
|
|
|
|
|
|
must not be larger than the end, even if the sequence is circular. |
360
|
|
|
|
|
|
|
Returns : a string |
361
|
|
|
|
|
|
|
Args : integer for start position |
362
|
|
|
|
|
|
|
integer for end position |
363
|
|
|
|
|
|
|
OR |
364
|
|
|
|
|
|
|
Bio::LocationI location for subseq (strand honored) |
365
|
|
|
|
|
|
|
Specify -NOGAP=>1 to return subseq with gap characters removed |
366
|
|
|
|
|
|
|
Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned |
367
|
|
|
|
|
|
|
with $new_subseq in the sequence object |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=cut |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub subseq { |
372
|
10039
|
|
|
10039
|
1
|
6913
|
my $self = shift; |
373
|
10039
|
|
|
|
|
9450
|
my @args = @_; |
374
|
10039
|
|
|
|
|
19216
|
my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START |
375
|
|
|
|
|
|
|
END |
376
|
|
|
|
|
|
|
NOGAP |
377
|
|
|
|
|
|
|
REPLACE_WITH)], @args); |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
# If -replace_with is specified, validate the replacement sequence |
380
|
10039
|
100
|
|
|
|
14740
|
if (defined $replace) { |
381
|
2
|
100
|
|
|
|
5
|
$self->validate_seq( $replace ) || |
382
|
|
|
|
|
|
|
$self->throw("Replacement sequence does not look valid"); |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
10038
|
100
|
66
|
|
|
32809
|
if( ref($start) && $start->isa('Bio::LocationI') ) { |
|
|
50
|
33
|
|
|
|
|
386
|
52
|
|
|
|
|
41
|
my $loc = $start; |
387
|
52
|
|
|
|
|
42
|
my $seq = ''; |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# For Split objects if Guide Strand is negative, |
390
|
|
|
|
|
|
|
# pass the sublocations in reverse |
391
|
52
|
|
|
|
|
44
|
my $order = 0; |
392
|
52
|
100
|
|
|
|
119
|
if ($loc->isa('Bio::Location::SplitLocationI')) { |
393
|
|
|
|
|
|
|
# guide_strand can return undef, so don't compare directly |
394
|
|
|
|
|
|
|
# to avoid 'uninitialized value' warning |
395
|
44
|
100
|
|
|
|
70
|
my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0; |
396
|
44
|
100
|
|
|
|
52
|
$order = ($guide_strand == -1) ? -1 : 0; |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
# Reversing order using ->each_Location(-1) does not work well for |
399
|
|
|
|
|
|
|
# cut by origin-splits (like "complement(join(1900..END,START..50))"), |
400
|
|
|
|
|
|
|
# so use "reverse" instead |
401
|
52
|
100
|
|
|
|
108
|
my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location; |
402
|
52
|
|
|
|
|
57
|
foreach my $subloc (@sublocs) { |
403
|
120
|
|
|
|
|
244
|
my $piece = $self->subseq(-start => $subloc->start(), |
404
|
|
|
|
|
|
|
-end => $subloc->end(), |
405
|
|
|
|
|
|
|
-replace_with => $replace, |
406
|
|
|
|
|
|
|
-nogap => $nogap); |
407
|
120
|
100
|
|
|
|
177
|
$piece =~ s/[$GAP_SYMBOLS]//g if $nogap; |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# strand can return undef, so don't compare directly |
410
|
|
|
|
|
|
|
# to avoid 'uninitialized value' warning |
411
|
120
|
100
|
|
|
|
176
|
my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0; |
412
|
120
|
100
|
|
|
|
178
|
if ($strand < 0) { |
413
|
59
|
|
|
|
|
77
|
$piece = $self->_revcom_from_string($piece, $self->alphabet); |
414
|
|
|
|
|
|
|
} |
415
|
120
|
|
|
|
|
158
|
$seq .= $piece; |
416
|
|
|
|
|
|
|
} |
417
|
52
|
|
|
|
|
196
|
return $seq; |
418
|
|
|
|
|
|
|
} elsif( defined $start && defined $end ) { |
419
|
9986
|
50
|
|
|
|
11682
|
if( $start > $end ){ |
420
|
0
|
|
|
|
|
0
|
$self->throw("Bad start,end parameters. Start [$start] has to be ". |
421
|
|
|
|
|
|
|
"less than end [$end]"); |
422
|
|
|
|
|
|
|
} |
423
|
9986
|
50
|
|
|
|
11248
|
if( $start <= 0 ) { |
424
|
0
|
|
|
|
|
0
|
$self->throw("Bad start parameter ($start). Start must be positive."); |
425
|
|
|
|
|
|
|
} |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
# Remove one from start, and then length is end-start |
428
|
9986
|
|
|
|
|
6381
|
$start--; |
429
|
|
|
|
|
|
|
|
430
|
9986
|
|
|
|
|
5771
|
my $seqstr; |
431
|
9986
|
100
|
|
|
|
9392
|
if (defined $replace) { |
432
|
1
|
|
|
|
|
4
|
$seqstr = substr $self->{seq}, $start, $end-$start, $replace; |
433
|
|
|
|
|
|
|
} else { |
434
|
9985
|
|
|
|
|
12949
|
$seqstr = substr $self->{seq}, $start, $end-$start; |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
|
438
|
9986
|
100
|
|
|
|
11061
|
if ($end > $self->length) { |
439
|
1
|
50
|
|
|
|
3
|
if ($self->is_circular) { |
440
|
1
|
|
|
|
|
2
|
my $start = 0; |
441
|
1
|
|
|
|
|
2
|
my $end = $end - $self->length; |
442
|
|
|
|
|
|
|
|
443
|
1
|
|
|
|
|
1
|
my $appendstr; |
444
|
1
|
50
|
|
|
|
2
|
if (defined $replace) { |
445
|
0
|
|
|
|
|
0
|
$appendstr = substr $self->{seq}, $start, $end-$start, $replace; |
446
|
|
|
|
|
|
|
} else { |
447
|
1
|
|
|
|
|
2
|
$appendstr = substr $self->{seq}, $start, $end-$start; |
448
|
|
|
|
|
|
|
} |
449
|
|
|
|
|
|
|
|
450
|
1
|
|
|
|
|
1
|
$seqstr .= $appendstr; |
451
|
|
|
|
|
|
|
} else { |
452
|
0
|
|
|
|
|
0
|
$self->throw("Bad end parameter ($end). End must be less than ". |
453
|
|
|
|
|
|
|
"the total length of sequence (total=".$self->length.")") |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
9986
|
100
|
|
|
|
11462
|
$seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap); |
458
|
9986
|
|
|
|
|
16173
|
return $seqstr; |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
} else { |
461
|
0
|
|
|
|
|
0
|
$self->warn("Incorrect parameters to subseq - must be two integers or ". |
462
|
|
|
|
|
|
|
"a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap); |
463
|
0
|
|
|
|
|
0
|
return; |
464
|
|
|
|
|
|
|
} |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=head2 length |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Title : length |
471
|
|
|
|
|
|
|
Usage : $len = $seqobj->length(); |
472
|
|
|
|
|
|
|
Function: Get the stored length of the sequence in number of symbols (bases |
473
|
|
|
|
|
|
|
or amino acids). In some circumstances, you can also set this attribute: |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
1. For empty sequences, you can set the length to anything you want: |
476
|
|
|
|
|
|
|
my $seqobj = Bio::PrimarySeq->new( -length => 123 ); |
477
|
|
|
|
|
|
|
my $len = $seqobj->length; # 123 |
478
|
|
|
|
|
|
|
2. To save memory when using very long sequences, you can set the |
479
|
|
|
|
|
|
|
length of the sequence to the length of the sequence (and nothing |
480
|
|
|
|
|
|
|
else): |
481
|
|
|
|
|
|
|
my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence |
482
|
|
|
|
|
|
|
# process $seqobj... then after you're done with it |
483
|
|
|
|
|
|
|
$seqobj->length($seqobj->length); |
484
|
|
|
|
|
|
|
$seqobj->seq(undef); # free memory! |
485
|
|
|
|
|
|
|
my $len = $seqobj->length; # 1 Mbp |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
Note that if you set seq() to a value other than undef at any time, |
488
|
|
|
|
|
|
|
the length attribute will be reset. |
489
|
|
|
|
|
|
|
Returns : integer representing the length of the sequence. |
490
|
|
|
|
|
|
|
Args : Optionally, the value on set |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=cut |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
sub length { |
495
|
30694
|
|
|
30694
|
1
|
28476
|
my ($self, $val) = @_; |
496
|
30694
|
100
|
|
|
|
35597
|
if (defined $val) { |
497
|
5
|
|
|
|
|
8
|
my $len = $self->{'length'}; |
498
|
5
|
100
|
100
|
|
|
19
|
if ($len && ($len != $val)) { |
499
|
1
|
|
|
|
|
5
|
$self->throw("Can not set the length to $val, current length value is $len"); |
500
|
|
|
|
|
|
|
} |
501
|
4
|
|
|
|
|
6
|
$self->{'length'} = $val; |
502
|
4
|
|
|
|
|
9
|
$self->{'_freeze_length'} = undef; |
503
|
|
|
|
|
|
|
} |
504
|
30693
|
|
|
|
|
42354
|
return $self->{'length'}; |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=head2 display_id |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
Title : display_id or display_name |
511
|
|
|
|
|
|
|
Usage : $id_string = $seqobj->display_id(); |
512
|
|
|
|
|
|
|
Function: Get or set the display id, aka the common name of the sequence object. |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
The semantics of this is that it is the most likely string to |
515
|
|
|
|
|
|
|
be used as an identifier of the sequence, and likely to have |
516
|
|
|
|
|
|
|
"human" readability. The id is equivalent to the ID field of |
517
|
|
|
|
|
|
|
the GenBank/EMBL databanks and the id field of the |
518
|
|
|
|
|
|
|
Swissprot/sptrembl database. In fasta format, the >(\S+) is |
519
|
|
|
|
|
|
|
presumed to be the id, though some people overload the id to |
520
|
|
|
|
|
|
|
embed other information. Bioperl does not use any embedded |
521
|
|
|
|
|
|
|
information in the ID field, and people are encouraged to use |
522
|
|
|
|
|
|
|
other mechanisms (accession field for example, or extending |
523
|
|
|
|
|
|
|
the sequence object) to solve this. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
With the new Bio::DescribableI interface, display_name aliases |
526
|
|
|
|
|
|
|
to this method. |
527
|
|
|
|
|
|
|
Returns : A string for the display ID |
528
|
|
|
|
|
|
|
Args : Optional string for the display ID to set |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=cut |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
sub display_id { |
533
|
23860
|
|
|
23860
|
1
|
25368
|
my ($self, $value) = @_; |
534
|
23860
|
100
|
|
|
|
30150
|
if( defined $value) { |
535
|
14090
|
|
|
|
|
15683
|
$self->{'display_id'} = $value; |
536
|
|
|
|
|
|
|
} |
537
|
23860
|
|
|
|
|
34354
|
return $self->{'display_id'}; |
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=head2 accession_number |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
Title : accession_number or object_id |
544
|
|
|
|
|
|
|
Usage : $unique_key = $seqobj->accession_number; |
545
|
|
|
|
|
|
|
Function: Returns the unique biological id for a sequence, commonly |
546
|
|
|
|
|
|
|
called the accession_number. For sequences from established |
547
|
|
|
|
|
|
|
databases, the implementors should try to use the correct |
548
|
|
|
|
|
|
|
accession number. Notice that primary_id() provides the |
549
|
|
|
|
|
|
|
unique id for the implementation, allowing multiple objects |
550
|
|
|
|
|
|
|
to have the same accession number in a particular implementation. |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
For sequences with no accession number, this method should |
553
|
|
|
|
|
|
|
return "unknown". |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
[Note this method name is likely to change in 1.3] |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
With the new Bio::IdentifiableI interface, this is aliased |
558
|
|
|
|
|
|
|
to object_id |
559
|
|
|
|
|
|
|
Returns : A string |
560
|
|
|
|
|
|
|
Args : A string (optional) for setting |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
=cut |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
sub accession_number { |
565
|
900
|
|
|
900
|
1
|
1667
|
my( $self, $acc ) = @_; |
566
|
900
|
100
|
|
|
|
1453
|
if (defined $acc) { |
567
|
654
|
|
|
|
|
947
|
$self->{'accession_number'} = $acc; |
568
|
|
|
|
|
|
|
} else { |
569
|
246
|
|
|
|
|
315
|
$acc = $self->{'accession_number'}; |
570
|
246
|
100
|
|
|
|
519
|
$acc = 'unknown' unless defined $acc; |
571
|
|
|
|
|
|
|
} |
572
|
900
|
|
|
|
|
1540
|
return $acc; |
573
|
|
|
|
|
|
|
} |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=head2 primary_id |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
Title : primary_id |
579
|
|
|
|
|
|
|
Usage : $unique_key = $seqobj->primary_id; |
580
|
|
|
|
|
|
|
Function: Returns the unique id for this object in this |
581
|
|
|
|
|
|
|
implementation. This allows implementations to manage their |
582
|
|
|
|
|
|
|
own object ids in a way the implementation can control; |
583
|
|
|
|
|
|
|
clients can expect one id to map to one object. |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
For sequences with no natural primary id, this method |
586
|
|
|
|
|
|
|
should return a stringified memory location. |
587
|
|
|
|
|
|
|
Returns : A string |
588
|
|
|
|
|
|
|
Args : A string (optional, for setting) |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
=cut |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
sub primary_id { |
593
|
505
|
|
|
505
|
1
|
743
|
my $self = shift; |
594
|
|
|
|
|
|
|
|
595
|
505
|
100
|
|
|
|
1187
|
if(@_) { |
596
|
447
|
|
|
|
|
665
|
$self->{'primary_id'} = shift; |
597
|
|
|
|
|
|
|
} |
598
|
505
|
100
|
|
|
|
880
|
if( ! defined($self->{'primary_id'}) ) { |
599
|
21
|
|
|
|
|
172
|
return "$self"; |
600
|
|
|
|
|
|
|
} |
601
|
484
|
|
|
|
|
547
|
return $self->{'primary_id'}; |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
=head2 alphabet |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
Title : alphabet |
608
|
|
|
|
|
|
|
Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something } |
609
|
|
|
|
|
|
|
Function: Get/set the alphabet of sequence, one of |
610
|
|
|
|
|
|
|
'dna', 'rna' or 'protein'. This is case sensitive. |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
This is not called <type> because this would cause |
613
|
|
|
|
|
|
|
upgrade problems from the 0.5 and earlier Seq objects. |
614
|
|
|
|
|
|
|
Returns : a string either 'dna','rna','protein'. NB - the object must |
615
|
|
|
|
|
|
|
make a call of the type - if there is no alphabet specified it |
616
|
|
|
|
|
|
|
has to guess. |
617
|
|
|
|
|
|
|
Args : optional string to set : 'dna' | 'rna' | 'protein' |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
=cut |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
sub alphabet {
    # Get or set the alphabet of the sequence.
    #
    # Args    : optional alphabet name; it is lowercased before validation
    # Returns : the stored alphabet
    # Throws  : via $self->throw when the lowercased name is not a key of
    #           %valid_type
    my $self  = shift;
    my $value = shift;

    # No (or an undefined) argument: behave as a plain getter.
    return $self->{'alphabet'} unless defined $value;

    $value = lc $value;
    if ( not $valid_type{$value} ) {
        my $allowed = join(',', map { "'$_'" } sort keys %valid_type);
        $self->throw("Alphabet '$value' is not a valid alphabet (" . $allowed . ") lowercase");
    }

    $self->{'alphabet'} = $value;
    return $self->{'alphabet'};
}
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
=head2 desc |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
Title : desc or description |
639
|
|
|
|
|
|
|
Usage : $seqobj->desc($newval); |
640
|
|
|
|
|
|
|
Function: Get/set description of the sequence. |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
'description' is an alias for this for compliance with the |
643
|
|
|
|
|
|
|
Bio::DescribableI interface. |
644
|
|
|
|
|
|
|
Returns : value of desc (a string) |
645
|
|
|
|
|
|
|
Args : newvalue (a string or undef, optional) |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=cut |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
sub desc {
    # Get or set the description of the sequence.
    #
    # Any argument, including undef, replaces the stored description; the
    # stored value is returned in both cases.
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{'desc'} = $new_value[0];
    }
    return $self->{'desc'};
}
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=head2 can_call_new |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
Title : can_call_new |
661
|
|
|
|
|
|
|
Usage : |
662
|
|
|
|
|
|
|
Function: |
663
|
|
|
|
|
|
|
Example : |
664
|
|
|
|
|
|
|
Returns : true |
665
|
|
|
|
|
|
|
Args : |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
=cut |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
sub can_call_new {
    # Unconditionally returns a true value; the invocant and any other
    # arguments are ignored.
    return 1;
}
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=head2 id |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
Title : id |
679
|
|
|
|
|
|
|
Usage : $id = $seqobj->id(); |
680
|
|
|
|
|
|
|
Function: This is mapped on display_id |
681
|
|
|
|
|
|
|
Example : |
682
|
|
|
|
|
|
|
Returns : |
683
|
|
|
|
|
|
|
Args : |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
=cut |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
sub id {
    # Alias for display_id(): every argument is forwarded untouched and the
    # result of display_id() is returned as-is.
    my $self = shift;
    return $self->display_id(@_);
}
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
=head2 is_circular |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
Title : is_circular |
695
|
|
|
|
|
|
|
Usage : if( $seqobj->is_circular) { # Do something } |
696
|
|
|
|
|
|
|
Function: Returns true if the molecule is circular |
697
|
|
|
|
|
|
|
Returns : Boolean value |
698
|
|
|
|
|
|
|
Args : Optional boolean value to set |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
=cut |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
sub is_circular {
    # Get or set the circularity flag of the molecule.
    #
    # Any argument, including undef, replaces the stored flag; the stored
    # value is returned in both cases.
    my ($self, @new_value) = @_;

    if (@new_value) {
        $self->{'is_circular'} = $new_value[0];
    }
    return $self->{'is_circular'};
}
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
=head1 Methods for Bio::IdentifiableI compliance |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
=head2 object_id |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
Title : object_id |
714
|
|
|
|
|
|
|
Usage : $string = $seqobj->object_id(); |
715
|
|
|
|
|
|
|
Function: Get or set a string which represents the stable primary identifier |
716
|
|
|
|
|
|
|
in this namespace of this object. For DNA sequences this |
717
|
|
|
|
|
|
|
is its accession_number, similarly for protein sequences. |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
This is aliased to accession_number(). |
720
|
|
|
|
|
|
|
Returns : A scalar |
721
|
|
|
|
|
|
|
Args : Optional object ID to set. |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
=cut |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
sub object_id {
    # Alias for accession_number(): every argument is forwarded untouched and
    # the result of accession_number() is returned as-is.
    my $self = shift;
    return $self->accession_number(@_);
}
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=head2 version |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
Title : version |
733
|
|
|
|
|
|
|
Usage : $version = $seqobj->version(); |
734
|
|
|
|
|
|
|
Function: Get or set a number which differentiates between versions of |
735
|
|
|
|
|
|
|
the same object. Higher numbers are considered to be |
736
|
|
|
|
|
|
|
later and more relevant, but a single object described |
737
|
|
|
|
|
|
|
by the same identifier should represent the same concept. |
738
|
|
|
|
|
|
|
Returns : A number |
739
|
|
|
|
|
|
|
Args : Optional version to set. |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
=cut |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
sub version {
    # Get or set the version of this object.
    #
    # Only a defined argument updates the stored version; passing undef (or
    # nothing) leaves it untouched and simply returns the current value.
    my $self        = shift;
    my $new_version = shift;

    $self->{'_version'} = $new_version if defined $new_version;
    return $self->{'_version'};
}
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
=head2 authority |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
Title : authority |
755
|
|
|
|
|
|
|
Usage : $authority = $seqobj->authority(); |
756
|
|
|
|
|
|
|
Function: Get or set a string which represents the organisation which |
757
|
|
|
|
|
|
|
granted the namespace, written as the DNS name of the |
758
|
|
|
|
|
|
|
organisation (eg, wormbase.org). |
759
|
|
|
|
|
|
|
Returns : A scalar |
760
|
|
|
|
|
|
|
Args : Optional authority to set. |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
=cut |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
sub authority {
    # Get or set the authority string.
    #
    # Only a defined argument updates the stored authority; passing undef (or
    # nothing) leaves it untouched and simply returns the current value.
    my $self          = shift;
    my $new_authority = shift;

    $self->{'authority'} = $new_authority if defined $new_authority;
    return $self->{'authority'};
}
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
=head2 namespace |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
Title : namespace |
776
|
|
|
|
|
|
|
Usage : $string = $seqobj->namespace(); |
777
|
|
|
|
|
|
|
Function: Get or set a string representing the name space this identifier |
778
|
|
|
|
|
|
|
is valid in, often the database name or the name describing the |
779
|
|
|
|
|
|
|
collection. |
780
|
|
|
|
|
|
|
Returns : A scalar |
781
|
|
|
|
|
|
|
Args : Optional namespace to set. |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
=cut |
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
sub namespace {
    # Get or set the namespace string.
    #
    # Args    : optional namespace; only a defined value is stored
    # Returns : the stored namespace, or the empty string when none is set
    my ($self, $value) = @_;

    if ( defined $value ) {
        $self->{'namespace'} = $value;
    }

    # Fall back to '' only when nothing is stored. A plain `||` would also
    # discard a defined-but-false namespace such as '0'.
    my $stored = $self->{'namespace'};
    return defined $stored ? $stored : "";
}
792
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
=head1 Methods for Bio::DescribableI compliance |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
This comprises display_name and description. |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=head2 display_name |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
Title : display_name |
801
|
|
|
|
|
|
|
Usage : $string = $seqobj->display_name(); |
802
|
|
|
|
|
|
|
Function: Get or set a string which is what should be displayed to the user. |
803
|
|
|
|
|
|
|
The string should have no spaces (ideally, though a cautious |
804
|
|
|
|
|
|
|
user of this interface would not assume this) and should be |
805
|
|
|
|
|
|
|
less than thirty characters (though again, double checking |
806
|
|
|
|
|
|
|
this is a good idea). |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
This is aliased to display_id(). |
809
|
|
|
|
|
|
|
Returns : A string for the display name |
810
|
|
|
|
|
|
|
Args : Optional string for the display name to set. |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
=cut |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
sub display_name {
    # Alias for display_id(): every argument is forwarded untouched and the
    # result of display_id() is returned as-is.
    my $self = shift;
    return $self->display_id(@_);
}
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head2 description |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Title : description |
822
|
|
|
|
|
|
|
Usage : $string = $seqobj->description(); |
823
|
|
|
|
|
|
|
Function: Get or set a text string suitable for displaying to the user a |
824
|
|
|
|
|
|
|
description. This string is likely to have spaces, but |
825
|
|
|
|
|
|
|
should not have any newlines or formatting - just plain |
826
|
|
|
|
|
|
|
text. The string should not be greater than 255 characters |
827
|
|
|
|
|
|
|
and clients can feel justified at truncating strings at 255 |
828
|
|
|
|
|
|
|
characters for the purposes of display. |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
This is aliased to desc(). |
831
|
|
|
|
|
|
|
Returns : A string for the description |
832
|
|
|
|
|
|
|
Args : Optional string for the description to set. |
833
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
=cut |
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
sub description {
    # Alias for desc(): every argument is forwarded untouched and the result
    # of desc() is returned as-is.
    my $self = shift;
    return $self->desc(@_);
}
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
=head1 Methods Inherited from Bio::PrimarySeqI |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
These methods are available on Bio::PrimarySeq, although they are |
844
|
|
|
|
|
|
|
actually implemented on Bio::PrimarySeqI |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=head2 revcom |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
Title : revcom |
849
|
|
|
|
|
|
|
Usage : $rev = $seqobj->revcom(); |
850
|
|
|
|
|
|
|
Function: Produces a new Bio::SeqI implementing object which |
851
|
|
|
|
|
|
|
is the reversed complement of the sequence. For protein |
852
|
|
|
|
|
|
|
sequences this throws an exception of |
853
|
|
|
|
|
|
|
"Sequence is a protein. Cannot revcom". |
854
|
|
|
|
|
|
|
|
855
|
|
|
|
|
|
|
The id is the same id as the original sequence, and the |
856
|
|
|
|
|
|
|
accession number is also identical. If someone wants to |
857
|
|
|
|
|
|
|
track that this sequence has been reversed, it needs to |
858
|
|
|
|
|
|
|
define its own extensions. |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
To do an inplace edit of an object you can go: |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
$seqobj = $seqobj->revcom(); |
863
|
|
|
|
|
|
|
|
864
|
|
|
|
|
|
|
This of course, causes Perl to handle the garbage |
865
|
|
|
|
|
|
|
collection of the old object, but it is roughly speaking as |
866
|
|
|
|
|
|
|
efficient as an inplace edit. |
867
|
|
|
|
|
|
|
Returns : A new (fresh) Bio::SeqI object |
868
|
|
|
|
|
|
|
Args : none |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
=head2 trunc |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
Title : trunc |
873
|
|
|
|
|
|
|
Usage : $subseq = $myseq->trunc(10,100); |
874
|
|
|
|
|
|
|
Function: Provides a truncation of a sequence, |
875
|
|
|
|
|
|
|
Returns : A fresh Bio::SeqI implementing object. |
876
|
|
|
|
|
|
|
Args : Numbers for the start and end positions |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
=head1 Internal methods |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
These are internal methods to PrimarySeq |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
=head2 _guess_alphabet |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
Title : _guess_alphabet |
885
|
|
|
|
|
|
|
Usage : |
886
|
|
|
|
|
|
|
Function: Automatically guess and set the type of sequence: dna, rna, protein |
887
|
|
|
|
|
|
|
or '' if the sequence was empty. This method first removes dots (.), |
888
|
|
|
|
|
|
|
dashes (-) and question marks (?) before guessing the alphabet |
889
|
|
|
|
|
|
|
using the IUPAC conventions for ambiguous residues. Since the DNA and |
890
|
|
|
|
|
|
|
RNA characters are also valid characters for proteins, there is |
891
|
|
|
|
|
|
|
no foolproof way of determining the right alphabet. This is our best |
892
|
|
|
|
|
|
|
guess only! |
893
|
|
|
|
|
|
|
Returns : string 'dna', 'rna', 'protein' or ''. |
894
|
|
|
|
|
|
|
Args : none |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
=cut |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
sub _guess_alphabet {
    # Guess the alphabet from the object's own sequence and store the result.
    #
    # The guess comes from _guess_alphabet_from_string(), which is given the
    # sequence and the object's '_nowarnonempty' flag. A false guess (e.g. '')
    # is returned to the caller but is not stored via alphabet().
    my $self = shift;

    my $guess = $self->_guess_alphabet_from_string(
        $self->seq,
        $self->{'_nowarnonempty'},
    );

    if ($guess) {
        $self->alphabet($guess);
    }
    return $guess;
}
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
sub _guess_alphabet_from_string {
    # Guess the alphabet of a raw sequence string.
    #
    # Args    : $str           - the sequence string; undef is treated as ''
    #           $nowarnonempty - true to suppress the warning issued when the
    #                            string contains no letters (default: false)
    # Returns : 'protein', 'rna' or 'dna', or '' when no letters remain after
    #           dots, dashes and question marks have been removed
    my ($self, $str, $nowarnonempty) = @_;

    $nowarnonempty = 0 if not defined $nowarnonempty;

    # An undefined sequence holds no letters. Normalising it here avoids
    # "uninitialized value" warnings from the substitution and the numeric
    # comparison below, without changing the result ('').
    $str = '' if not defined $str;

    # Remove chars that clearly don't denote nucleic or amino acids
    $str =~ s/[-.?]//g;

    # Check for sequences without valid letters
    my $total = CORE::length($str);
    if ( $total == 0 ) {
        if ( not $nowarnonempty ) {
            $self->warn("Got a sequence without letters. Could not guess alphabet");
        }
        return '';
    }

    # Start with a safe method to find proteins.
    # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
    return 'protein' if $str =~ m/[EFIJLOPQXZ]/i;

    # Alphabet is unsure, could still be DNA, RNA or protein. Make a best
    # guess from the composition: when more than 70% of the characters are
    # among A, T, U, G, C, N, W, S, K and M (either case), the sequence is
    # taken to be nucleic.
    my $nucleic_count = ( $str =~ tr/ATUGCNWSKMatugcnwskm// );
    return 'protein' unless $nucleic_count / $total > 0.7;

    # Any U marks the nucleic sequence as RNA, otherwise it is DNA.
    return ( $str =~ m/U/i ) ? 'rna' : 'dna';
}
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
############################################################################ |
957
|
|
|
|
|
|
|
# aliases due to name changes or to compensate for our lack of consistency # |
958
|
|
|
|
|
|
|
############################################################################ |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
sub accession {
    # Deprecated alias for accession_number(): emits a deprecation warning
    # naming the object's class, then forwards all remaining arguments.
    my $self  = shift;
    my $class = ref($self);

    $self->warn($class . "::accession is deprecated, " . "use accession_number() instead");

    return $self->accession_number(@_);
}
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
1; |