File Coverage

Bio/PrimarySeq.pm

Criterion	Covered	Total	%
statement	172	183	93.9
branch	121	130	93.0
condition	28	32	87.5
subroutine	24	25	96.0
pod	19	20	95.0
total	364	390	93.3

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# bioperl module for Bio::PrimarySeq
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Ewan Birney
7							#
8							# Copyright Ewan Birney
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::PrimarySeq - Bioperl lightweight sequence object
17
18							=head1 SYNOPSIS
19
20							# Bio::SeqIO for file reading, Bio::DB::GenBank for
21							# database reading
22
23							use Bio::Seq;
24							use Bio::SeqIO;
25							use Bio::DB::GenBank;
26
27							# make from memory
28
29							$seqobj = Bio::PrimarySeq->new (
30							-seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31							-id => 'GeneFragment-12',
32							-accession_number => 'X78121',
33							-alphabet => 'dna',
34							-is_circular => 1,
35							);
36							print "Sequence ", $seqobj->id(), " with accession ",
37							$seqobj->accession_number, "\n";
38
39							# read from file
40
41							$inputstream = Bio::SeqIO->new(
42							-file => "myseq.fa",
43							-format => 'Fasta',
44							);
45							$seqobj = $inputstream->next_seq();
46							print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
47
48							# to get out parts of the sequence.
49
50							print "Sequence ", $seqobj->id(), " with accession ",
51							$seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
52
53							$string = $seqobj->seq();
54							$string2 = $seqobj->subseq(1,40);
55
56							=head1 DESCRIPTION
57
58							PrimarySeq is a lightweight sequence object, storing the sequence, its
59							name, a computer-useful unique name, and other fundamental attributes.
60							It does not contain sequence features or other information. To have a
61							sequence with sequence features you should use the Seq object which uses
62							this object.
63
64							Although new users will use Bio::PrimarySeq a lot, in general you will
65							be using it from the Bio::Seq object. For more information on Bio::Seq
66							see L. For interest you might like to know that
67							Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68							to do with sequence to it (the has-a relationship lets us get out of a
69							otherwise nasty cyclical reference in Perl which would leak memory).
70
71							Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72							object is a pure Perl implementation of the interface. If that's
73							gibberish to you, don't worry. The take home message is that this
74							object is the bioperl default sequence object, but other people can
75							use their own objects as sequences if they so wish. If you are
76							interested in wrapping your own objects as compliant Bioperl sequence
77							objects, then you should read the Bio::PrimarySeqI documentation
78
79							The documentation of this object is a merge of the Bio::PrimarySeq and
80							Bio::PrimarySeqI documentation. This allows all the methods which you can
81							call on sequence objects here.
82
83							=head1 FEEDBACK
84
85							=head2 Mailing Lists
86
87							User feedback is an integral part of the evolution of this and other
88							Bioperl modules. Send your comments and suggestions preferably to one
89							of the Bioperl mailing lists. Your participation is much appreciated.
90
91							bioperl-l@bioperl.org - General discussion
92							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
93
94							=head2 Support
95
96							Please direct usage questions or support issues to the mailing list:
97
98							I
99
100							rather than to the module maintainer directly. Many experienced and
101							reponsive experts will be able look at the problem and quickly
102							address it. Please include a thorough description of the problem
103							with code and data examples if at all possible.
104
105							=head2 Reporting Bugs
106
107							Report bugs to the Bioperl bug tracking system to help us keep track
108							the bugs and their resolution. Bug reports can be submitted via the
109							web:
110
111							https://github.com/bioperl/bioperl-live/issues
112
113							=head1 AUTHOR - Ewan Birney
114
115							Email birney@ebi.ac.uk
116
117							=head1 APPENDIX
118
119							The rest of the documentation details each of the object
120							methods. Internal methods are usually preceded with a _
121
122							=cut
123
124
125
126							package Bio::PrimarySeq;
127
128	203			203		8216	use strict;
	203					316
	203					8952
129
130							our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131							our $GAP_SYMBOLS = '-~';
132
133	203					76876	use base qw(Bio::Root::Root Bio::PrimarySeqI
134	203			203		657	Bio::IdentifiableI Bio::DescribableI);
	203					282
135
136
137							# Setup the allowed values for alphabet()
138							my %valid_type = map {$_, 1} qw( dna rna protein );
139
140
141							=head2 new
142
143							Title : new
144							Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
145							-id => 'human_id',
146							-accession_number => 'AL000012',
147							);
148							Function: Returns a new primary seq object from
149							basic constructors, being a string for the sequence
150							and strings for id and accession_number.
151
152							Note that you can provide an empty sequence string. However, in
153							this case you MUST specify the type of sequence you wish to
154							initialize by the parameter -alphabet. See alphabet() for possible
155							values.
156							Returns : a new Bio::PrimarySeq object
157							Args : -seq => sequence string
158							-ref_to_seq => ... or reference to a sequence string
159							-display_id => display id of the sequence (locus name)
160							-accession_number => accession number
161							-primary_id => primary id (Genbank id)
162							-version => version number
163							-namespace => the namespace for the accession
164							-authority => the authority for the namespace
165							-description => description text
166							-desc => alias for description
167							-alphabet => skip alphabet guess and set it to dna, rna or protein
168							-id => alias for display id
169							-is_circular => boolean to indicate that sequence is circular
170							-direct => boolean to directly set sequences. The next time -seq,
171							seq() or -ref_to_seq is use, the sequence will not be
172							validated. Be careful with this...
173							-nowarnonempty => boolean to avoid warning when sequence is empty
174
175							=cut
176
177							sub new {
178	14995			14995	1	38180	my ($class, @args) = @_;
179	14995					28824	my $self = $class->SUPER::new(@args);
180	14995					55120	my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181							$alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
182							$nowarnonempty) =
183							$self->_rearrange([qw(SEQ
184							DISPLAY_ID
185							ACCESSION_NUMBER
186							PRIMARY_ID
187							NAMESPACE
188							AUTHORITY
189							VERSION
190							OBJECT_ID
191							DESC
192							DESCRIPTION
193							ALPHABET
194							ID
195							IS_CIRCULAR
196							DIRECT
197							REF_TO_SEQ
198							LENGTH
199							NOWARNONEMPTY
200							)],
201							@args);
202
203							# Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204	14995					44412	$self->{'_nowarnonempty'} = $nowarnonempty;
205	14995					13599	$self->{'_direct'} = $direct;
206
207	14995	100	100			30508	if( defined $id && defined $given_id ) {
208	6	50				12	if( $id ne $given_id ) {
209	0					0	$self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
210							}
211							}
212	14995	100				19768	if( defined $given_id ) { $id = $given_id; }
	11740					9756
213
214							# Bernd's idea: set ids now for more informative invalid sequence messages
215	14995	100				30132	defined $id && $self->display_id($id);
216	14995	100				20689	$acc && $self->accession_number($acc);
217	14995	100				19517	defined $pid && $self->primary_id($pid);
218
219							# Set alphabet now to avoid guessing it later, when sequence is set
220	14995	100				26207	$alphabet && $self->alphabet($alphabet);
221
222							# Set the length before the seq. If there is a seq, length will be updated later
223	14995		100			38849	$self->{'length'} = $len \|\| 0;
224
225							# Set the sequence (but also alphabet and length)
226	14995	100				18557	if ($ref_to_seq) {
227	1					2	$self->_set_seq_by_ref($ref_to_seq, $alphabet);
228							} else {
229	14994	100				20259	if (defined $seq) {
230							# Note: the sequence string may be empty
231	14268					19504	$self->seq($seq);
232							}
233							}
234
235	14993	100				20389	$desc && $self->desc($desc);
236	14993	100				17842	$description && $self->description($description);
237	14993	100				17912	$ns && $self->namespace($ns);
238	14993	100				18510	$auth && $self->authority($auth);
239							# Any variable that can have a value "0" must be tested with defined
240							# or it will fail to be added to the new object
241	14993	100				18812	defined($v) && $self->version($v);
242	14993	50				19852	defined($oid) && $self->object_id($oid);
243	14993	100				17984	defined($is_circular) && $self->is_circular($is_circular);
244
245	14993					39727	return $self;
246							}
247
248
249							=head2 seq
250
251							Title : seq
252							Usage : $string = $seqobj->seq();
253							Function: Get or set the sequence as a string of letters. The case of
254							the letters is left up to the implementer. Suggested cases are
255							upper case for proteins and lower case for DNA sequence (IUPAC
256							standard), but you should not rely on this. An error is thrown if
257							the sequence contains invalid characters: see validate_seq().
258							Returns : A scalar
259							Args : - Optional new sequence value (a string) to set
260							- Optional alphabet (it is guessed by default)
261
262							=cut
263
264							sub seq {
265	168456			168456	1	175702	my ($self, @args) = @_;
266
267	168456	100				208552	if( scalar @args == 0 ) {
268	141004					300461	return $self->{'seq'};
269							}
270
271	27452					25481	my ($seq_str, $alphabet) = @args;
272	27452	50				35374	if (@args) {
273	27452					35435	$self->_set_seq_by_ref(\$seq_str, $alphabet);
274							}
275
276	27449					37834	return $self->{'seq'};
277							}
278
279
280							sub _set_seq_by_ref {
281							# Set a sequence by reference. A reference is used to avoid the cost of
282							# copying the sequence (which can be very large) between functions.
283	27453			27453		22933	my ($self, $seq_str_ref, $alphabet) = @_;
284
285							# Validate sequence if sequence is not empty and we are not in direct mode
286	27453	100	100			83254	if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
287	27281					38720	$self->validate_seq($$seq_str_ref, 1);
288							}
289	27450					28453	delete $self->{'_direct'}; # next sequence will have to be validated
290
291							# Record sequence length
292	27450		100			39888	my $len = CORE::length($$seq_str_ref \|\| '');
293	27450		100			53983	my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
294							# Note: if the new seq is empty or undef, this is not considered a change
295	27450	100				37237	delete $self->{'_freeze_length'} if $is_changed_seq;
296	27450	100				39525	$self->{'length'} = $len if not exists $self->{'_freeze_length'};
297
298							# Set sequence
299	27450					27382	$self->{'seq'} = $$seq_str_ref;
300
301							# Set or guess alphabet
302	27450	100	100			66844	if ($alphabet) {
		100
303							# Alphabet specified, set it no matter what
304	18					24	$self->alphabet($alphabet);
305							} elsif ($is_changed_seq \|\| (! defined($self->alphabet()))) {
306							# If we changed a previous sequence to a new one or if there is no
307							# alphabet yet at all, we need to guess the (possibly new) alphabet
308	15780					21838	$self->_guess_alphabet();
309							} # else (seq not changed and alphabet was defined) do nothing
310
311	27450					28287	return 1;
312							}
313
314
315							=head2 validate_seq
316
317							Title : validate_seq
318							Usage : if(! $seqobj->validate_seq($seq_str) ) {
319							print "sequence $seq_str is not valid for an object of
320							alphabet ",$seqobj->alphabet, "\n";
321							}
322							Function: Test that the given sequence is valid, i.e. contains only valid
323							characters. The allowed characters are all letters (A-Z) and '-','.',
324							'*','?','=' and '~'. Spaces are not valid. Note that this
325							implementation does not take alphabet() into account and that empty
326							sequences are considered valid.
327							Returns : 1 if the supplied sequence string is valid, 0 otherwise.
328							Args : - Sequence string to be validated
329							- Boolean to optionally throw an error if the sequence is invalid
330
331							=cut
332
333							sub validate_seq {
334	23098			23098	1	20666	my ($self, $seqstr, $throw) = @_;
335	23098	100	100			134851	if ( (defined $seqstr ) &&
336							($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
337	8	100				14	if ($throw) {
338	3		50			7	$self->throw("Failed validation of sequence '".(defined($self->id) \|\|
339							'[unidentified sequence]')."'. Invalid characters were: " .
340							join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
341							}
342	5					19	return 0;
343							}
344	23090					26157	return 1;
345							}
346
347
348							=head2 subseq
349
350							Title : subseq
351							Usage : $substring = $seqobj->subseq(10,40);
352							$substring = $seqobj->subseq(10,40,'nogap');
353							$substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
354							$substring = $seqobj->subseq($location_obj);
355							$substring = $seqobj->subseq($location_obj, -nogap => 1);
356							Function: Return the subseq from start to end, where the first sequence
357							character has coordinate 1 number is inclusive, ie 1-2 are the
358							first two characters of the sequence. The given start coordinate
359							has to be larger than the end, even if the sequence is circular.
360							Returns : a string
361							Args : integer for start position
362							integer for end position
363							OR
364							Bio::LocationI location for subseq (strand honored)
365							Specify -NOGAP=>1 to return subseq with gap characters removed
366							Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
367							with $new_subseq in the sequence object
368
369							=cut
370
371							sub subseq {
372	10039			10039	1	6913	my $self = shift;
373	10039					9450	my @args = @_;
374	10039					19216	my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
375							END
376							NOGAP
377							REPLACE_WITH)], @args);
378
379							# If -replace_with is specified, validate the replacement sequence
380	10039	100				14740	if (defined $replace) {
381	2	100				5	$self->validate_seq( $replace ) \|\|
382							$self->throw("Replacement sequence does not look valid");
383							}
384
385	10038	100	66			32809	if( ref($start) && $start->isa('Bio::LocationI') ) {
		50	33
386	52					41	my $loc = $start;
387	52					42	my $seq = '';
388
389							# For Split objects if Guide Strand is negative,
390							# pass the sublocations in reverse
391	52					44	my $order = 0;
392	52	100				119	if ($loc->isa('Bio::Location::SplitLocationI')) {
393							# guide_strand can return undef, so don't compare directly
394							# to avoid 'uninitialized value' warning
395	44	100				70	my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0;
396	44	100				52	$order = ($guide_strand == -1) ? -1 : 0;
397							}
398							# Reversing order using ->each_Location(-1) does not work well for
399							# cut by origin-splits (like "complement(join(1900..END,START..50))"),
400							# so use "reverse" instead
401	52	100				108	my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location;
402	52					57	foreach my $subloc (@sublocs) {
403	120					244	my $piece = $self->subseq(-start => $subloc->start(),
404							-end => $subloc->end(),
405							-replace_with => $replace,
406							-nogap => $nogap);
407	120	100				177	$piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
408
409							# strand can return undef, so don't compare directly
410							# to avoid 'uninitialized value' warning
411	120	100				176	my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0;
412	120	100				178	if ($strand < 0) {
413	59					77	$piece = $self->_revcom_from_string($piece, $self->alphabet);
414							}
415	120					158	$seq .= $piece;
416							}
417	52					196	return $seq;
418							} elsif( defined $start && defined $end ) {
419	9986	50				11682	if( $start > $end ){
420	0					0	$self->throw("Bad start,end parameters. Start [$start] has to be ".
421							"less than end [$end]");
422							}
423	9986	50				11248	if( $start <= 0 ) {
424	0					0	$self->throw("Bad start parameter ($start). Start must be positive.");
425							}
426
427							# Remove one from start, and then length is end-start
428	9986					6381	$start--;
429
430	9986					5771	my $seqstr;
431	9986	100				9392	if (defined $replace) {
432	1					4	$seqstr = substr $self->{seq}, $start, $end-$start, $replace;
433							} else {
434	9985					12949	$seqstr = substr $self->{seq}, $start, $end-$start;
435							}
436
437
438	9986	100				11061	if ($end > $self->length) {
439	1	50				3	if ($self->is_circular) {
440	1					2	my $start = 0;
441	1					2	my $end = $end - $self->length;
442
443	1					1	my $appendstr;
444	1	50				2	if (defined $replace) {
445	0					0	$appendstr = substr $self->{seq}, $start, $end-$start, $replace;
446							} else {
447	1					2	$appendstr = substr $self->{seq}, $start, $end-$start;
448							}
449
450	1					1	$seqstr .= $appendstr;
451							} else {
452	0					0	$self->throw("Bad end parameter ($end). End must be less than ".
453							"the total length of sequence (total=".$self->length.")")
454							}
455							}
456
457	9986	100				11462	$seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
458	9986					16173	return $seqstr;
459
460							} else {
461	0					0	$self->warn("Incorrect parameters to subseq - must be two integers or ".
462							"a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
463	0					0	return;
464							}
465							}
466
467
468							=head2 length
469
470							Title : length
471							Usage : $len = $seqobj->length();
472							Function: Get the stored length of the sequence in number of symbols (bases
473							or amino acids). In some circumstances, you can also set this attribute:
474
475							1. For empty sequences, you can set the length to anything you want:
476							my $seqobj = Bio::PrimarySeq->new( -length => 123 );
477							my $len = $seqobj->len; # 123
478							2. To save memory when using very long sequences, you can set the
479							length of the sequence to the length of the sequence (and nothing
480							else):
481							my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
482							# process $seqobj... then after you're done with it
483							$seqobj->length($seqobj->length);
484							$seqobj->seq(undef); # free memory!
485							my $len = $seqobj->len; # 1 Mbp
486
487							Note that if you set seq() to a value other than undef at any time,
488							the length attribute will be reset.
489							Returns : integer representing the length of the sequence.
490							Args : Optionally, the value on set
491
492							=cut
493
494							sub length {
495	30694			30694	1	28476	my ($self, $val) = @_;
496	30694	100				35597	if (defined $val) {
497	5					8	my $len = $self->{'length'};
498	5	100	100			19	if ($len && ($len != $val)) {
499	1					5	$self->throw("Can not set the length to $val, current length value is $len");
500							}
501	4					6	$self->{'length'} = $val;
502	4					9	$self->{'_freeze_length'} = undef;
503							}
504	30693					42354	return $self->{'length'};
505							}
506
507
508							=head2 display_id
509
510							Title : display_id or display_name
511							Usage : $id_string = $seqobj->display_id();
512							Function: Get or set the display id, aka the common name of the sequence object.
513
514							The semantics of this is that it is the most likely string to
515							be used as an identifier of the sequence, and likely to have
516							"human" readability. The id is equivalent to the ID field of
517							the GenBank/EMBL databanks and the id field of the
518							Swissprot/sptrembl database. In fasta format, the >(\S+) is
519							presumed to be the id, though some people overload the id to
520							embed other information. Bioperl does not use any embedded
521							information in the ID field, and people are encouraged to use
522							other mechanisms (accession field for example, or extending
523							the sequence object) to solve this.
524
525							With the new Bio::DescribeableI interface, display_name aliases
526							to this method.
527							Returns : A string for the display ID
528							Args : Optional string for the display ID to set
529
530							=cut
531
532							sub display_id {
533	23860			23860	1	25368	my ($self, $value) = @_;
534	23860	100				30150	if( defined $value) {
535	14090					15683	$self->{'display_id'} = $value;
536							}
537	23860					34354	return $self->{'display_id'};
538							}
539
540
541							=head2 accession_number
542
543							Title : accession_number or object_id
544							Usage : $unique_key = $seqobj->accession_number;
545							Function: Returns the unique biological id for a sequence, commonly
546							called the accession_number. For sequences from established
547							databases, the implementors should try to use the correct
548							accession number. Notice that primary_id() provides the
549							unique id for the implemetation, allowing multiple objects
550							to have the same accession number in a particular implementation.
551
552							For sequences with no accession number, this method should
553							return "unknown".
554
555							[Note this method name is likely to change in 1.3]
556
557							With the new Bio::IdentifiableI interface, this is aliased
558							to object_id
559							Returns : A string
560							Args : A string (optional) for setting
561
562							=cut
563
564							sub accession_number {
565	900			900	1	1667	my( $self, $acc ) = @_;
566	900	100				1453	if (defined $acc) {
567	654					947	$self->{'accession_number'} = $acc;
568							} else {
569	246					315	$acc = $self->{'accession_number'};
570	246	100				519	$acc = 'unknown' unless defined $acc;
571							}
572	900					1540	return $acc;
573							}
574
575
576							=head2 primary_id
577
578							Title : primary_id
579							Usage : $unique_key = $seqobj->primary_id;
580							Function: Returns the unique id for this object in this
581							implementation. This allows implementations to manage their
582							own object ids in a way the implementaiton can control
583							clients can expect one id to map to one object.
584
585							For sequences with no natural primary id, this method
586							should return a stringified memory location.
587							Returns : A string
588							Args : A string (optional, for setting)
589
590							=cut
591
592							sub primary_id {
593	505			505	1	743	my $self = shift;
594
595	505	100				1187	if(@_) {
596	447					665	$self->{'primary_id'} = shift;
597							}
598	505	100				880	if( ! defined($self->{'primary_id'}) ) {
599	21					172	return "$self";
600							}
601	484					547	return $self->{'primary_id'};
602							}
603
604
605							=head2 alphabet
606
607							Title : alphabet
608							Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
609							Function: Get/set the alphabet of sequence, one of
610							'dna', 'rna' or 'protein'. This is case sensitive.
611
612							This is not called because this would cause
613							upgrade problems from the 0.5 and earlier Seq objects.
614							Returns : a string either 'dna','rna','protein'. NB - the object must
615							make a call of the type - if there is no alphabet specified it
616							has to guess.
617							Args : optional string to set : 'dna' \| 'rna' \| 'protein'
618
619
620							=cut
621
622							sub alphabet {
623	54892			54892	1	45273	my ($self,$value) = @_;
624	54892	100				67370	if (defined $value) {
625	27920					25250	$value = lc $value;
626	27920	50				41710	unless ( $valid_type{$value} ) {
627	0					0	$self->throw("Alphabet '$value' is not a valid alphabet (".
628							join(',', map "'$_'", sort keys %valid_type) .") lowercase");
629							}
630	27920					33263	$self->{'alphabet'} = $value;
631							}
632	54892					84771	return $self->{'alphabet'};
633							}
634
635
636							=head2 desc
637
638							Title : desc or description
639							Usage : $seqobj->desc($newval);
640							Function: Get/set description of the sequence.
641
642							'description' is an alias for this for compliance with the
643							Bio::DescribeableI interface.
644							Returns : value of desc (a string)
645							Args : newvalue (a string or undef, optional)
646
647
648							=cut
649
650							sub desc{
651	1711			1711	1	5959	my $self = shift;
652
653	1711	100				4125	return $self->{'desc'} = shift if @_;
654	454					1408	return $self->{'desc'};
655							}
656
657
658							=head2 can_call_new
659
660							Title : can_call_new
661							Usage :
662							Function:
663							Example :
664							Returns : true
665							Args :
666
667							=cut
668
669							sub can_call_new {
670	10			10	1	9	my ($self) = @_;
671
672	10					173	return 1;
673							}
674
675
676							=head2 id
677
678							Title : id
679							Usage : $id = $seqobj->id();
680							Function: This is mapped on display_id
681							Example :
682							Returns :
683							Args :
684
685							=cut
686
687							sub id {
688	9031			9031	1	17124	return shift->display_id(@_);
689							}
690
691
692							=head2 is_circular
693
694							Title : is_circular
695							Usage : if( $seqobj->is_circular) { # Do something }
696							Function: Returns true if the molecule is circular
697							Returns : Boolean value
698							Args : none
699
700							=cut
701
702							sub is_circular{
703	143388			143388	1	113639	my $self = shift;
704	143388	100				201734	return $self->{'is_circular'} = shift if @_;
705	143362					304910	return $self->{'is_circular'};
706							}
707
708
709							=head1 Methods for Bio::IdentifiableI compliance
710
711							=head2 object_id
712
713							Title : object_id
714							Usage : $string = $seqobj->object_id();
715							Function: Get or set a string which represents the stable primary identifier
716							in this namespace of this object. For DNA sequences this
717							is its accession_number, similarly for protein sequences.
718
719							This is aliased to accession_number().
720							Returns : A scalar
721							Args : Optional object ID to set.
722
723							=cut
724
725							sub object_id {
726	4			4	1	7	return shift->accession_number(@_);
727							}
728
729
730							=head2 version
731
732							Title : version
733							Usage : $version = $seqobj->version();
734							Function: Get or set a number which differentiates between versions of
735							the same object. Higher numbers are considered to be
736							later and more relevant, but a single object described
737							the same identifier should represent the same concept.
738							Returns : A number
739							Args : Optional version to set.
740
741							=cut
742
743							sub version{
744	3582			3582	1	3079	my ($self,$value) = @_;
745	3582	100				4761	if( defined $value) {
746	292					669	$self->{'_version'} = $value;
747							}
748	3582					5667	return $self->{'_version'};
749							}
750
751
752							=head2 authority
753
754							Title : authority
755							Usage : $authority = $seqobj->authority();
756							Function: Get or set a string which represents the organisation which
757							granted the namespace, written as the DNS name of the
758							organisation (eg, wormbase.org).
759							Returns : A scalar
760							Args : Optional authority to set.
761
762							=cut
763
764							sub authority {
765	91			91	1	74	my ($self, $value) = @_;
766	91	100				118	if( defined $value) {
767	86					88	$self->{'authority'} = $value;
768							}
769	91					87	return $self->{'authority'};
770							}
771
772
773							=head2 namespace
774
775							Title : namespace
776							Usage : $string = $seqobj->namespace();
777							Function: Get or set a string representing the name space this identifier
778							is valid in, often the database name or the name describing the
779							collection.
780							Returns : A scalar
781							Args : Optional namespace to set.
782
783							=cut
784
785							sub namespace{
786	527			527	1	474	my ($self,$value) = @_;
787	527	100				752	if( defined $value) {
788	493					745	$self->{'namespace'} = $value;
789							}
790	527		100			957	return $self->{'namespace'} \|\| "";
791							}
792
793
794							=head1 Methods for Bio::DescribableI compliance
795
796							This comprises of display_name and description.
797
798							=head2 display_name
799
800							Title : display_name
801							Usage : $string = $seqobj->display_name();
802							Function: Get or set a string which is what should be displayed to the user.
803							The string should have no spaces (ideally, though a cautious
804							user of this interface would not assumme this) and should be
805							less than thirty characters (though again, double checking
806							this is a good idea).
807
808							This is aliased to display_id().
809							Returns : A string for the display name
810							Args : Optional string for the display name to set.
811
812							=cut
813
814							sub display_name {
815	2			2	1	5	return shift->display_id(@_);
816							}
817
818
819							=head2 description
820
821							Title : description
822							Usage : $string = $seqobj->description();
823							Function: Get or set a text string suitable for displaying to the user a
824							description. This string is likely to have spaces, but
825							should not have any newlines or formatting - just plain
826							text. The string should not be greater than 255 characters
827							and clients can feel justified at truncating strings at 255
828							characters for the purposes of display.
829
830							This is aliased to desc().
831							Returns : A string for the description
832							Args : Optional string for the description to set.
833
834							=cut
835
836							sub description {
837	91			91	1	169	return shift->desc(@_);
838							}
839
840
841							=head1 Methods Inherited from Bio::PrimarySeqI
842
843							These methods are available on Bio::PrimarySeq, although they are
844							actually implemented on Bio::PrimarySeqI
845
846							=head2 revcom
847
848							Title : revcom
849							Usage : $rev = $seqobj->revcom();
850							Function: Produces a new Bio::SeqI implementing object which
851							is the reversed complement of the sequence. For protein
852							sequences this throws an exception of
853							"Sequence is a protein. Cannot revcom".
854
855							The id is the same id as the orginal sequence, and the
856							accession number is also indentical. If someone wants to
857							track that this sequence has be reversed, it needs to
858							define its own extensions.
859
860							To do an inplace edit of an object you can go:
861
862							$seqobj = $seqobj->revcom();
863
864							This of course, causes Perl to handle the garbage
865							collection of the old object, but it is roughly speaking as
866							efficient as an inplace edit.
867							Returns : A new (fresh) Bio::SeqI object
868							Args : none
869
870							=head2 trunc
871
872							Title : trunc
873							Usage : $subseq = $myseq->trunc(10,100);
874							Function: Provides a truncation of a sequence,
875							Returns : A fresh Bio::SeqI implementing object.
876							Args : Numbers for the start and end positions
877
878							=head1 Internal methods
879
880							These are internal methods to PrimarySeq
881
882							=head2 _guess_alphabet
883
884							Title : _guess_alphabet
885							Usage :
886							Function: Automatically guess and set the type of sequence: dna, rna, protein
887							or '' if the sequence was empty. This method first removes dots (.),
888							dashes (-) and question marks (?) before guessing the alphabet
889							using the IUPAC conventions for ambiguous residues. Since the DNA and
890							RNA characters are also valid characters for proteins, there is
891							no foolproof way of determining the right alphabet. This is our best
892							guess only!
893							Returns : string 'dna', 'rna', 'protein' or ''.
894							Args : none
895
896							=cut
897
898							sub _guess_alphabet {
899	15855			15855		13908	my ($self) = @_;
900							# Guess alphabet
901	15855					25959	my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
902							# Set alphabet unless it is unknown
903	15855	100				31327	$self->alphabet($alphabet) if $alphabet;
904	15855					13513	return $alphabet;
905							}
906
907
908							sub _guess_alphabet_from_string {
909							# Get the alphabet from a sequence string
910	18852			18852		16758	my ($self, $str, $nowarnonempty) = @_;
911
912	18852	100				27465	$nowarnonempty = 0 if not defined $nowarnonempty;
913
914							# Remove chars that clearly don't denote nucleic or amino acids
915	18852					51527	$str =~ s/[-.?]//gi;
916
917							# Check for sequences without valid letters
918	18852					13809	my $alphabet;
919	18852					15175	my $total = CORE::length($str);
920	18852	100				25455	if( $total == 0 ) {
921	2	100				4	if (not $nowarnonempty) {
922	1					6	$self->warn("Got a sequence without letters. Could not guess alphabet");
923							}
924	2					5	$alphabet = '';
925							}
926
927							# Determine alphabet now
928	18852	100				24499	if (not defined $alphabet) {
929	18850	100				30919	if ($str =~ m/[EFIJLOPQXZ]/i) {
930							# Start with a safe method to find proteins.
931							# Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
932	1762					1887	$alphabet = 'protein';
933							} else {
934							# Alphabet is unsure, could still be DNA, RNA or protein
935							# DNA and RNA contain mostly A, T, U, G, C and N, but the other
936							# letters they use are also among the 15 valid letters that a
937							# protein sequence can contain at this stage. Make our best guess
938							# based on sequence composition. If it contains over 70% of ACGTUN,
939							# it is likely nucleic.
940	17088	100				35829	if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
941	16081	100				26603	if ( $str =~ m/U/i ) {
942	53					152	$alphabet = 'rna';
943							} else {
944	16028					15855	$alphabet = 'dna';
945							}
946							} else {
947	1007					1037	$alphabet = 'protein';
948							}
949							}
950							}
951
952	18852					25278	return $alphabet;
953							}
954
955
956							############################################################################
957							# aliases due to name changes or to compensate for our lack of consistency #
958							############################################################################
959
960							sub accession {
961	0			0	0		my $self = shift;
962
963	0						$self->warn(ref($self)."::accession is deprecated, ".
964							"use accession_number() instead");
965	0						return $self->accession_number(@_);
966							}
967
968							1;