File Coverage

Bio/PrimarySeq.pm

Criterion	Covered	Total	%
statement	172	183	93.9
branch	121	130	93.0
condition	28	32	87.5
subroutine	24	25	96.0
pod	19	20	95.0
total	364	390	93.3

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# bioperl module for Bio::PrimarySeq
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Ewan Birney
7							#
8							# Copyright Ewan Birney
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::PrimarySeq - Bioperl lightweight sequence object
17
18							=head1 SYNOPSIS
19
20							# Bio::SeqIO for file reading, Bio::DB::GenBank for
21							# database reading
22
23							use Bio::Seq;
24							use Bio::SeqIO;
25							use Bio::DB::GenBank;
26
27							# make from memory
28
29							$seqobj = Bio::PrimarySeq->new (
30							-seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
31							-id => 'GeneFragment-12',
32							-accession_number => 'X78121',
33							-alphabet => 'dna',
34							-is_circular => 1,
35							);
36							print "Sequence ", $seqobj->id(), " with accession ",
37							$seqobj->accession_number, "\n";
38
39							# read from file
40
41							$inputstream = Bio::SeqIO->new(
42							-file => "myseq.fa",
43							-format => 'Fasta',
44							);
45							$seqobj = $inputstream->next_seq();
46							print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
47
48							# to get out parts of the sequence.
49
50							print "Sequence ", $seqobj->id(), " with accession ",
51							$seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
52
53							$string = $seqobj->seq();
54							$string2 = $seqobj->subseq(1,40);
55
56							=head1 DESCRIPTION
57
58							PrimarySeq is a lightweight sequence object, storing the sequence, its
59							name, a computer-useful unique name, and other fundamental attributes.
60							It does not contain sequence features or other information. To have a
61							sequence with sequence features you should use the Seq object which uses
62							this object.
63
64							Although new users will use Bio::PrimarySeq a lot, in general you will
65							be using it from the Bio::Seq object. For more information on Bio::Seq
66							see L. For interest you might like to know that
67							Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
68							to do with sequence to it (the has-a relationship lets us get out of a
69							otherwise nasty cyclical reference in Perl which would leak memory).
70
71							Sequence objects are defined by the Bio::PrimarySeqI interface, and this
72							object is a pure Perl implementation of the interface. If that's
73							gibberish to you, don't worry. The take home message is that this
74							object is the bioperl default sequence object, but other people can
75							use their own objects as sequences if they so wish. If you are
76							interested in wrapping your own objects as compliant Bioperl sequence
77							objects, then you should read the Bio::PrimarySeqI documentation
78
79							The documentation of this object is a merge of the Bio::PrimarySeq and
80							Bio::PrimarySeqI documentation. This allows all the methods which you can
81							call on sequence objects here.
82
83							=head1 FEEDBACK
84
85							=head2 Mailing Lists
86
87							User feedback is an integral part of the evolution of this and other
88							Bioperl modules. Send your comments and suggestions preferably to one
89							of the Bioperl mailing lists. Your participation is much appreciated.
90
91							bioperl-l@bioperl.org - General discussion
92							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
93
94							=head2 Support
95
96							Please direct usage questions or support issues to the mailing list:
97
98							I
99
100							rather than to the module maintainer directly. Many experienced and
101							reponsive experts will be able look at the problem and quickly
102							address it. Please include a thorough description of the problem
103							with code and data examples if at all possible.
104
105							=head2 Reporting Bugs
106
107							Report bugs to the Bioperl bug tracking system to help us keep track
108							the bugs and their resolution. Bug reports can be submitted via the
109							web:
110
111							https://github.com/bioperl/bioperl-live/issues
112
113							=head1 AUTHOR - Ewan Birney
114
115							Email birney@ebi.ac.uk
116
117							=head1 APPENDIX
118
119							The rest of the documentation details each of the object
120							methods. Internal methods are usually preceded with a _
121
122							=cut
123
124
125
126							package Bio::PrimarySeq;
127
128	203			203		9205	use strict;
	203					407
	203					9701
129
130							our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
131							our $GAP_SYMBOLS = '-~';
132
133	203					69635	use base qw(Bio::Root::Root Bio::PrimarySeqI
134	203			203		984	Bio::IdentifiableI Bio::DescribableI);
	203					339
135
136
137							# Setup the allowed values for alphabet()
138							my %valid_type = map {$_, 1} qw( dna rna protein );
139
140
141							=head2 new
142
143							Title : new
144							Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
145							-id => 'human_id',
146							-accession_number => 'AL000012',
147							);
148							Function: Returns a new primary seq object from
149							basic constructors, being a string for the sequence
150							and strings for id and accession_number.
151
152							Note that you can provide an empty sequence string. However, in
153							this case you MUST specify the type of sequence you wish to
154							initialize by the parameter -alphabet. See alphabet() for possible
155							values.
156							Returns : a new Bio::PrimarySeq object
157							Args : -seq => sequence string
158							-ref_to_seq => ... or reference to a sequence string
159							-display_id => display id of the sequence (locus name)
160							-accession_number => accession number
161							-primary_id => primary id (Genbank id)
162							-version => version number
163							-namespace => the namespace for the accession
164							-authority => the authority for the namespace
165							-description => description text
166							-desc => alias for description
167							-alphabet => skip alphabet guess and set it to dna, rna or protein
168							-id => alias for display id
169							-is_circular => boolean to indicate that sequence is circular
170							-direct => boolean to directly set sequences. The next time -seq,
171							seq() or -ref_to_seq is use, the sequence will not be
172							validated. Be careful with this...
173							-nowarnonempty => boolean to avoid warning when sequence is empty
174
175							=cut
176
177							sub new {
178	14902			14902	1	52355	my ($class, @args) = @_;
179	14902					35553	my $self = $class->SUPER::new(@args);
180	14902					63847	my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
181							$alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
182							$nowarnonempty) =
183							$self->_rearrange([qw(SEQ
184							DISPLAY_ID
185							ACCESSION_NUMBER
186							PRIMARY_ID
187							NAMESPACE
188							AUTHORITY
189							VERSION
190							OBJECT_ID
191							DESC
192							DESCRIPTION
193							ALPHABET
194							ID
195							IS_CIRCULAR
196							DIRECT
197							REF_TO_SEQ
198							LENGTH
199							NOWARNONEMPTY
200							)],
201							@args);
202
203							# Private var _nowarnonempty, needs to be set before calling _guess_alphabet
204	14902					40571	$self->{'_nowarnonempty'} = $nowarnonempty;
205	14902					21600	$self->{'_direct'} = $direct;
206
207	14902	100	100			31238	if( defined $id && defined $given_id ) {
208	6	50				24	if( $id ne $given_id ) {
209	0					0	$self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
210							}
211							}
212	14902	100				24895	if( defined $given_id ) { $id = $given_id; }
	11743					16264
213
214							# Bernd's idea: set ids now for more informative invalid sequence messages
215	14902	100				38606	defined $id && $self->display_id($id);
216	14902	100				23366	$acc && $self->accession_number($acc);
217	14902	100				21643	defined $pid && $self->primary_id($pid);
218
219							# Set alphabet now to avoid guessing it later, when sequence is set
220	14902	100				33192	$alphabet && $self->alphabet($alphabet);
221
222							# Set the length before the seq. If there is a seq, length will be updated later
223	14902		100			38898	$self->{'length'} = $len \|\| 0;
224
225							# Set the sequence (but also alphabet and length)
226	14902	100				23413	if ($ref_to_seq) {
227	1					3	$self->_set_seq_by_ref($ref_to_seq, $alphabet);
228							} else {
229	14901	100				22472	if (defined $seq) {
230							# Note: the sequence string may be empty
231	14189					22432	$self->seq($seq);
232							}
233							}
234
235	14900	100				28823	$desc && $self->desc($desc);
236	14900	100				21908	$description && $self->description($description);
237	14900	100				21629	$ns && $self->namespace($ns);
238	14900	100				21017	$auth && $self->authority($auth);
239							# Any variable that can have a value "0" must be tested with defined
240							# or it will fail to be added to the new object
241	14900	100				22094	defined($v) && $self->version($v);
242	14900	50				23177	defined($oid) && $self->object_id($oid);
243	14900	100				21161	defined($is_circular) && $self->is_circular($is_circular);
244
245	14900					46894	return $self;
246							}
247
248
249							=head2 seq
250
251							Title : seq
252							Usage : $string = $seqobj->seq();
253							Function: Get or set the sequence as a string of letters. The case of
254							the letters is left up to the implementer. Suggested cases are
255							upper case for proteins and lower case for DNA sequence (IUPAC
256							standard), but you should not rely on this. An error is thrown if
257							the sequence contains invalid characters: see validate_seq().
258							Returns : A scalar
259							Args : - Optional new sequence value (a string) to set
260							- Optional alphabet (it is guessed by default)
261
262							=cut
263
264							sub seq {
265	168331			168331	1	253516	my ($self, @args) = @_;
266
267	168331	100				233023	if( scalar @args == 0 ) {
268	140950					342302	return $self->{'seq'};
269							}
270
271	27381					40587	my ($seq_str, $alphabet) = @args;
272	27381	50				41124	if (@args) {
273	27381					44455	$self->_set_seq_by_ref(\$seq_str, $alphabet);
274							}
275
276	27378					44649	return $self->{'seq'};
277							}
278
279
280							sub _set_seq_by_ref {
281							# Set a sequence by reference. A reference is used to avoid the cost of
282							# copying the sequence (which can be very large) between functions.
283	27382			27382		37124	my ($self, $seq_str_ref, $alphabet) = @_;
284
285							# Validate sequence if sequence is not empty and we are not in direct mode
286	27382	100	100			82370	if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
287	27210					45989	$self->validate_seq($$seq_str_ref, 1);
288							}
289	27379					36974	delete $self->{'_direct'}; # next sequence will have to be validated
290
291							# Record sequence length
292	27379		100			50196	my $len = CORE::length($$seq_str_ref \|\| '');
293	27379		100			59229	my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
294							# Note: if the new seq is empty or undef, this is not considered a change
295	27379	100				43842	delete $self->{'_freeze_length'} if $is_changed_seq;
296	27379	100				46339	$self->{'length'} = $len if not exists $self->{'_freeze_length'};
297
298							# Set sequence
299	27379					38134	$self->{'seq'} = $$seq_str_ref;
300
301							# Set or guess alphabet
302	27379	100	100			65173	if ($alphabet) {
		100
303							# Alphabet specified, set it no matter what
304	18					31	$self->alphabet($alphabet);
305							} elsif ($is_changed_seq \|\| (! defined($self->alphabet()))) {
306							# If we changed a previous sequence to a new one or if there is no
307							# alphabet yet at all, we need to guess the (possibly new) alphabet
308	15706					25638	$self->_guess_alphabet();
309							} # else (seq not changed and alphabet was defined) do nothing
310
311	27379					37368	return 1;
312							}
313
314
315							=head2 validate_seq
316
317							Title : validate_seq
318							Usage : if(! $seqobj->validate_seq($seq_str) ) {
319							print "sequence $seq_str is not valid for an object of
320							alphabet ",$seqobj->alphabet, "\n";
321							}
322							Function: Test that the given sequence is valid, i.e. contains only valid
323							characters. The allowed characters are all letters (A-Z) and '-','.',
324							'*','?','=' and '~'. Spaces are not valid. Note that this
325							implementation does not take alphabet() into account and that empty
326							sequences are considered valid.
327							Returns : 1 if the supplied sequence string is valid, 0 otherwise.
328							Args : - Sequence string to be validated
329							- Boolean to optionally throw an error if the sequence is invalid
330
331							=cut
332
333							sub validate_seq {
334	23109			23109	1	33264	my ($self, $seqstr, $throw) = @_;
335	23109	100	100			134477	if ( (defined $seqstr ) &&
336							($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
337	8	100				18	if ($throw) {
338	3		50			8	$self->throw("Failed validation of sequence '".(defined($self->id) \|\|
339							'[unidentified sequence]')."'. Invalid characters were: " .
340							join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
341							}
342	5					25	return 0;
343							}
344	23101					37534	return 1;
345							}
346
347
348							=head2 subseq
349
350							Title : subseq
351							Usage : $substring = $seqobj->subseq(10,40);
352							$substring = $seqobj->subseq(10,40,'nogap');
353							$substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
354							$substring = $seqobj->subseq($location_obj);
355							$substring = $seqobj->subseq($location_obj, -nogap => 1);
356							Function: Return the subseq from start to end, where the first sequence
357							character has coordinate 1 number is inclusive, ie 1-2 are the
358							first two characters of the sequence. The given start coordinate
359							has to be larger than the end, even if the sequence is circular.
360							Returns : a string
361							Args : integer for start position
362							integer for end position
363							OR
364							Bio::LocationI location for subseq (strand honored)
365							Specify -NOGAP=>1 to return subseq with gap characters removed
366							Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
367							with $new_subseq in the sequence object
368
369							=cut
370
371							sub subseq {
372	10043			10043	1	10712	my $self = shift;
373	10043					12891	my @args = @_;
374	10043					24111	my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
375							END
376							NOGAP
377							REPLACE_WITH)], @args);
378
379							# If -replace_with is specified, validate the replacement sequence
380	10043	100				18441	if (defined $replace) {
381	2	100				4	$self->validate_seq( $replace ) \|\|
382							$self->throw("Replacement sequence does not look valid");
383							}
384
385	10042	100	66			29026	if( ref($start) && $start->isa('Bio::LocationI') ) {
		50	33
386	52					60	my $loc = $start;
387	52					75	my $seq = '';
388
389							# For Split objects if Guide Strand is negative,
390							# pass the sublocations in reverse
391	52					59	my $order = 0;
392	52	100				114	if ($loc->isa('Bio::Location::SplitLocationI')) {
393							# guide_strand can return undef, so don't compare directly
394							# to avoid 'uninitialized value' warning
395	44	100				103	my $guide_strand = defined ($loc->guide_strand) ? ($loc->guide_strand) : 0;
396	44	100				82	$order = ($guide_strand == -1) ? -1 : 0;
397							}
398							# Reversing order using ->each_Location(-1) does not work well for
399							# cut by origin-splits (like "complement(join(1900..END,START..50))"),
400							# so use "reverse" instead
401	52	100				128	my @sublocs = ($order == -1) ? reverse $loc->each_Location(): $loc->each_Location;
402	52					70	foreach my $subloc (@sublocs) {
403	120					237	my $piece = $self->subseq(-start => $subloc->start(),
404							-end => $subloc->end(),
405							-replace_with => $replace,
406							-nogap => $nogap);
407	120	100				210	$piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
408
409							# strand can return undef, so don't compare directly
410							# to avoid 'uninitialized value' warning
411	120	100				236	my $strand = defined ($subloc->strand) ? ($subloc->strand) : 0;
412	120	100				219	if ($strand < 0) {
413	59					98	$piece = $self->_revcom_from_string($piece, $self->alphabet);
414							}
415	120					227	$seq .= $piece;
416							}
417	52					224	return $seq;
418							} elsif( defined $start && defined $end ) {
419	9990	50				13464	if( $start > $end ){
420	0					0	$self->throw("Bad start,end parameters. Start [$start] has to be ".
421							"less than end [$end]");
422							}
423	9990	50				12541	if( $start <= 0 ) {
424	0					0	$self->throw("Bad start parameter ($start). Start must be positive.");
425							}
426
427							# Remove one from start, and then length is end-start
428	9990					9376	$start--;
429
430	9990					8557	my $seqstr;
431	9990	100				11248	if (defined $replace) {
432	1					4	$seqstr = substr $self->{seq}, $start, $end-$start, $replace;
433							} else {
434	9989					17443	$seqstr = substr $self->{seq}, $start, $end-$start;
435							}
436
437
438	9990	100				14489	if ($end > $self->length) {
439	1	50				2	if ($self->is_circular) {
440	1					2	my $start = 0;
441	1					3	my $end = $end - $self->length;
442
443	1					2	my $appendstr;
444	1	50				2	if (defined $replace) {
445	0					0	$appendstr = substr $self->{seq}, $start, $end-$start, $replace;
446							} else {
447	1					2	$appendstr = substr $self->{seq}, $start, $end-$start;
448							}
449
450	1					2	$seqstr .= $appendstr;
451							} else {
452	0					0	$self->throw("Bad end parameter ($end). End must be less than ".
453							"the total length of sequence (total=".$self->length.")")
454							}
455							}
456
457	9990	100				13400	$seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
458	9990					21032	return $seqstr;
459
460							} else {
461	0					0	$self->warn("Incorrect parameters to subseq - must be two integers or ".
462							"a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
463	0					0	return;
464							}
465							}
466
467
468							=head2 length
469
470							Title : length
471							Usage : $len = $seqobj->length();
472							Function: Get the stored length of the sequence in number of symbols (bases
473							or amino acids). In some circumstances, you can also set this attribute:
474
475							1. For empty sequences, you can set the length to anything you want:
476							my $seqobj = Bio::PrimarySeq->new( -length => 123 );
477							my $len = $seqobj->len; # 123
478							2. To save memory when using very long sequences, you can set the
479							length of the sequence to the length of the sequence (and nothing
480							else):
481							my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
482							# process $seqobj... then after you're done with it
483							$seqobj->length($seqobj->length);
484							$seqobj->seq(undef); # free memory!
485							my $len = $seqobj->len; # 1 Mbp
486
487							Note that if you set seq() to a value other than undef at any time,
488							the length attribute will be reset.
489							Returns : integer representing the length of the sequence.
490							Args : Optionally, the value on set
491
492							=cut
493
494							sub length {
495	30706			30706	1	43613	my ($self, $val) = @_;
496	30706	100				39800	if (defined $val) {
497	5					10	my $len = $self->{'length'};
498	5	100	100			17	if ($len && ($len != $val)) {
499	1					6	$self->throw("Can not set the length to $val, current length value is $len");
500							}
501	4					7	$self->{'length'} = $val;
502	4					8	$self->{'_freeze_length'} = undef;
503							}
504	30705					49670	return $self->{'length'};
505							}
506
507
508							=head2 display_id
509
510							Title : display_id or display_name
511							Usage : $id_string = $seqobj->display_id();
512							Function: Get or set the display id, aka the common name of the sequence object.
513
514							The semantics of this is that it is the most likely string to
515							be used as an identifier of the sequence, and likely to have
516							"human" readability. The id is equivalent to the ID field of
517							the GenBank/EMBL databanks and the id field of the
518							Swissprot/sptrembl database. In fasta format, the >(\S+) is
519							presumed to be the id, though some people overload the id to
520							embed other information. Bioperl does not use any embedded
521							information in the ID field, and people are encouraged to use
522							other mechanisms (accession field for example, or extending
523							the sequence object) to solve this.
524
525							With the new Bio::DescribeableI interface, display_name aliases
526							to this method.
527							Returns : A string for the display ID
528							Args : Optional string for the display ID to set
529
530							=cut
531
532							sub display_id {
533	23738			23738	1	37244	my ($self, $value) = @_;
534	23738	100				34722	if( defined $value) {
535	13993					21316	$self->{'display_id'} = $value;
536							}
537	23738					42199	return $self->{'display_id'};
538							}
539
540
541							=head2 accession_number
542
543							Title : accession_number or object_id
544							Usage : $unique_key = $seqobj->accession_number;
545							Function: Returns the unique biological id for a sequence, commonly
546							called the accession_number. For sequences from established
547							databases, the implementors should try to use the correct
548							accession number. Notice that primary_id() provides the
549							unique id for the implementation, allowing multiple objects
550							to have the same accession number in a particular implementation.
551
552							For sequences with no accession number, this method should
553							return "unknown".
554
555							[Note this method name is likely to change in 1.3]
556
557							With the new Bio::IdentifiableI interface, this is aliased
558							to object_id
559							Returns : A string
560							Args : A string (optional) for setting
561
562							=cut
563
564							sub accession_number {
565	900			900	1	1935	my( $self, $acc ) = @_;
566	900	100				1794	if (defined $acc) {
567	654					1127	$self->{'accession_number'} = $acc;
568							} else {
569	246					417	$acc = $self->{'accession_number'};
570	246	100				557	$acc = 'unknown' unless defined $acc;
571							}
572	900					1772	return $acc;
573							}
574
575
576							=head2 primary_id
577
578							Title : primary_id
579							Usage : $unique_key = $seqobj->primary_id;
580							Function: Returns the unique id for this object in this
581							implementation. This allows implementations to manage their
582							own object ids in a way the implementation can control
583							clients can expect one id to map to one object.
584
585							For sequences with no natural primary id, this method
586							should return a stringified memory location.
587							Returns : A string
588							Args : A string (optional, for setting)
589
590							=cut
591
592							sub primary_id {
593	511			511	1	913	my $self = shift;
594
595	511	100				1004	if(@_) {
596	453					717	$self->{'primary_id'} = shift;
597							}
598	511	100				1059	if( ! defined($self->{'primary_id'}) ) {
599	21					250	return "$self";
600							}
601	490					667	return $self->{'primary_id'};
602							}
603
604
605							=head2 alphabet
606
607							Title : alphabet
608							Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
609							Function: Get/set the alphabet of sequence, one of
610							'dna', 'rna' or 'protein'. This is case sensitive.
611
612							This is not called because this would cause
613							upgrade problems from the 0.5 and earlier Seq objects.
614							Returns : a string either 'dna','rna','protein'. NB - the object must
615							make a call of the type - if there is no alphabet specified it
616							has to guess.
617							Args : optional string to set : 'dna' \| 'rna' \| 'protein'
618
619
620							=cut
621
622							sub alphabet {
623	54742			54742	1	83427	my ($self,$value) = @_;
624	54742	100				78401	if (defined $value) {
625	27849					40535	$value = lc $value;
626	27849	50				49051	unless ( $valid_type{$value} ) {
627	0					0	$self->throw("Alphabet '$value' is not a valid alphabet (".
628							join(',', map "'$_'", sort keys %valid_type) .") lowercase");
629							}
630	27849					39547	$self->{'alphabet'} = $value;
631							}
632	54742					98214	return $self->{'alphabet'};
633							}
634
635
636							=head2 desc
637
638							Title : desc or description
639							Usage : $seqobj->desc($newval);
640							Function: Get/set description of the sequence.
641
642							'description' is an alias for this for compliance with the
643							Bio::DescribeableI interface.
644							Returns : value of desc (a string)
645							Args : newvalue (a string or undef, optional)
646
647
648							=cut
649
650							sub desc{
651	1711			1711	1	8288	my $self = shift;
652
653	1711	100				4325	return $self->{'desc'} = shift if @_;
654	454					1637	return $self->{'desc'};
655							}
656
657
658							=head2 can_call_new
659
660							Title : can_call_new
661							Usage :
662							Function:
663							Example :
664							Returns : true
665							Args :
666
667							=cut
668
669							sub can_call_new {
670	10			10	1	98	my ($self) = @_;
671
672	10					26	return 1;
673							}
674
675
676							=head2 id
677
678							Title : id
679							Usage : $id = $seqobj->id();
680							Function: This is mapped on display_id
681							Example :
682							Returns :
683							Args :
684
685							=cut
686
687							sub id {
688	9010			9010	1	19459	return shift->display_id(@_);
689							}
690
691
692							=head2 is_circular
693
694							Title : is_circular
695							Usage : if( $seqobj->is_circular) { # Do something }
696							Function: Returns true if the molecule is circular
697							Returns : Boolean value
698							Args : none
699
700							=cut
701
702							sub is_circular{
703	143408			143408	1	140853	my $self = shift;
704	143408	100				189359	return $self->{'is_circular'} = shift if @_;
705	143382					272558	return $self->{'is_circular'};
706							}
707
708
709							=head1 Methods for Bio::IdentifiableI compliance
710
711							=head2 object_id
712
713							Title : object_id
714							Usage : $string = $seqobj->object_id();
715							Function: Get or set a string which represents the stable primary identifier
716							in this namespace of this object. For DNA sequences this
717							is its accession_number, similarly for protein sequences.
718
719							This is aliased to accession_number().
720							Returns : A scalar
721							Args : Optional object ID to set.
722
723							=cut
724
725							sub object_id {
726	4			4	1	10	return shift->accession_number(@_);
727							}
728
729
730							=head2 version
731
732							Title : version
733							Usage : $version = $seqobj->version();
734							Function: Get or set a number which differentiates between versions of
735							the same object. Higher numbers are considered to be
736							later and more relevant, but a single object described
737							the same identifier should represent the same concept.
738							Returns : A number
739							Args : Optional version to set.
740
741							=cut
742
743							sub version{
744	3582			3582	1	4940	my ($self,$value) = @_;
745	3582	100				5374	if( defined $value) {
746	292					761	$self->{'_version'} = $value;
747							}
748	3582					6760	return $self->{'_version'};
749							}
750
751
752							=head2 authority
753
754							Title : authority
755							Usage : $authority = $seqobj->authority();
756							Function: Get or set a string which represents the organisation which
757							granted the namespace, written as the DNS name of the
758							organisation (eg, wormbase.org).
759							Returns : A scalar
760							Args : Optional authority to set.
761
762							=cut
763
764							sub authority {
765	91			91	1	111	my ($self, $value) = @_;
766	91	100				117	if( defined $value) {
767	86					110	$self->{'authority'} = $value;
768							}
769	91					118	return $self->{'authority'};
770							}
771
772
773							=head2 namespace
774
775							Title : namespace
776							Usage : $string = $seqobj->namespace();
777							Function: Get or set a string representing the name space this identifier
778							is valid in, often the database name or the name describing the
779							collection.
780							Returns : A scalar
781							Args : Optional namespace to set.
782
783							=cut
784
785							sub namespace{
786	527			527	1	847	my ($self,$value) = @_;
787	527	100				837	if( defined $value) {
788	493					937	$self->{'namespace'} = $value;
789							}
790	527		100			1154	return $self->{'namespace'} \|\| "";
791							}
792
793
794							=head1 Methods for Bio::DescribableI compliance
795
796							This comprises of display_name and description.
797
798							=head2 display_name
799
800							Title : display_name
801							Usage : $string = $seqobj->display_name();
802							Function: Get or set a string which is what should be displayed to the user.
803							The string should have no spaces (ideally, though a cautious
804							user of this interface would not assume this) and should be
805							less than thirty characters (though again, double checking
806							this is a good idea).
807
808							This is aliased to display_id().
809							Returns : A string for the display name
810							Args : Optional string for the display name to set.
811
812							=cut
813
814							sub display_name {
815	2			2	1	7	return shift->display_id(@_);
816							}
817
818
819							=head2 description
820
821							Title : description
822							Usage : $string = $seqobj->description();
823							Function: Get or set a text string suitable for displaying to the user a
824							description. This string is likely to have spaces, but
825							should not have any newlines or formatting - just plain
826							text. The string should not be greater than 255 characters
827							and clients can feel justified at truncating strings at 255
828							characters for the purposes of display.
829
830							This is aliased to desc().
831							Returns : A string for the description
832							Args : Optional string for the description to set.
833
834							=cut
835
836							sub description {
837	91			91	1	202	return shift->desc(@_);
838							}
839
840
841							=head1 Methods Inherited from Bio::PrimarySeqI
842
843							These methods are available on Bio::PrimarySeq, although they are
844							actually implemented on Bio::PrimarySeqI
845
846							=head2 revcom
847
848							Title : revcom
849							Usage : $rev = $seqobj->revcom();
850							Function: Produces a new Bio::SeqI implementing object which
851							is the reversed complement of the sequence. For protein
852							sequences this throws an exception of
853							"Sequence is a protein. Cannot revcom".
854
855							The id is the same id as the original sequence, and the
856							accession number is also identical. If someone wants to
857							track that this sequence has be reversed, it needs to
858							define its own extensions.
859
860							To do an inplace edit of an object you can go:
861
862							$seqobj = $seqobj->revcom();
863
864							This of course, causes Perl to handle the garbage
865							collection of the old object, but it is roughly speaking as
866							efficient as an inplace edit.
867							Returns : A new (fresh) Bio::SeqI object
868							Args : none
869
870							=head2 trunc
871
872							Title : trunc
873							Usage : $subseq = $myseq->trunc(10,100);
874							Function: Provides a truncation of a sequence,
875							Returns : A fresh Bio::SeqI implementing object.
876							Args : Numbers for the start and end positions
877
878							=head1 Internal methods
879
880							These are internal methods to PrimarySeq
881
882							=head2 _guess_alphabet
883
884							Title : _guess_alphabet
885							Usage :
886							Function: Automatically guess and set the type of sequence: dna, rna, protein
887							or '' if the sequence was empty. This method first removes dots (.),
888							dashes (-) and question marks (?) before guessing the alphabet
889							using the IUPAC conventions for ambiguous residues. Since the DNA and
890							RNA characters are also valid characters for proteins, there is
891							no foolproof way of determining the right alphabet. This is our best
892							guess only!
893							Returns : string 'dna', 'rna', 'protein' or ''.
894							Args : none
895
896							=cut
897
898							sub _guess_alphabet {
899	15781			15781		20920	my ($self) = @_;
900							# Guess alphabet
901	15781					25791	my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
902							# Set alphabet unless it is unknown
903	15781	100				39836	$self->alphabet($alphabet) if $alphabet;
904	15781					20107	return $alphabet;
905							}
906
907
908							sub _guess_alphabet_from_string {
909							# Get the alphabet from a sequence string
910	18778			18778		26845	my ($self, $str, $nowarnonempty) = @_;
911
912	18778	100				29371	$nowarnonempty = 0 if not defined $nowarnonempty;
913
914							# Remove chars that clearly don't denote nucleic or amino acids
915	18778					59226	$str =~ s/[-.?]//gi;
916
917							# Check for sequences without valid letters
918	18778					19706	my $alphabet;
919	18778					21524	my $total = CORE::length($str);
920	18778	100				28552	if( $total == 0 ) {
921	2	100				5	if (not $nowarnonempty) {
922	1					7	$self->warn("Got a sequence without letters. Could not guess alphabet");
923							}
924	2					5	$alphabet = '';
925							}
926
927							# Determine alphabet now
928	18778	100				28130	if (not defined $alphabet) {
929	18776	100				35567	if ($str =~ m/[EFIJLOPQXZ]/i) {
930							# Start with a safe method to find proteins.
931							# Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
932	1758					2464	$alphabet = 'protein';
933							} else {
934							# Alphabet is unsure, could still be DNA, RNA or protein
935							# DNA and RNA contain mostly A, T, U, G, C and N, but the other
936							# letters they use are also among the 15 valid letters that a
937							# protein sequence can contain at this stage. Make our best guess
938							# based on sequence composition. If it contains over 70% of ACGTUN,
939							# it is likely nucleic.
940	17018	100				40208	if( ($str =~ tr/ATUGCNWSKMatugcnwskm//) / $total > 0.7 ) {
941	16011	100				29402	if ( $str =~ m/U/i ) {
942	53					92	$alphabet = 'rna';
943							} else {
944	15958					19898	$alphabet = 'dna';
945							}
946							} else {
947	1007					1435	$alphabet = 'protein';
948							}
949							}
950							}
951
952	18778					31618	return $alphabet;
953							}
954
955
956							############################################################################
957							# aliases due to name changes or to compensate for our lack of consistency #
958							############################################################################
959
960							sub accession {
961	0			0	0		my $self = shift;
962
963	0						$self->warn(ref($self)."::accession is deprecated, ".
964							"use accession_number() instead");
965	0						return $self->accession_number(@_);
966							}
967
968							1;