File Coverage

blib/lib/Bio/ToolBox/SeqFeature.pm
Criterion Covered Total %
statement 190 288 65.9
branch 104 210 49.5
condition 43 105 40.9
subroutine 28 37 75.6
pod 31 32 96.8
total 396 672 58.9


line stmt bran cond sub pod time code
1             package Bio::ToolBox::SeqFeature;
2             our $VERSION = '1.67';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::SeqFeature - Fast, simple SeqFeature implementation
7              
8             =head1 SYNOPSIS
9              
10             # create a transcript
11             my $transcript = Bio::ToolBox::SeqFeature->new(
12             -seq_id => chr1,
13             -start => 1001,
14             -stop => 1500,
15             -strand => '+',
16             );
17             $seqf->primary_tag('mRNA'); # set parameters individually
18            
19             # create an exon
20             my $exon = Bio::ToolBox::SeqFeature->new(
21             -start => 1001,
22             -end => 1200,
23             -type => 'exon',
24             );
25            
26             # associate exon with transcript
27             $transcript->add_SeqFeature($exon);
28             my $exon_strand = $exon->strand; # inherits from parent
29            
30             # walk through subfeatures
31             foreach my $f ($transcript->get_all_SeqFeatures) {
32             printf "%s is a %s\n", $f->display_name, $f->type;
33             }
34            
35             # add attribute
36             $transcript->add_tag_value('Status', $status);
37            
38             # get attribute
39             my $value = $transcript->get_tag_values($key);
40              
41             =head1 DESCRIPTION
42              
43             SeqFeature objects represent functional elements on a genomic or chromosomal
44             sequence, such as genes, transcripts, exons, etc. In many cases, especially
45             genes, they have a hierarchical structure, typically in this order
46              
47             gene
48             mRNA or transcript
49             exon
50             CDS
51              
52             SeqFeature objects have at a minimum coordinate information, including
53             chromosome, start, stop, and strand, and a name or unique identifier. They
54             often also have type or source information, which usually follows the
55             Sequence Ontology key words.
56              
57             This is a fast, efficient, simplified SeqFeature implementation that mostly
58             implements the L API, and could be substituted for other
59             implementations, such L and L.
60             Unlike the others, however, it inherits no classes or methods and uses an
61             unorthodox blessed array to store feature attributes, decreasing memory
62             requirements and overhead complexity.
63              
64             =head1 METHODS
65              
66             Refer to the L documentation for general implementation and
67             ideas, which this module tries to implement.
68              
69             =head2 Creating new SeqFeature objects
70              
71             =over 4
72              
73             =item new
74              
75             Generate a new SeqFeature object. Pass an array of key =E value pairs
76             with the feature parameters. At a minimum, location information (C,
77             C, and C) should be provided, with name and ID recommended.
78             Most of the accession methods may be used as key tags to the C method.
79             The following attribute keys are accepted.
80              
81             =over 4
82              
83             =item -seq_id
84              
85             =item -start
86              
87             =item -end
88              
89             =item -stop
90              
91             =item -strand
92              
93             =item -name
94              
95             =item -display_name
96              
97             =item -id
98              
99             =item -primary_id
100              
101             =item -type
102              
103             =item -source
104              
105             =item -source_tag
106              
107             =item -primary_tag
108              
109             =item -score
110              
111             =item -phase
112              
113             =item -attributes
114              
115             Provide an anonymous array of key value pairs representing attribute
116             keys and their values.
117              
118             =item -segments
119              
120             Provide an anonymous array of SeqFeature objects to add as child
121             objects.
122              
123             =back
124              
125             =item duplicate
126              
127             This method will duplicate a SeqFeature object as a new object, with
128             limitations. The C tag, attribute key/values, and subfeatures
129             are B duplicated. This basically limits the object to coordinates,
130             C, C, C, C, and C.
131             Remaining attributes should be explicitly set by the user.
132              
133             =back
134              
135             =head2 Accession methods
136              
137             These are methods to set and/or retrieve attribute values. Pass a
138             single value to set the attribute. The attribute value is always
139             returned.
140              
141             =over 4
142              
143             =item seq_id
144              
145             The name of the chromosome or reference sequence. If you are generating
146             a new child object, e.g. exon, then seq_id does not need to be provided.
147             In these cases, the parent's seq_id is inherited.
148              
149             =item start
150              
151             The start coordinate of the feature. SeqFeature objects use the 1-base
152             numbering system, following BioPerl convention. Start coordinates are
153             always less than the stop coordinate.
154              
155             =item end
156              
157             =item stop
158              
159             The end coordinate of the feature. Stop coordinates are always greater
160             than the start coordinate.
161              
162             =item strand
163              
164             The strand that the feature is on. The default value is always unstranded,
165             or 0. Any of the following may be supplied: "C<1 0 -1 + . ->". Numeric
166             integers 1, 0, and -1 are always returned.
167              
168             =item source
169              
170             =item source_tag
171              
172             A text string representing the source of the feature. This corresponds to
173             the second field in a GFF file. The source tag is optional. If not supplied,
174             the source tag can be inherited from a parent feature if present.
175              
176             =item primary_tag
177              
178             The type of feature. These usually follow Sequence Ontology terms, but are
179             not required. The default value is the generic term "region". Examples
180             include gene, mRNA, transcript, exon, CDS, etc. This corresponds to the
181             third field in a GFF file.
182              
183             =item type
184              
185             A shortcut method which can represent either "primary_tag:source_tag" or,
186             if no source_tag is defined, simply "primary_tag".
187              
188             =item name
189              
190             =item display_name
191              
192             A text string representing the name of the feature. The name is not
193             required to be a unique value, but generally is. The default name, if
194             none is provided, is the C.
195              
196             =item id
197              
198             =item primary_id
199              
200             A text string representing a unique identifier of the feature. If not
201             explicitly defined, a unique ID is automatically generated.
202              
203             =item score
204              
205             A numeric (integer or floating) value representing the feature.
206              
207             =item phase
208              
209             An integer (0,1,2) representing the coding frame. Only required for
210             CDS features.
211              
212             =back
213              
214             =head2 Special Attributes
215              
216             Special attributes are key value pairs that do not fall under the above
217             conventions. It is possible to have more than one value assigned to a
218             given key. In a GFF file, this corresponds to the attributes in the 9th
219             field, with the exception of special reserved attributes such as C,
220             C, and C.
221              
222             =over 4
223              
224             =item add_tag_value
225              
226             $seqf->add_tag_value($key, $value);
227              
228             Sets the special attribute C<$key> to C<$value>. If you have more than one
229             value, C<$value> should be an anonymous array of text values. Following
230             GFF convention, C<$key> should not comprise of special characters, including
231             ";,= ".
232              
233             =item all_tags
234              
235             =item get_all_tags
236              
237             Returns an array of all attribute keys.
238              
239             =item has_tag($key)
240              
241             Boolean method whether the SeqFeature object contains the attribute.
242              
243             =item get_tag_values($key)
244              
245             =item each_tag_value($key)
246              
247             Returns the value for attribute C<$key>. If multiple values are present,
248             it may return an array or array reference.
249              
250             =item attributes
251              
252             Returns an array or reference to the key value hash;
253              
254             =item remove_tag($key)
255              
256             Deletes the indicated attribute.
257              
258             =back
259              
260             =head2 Subfeature Hierarchy
261              
262             Following Sequence Ontology and GFF conventions, SeqFeatures can have
263             subfeatures (children) representing a hierarchical structure, for example
264             genes beget transcripts which beget exons.
265              
266             Child SeqFeature objects may have more than one parent, for example, shared
267             exons between alternate transcripts. In which case, only one exon
268             SeqFeature object is generated, but is added to both transcript objects.
269              
270             =over 4
271              
272             =item add_SeqFeature
273              
274             =item add_segment
275              
276             Pass one or more SeqFeature objects to be associated as children.
277              
278             =item get_SeqFeatures
279              
280             =item get_all_SeqFeatures
281              
282             =item segments
283              
284             Returns an array of all sub SeqFeature objects.
285              
286             =item delete_SeqFeature($id)
287              
288             This method will delete a specific child SeqFeature. Pass the method the
289             C of the SeqFeature. All SeqFeatures will have a C.
290              
291             =back
292              
293             =head2 Range Methods
294              
295             These are range methods for comparing one SeqFeature object to another.
296             They are analogous to L methods.
297              
298             They currently do not support strand checks or strand options.
299              
300             =over 4
301              
302             =item length
303              
304             Returns the length of the SeqFeature object.
305              
306             =item overlaps
307              
308             my $overlap_check = $seqf1->overlaps($other_seqf);
309              
310             Returns a boolean value whether a second SeqFeature object overlaps
311             with the self object.
312              
313             =item contains
314              
315             my $check = $seqf1->contains($seqf2);
316              
317             Returns a boolean value whether the self object completely
318             contains a second SeqFeature object.
319              
320             =item equals
321              
322             my $check = $seqf1->equals($seqf2);
323              
324             Returns a boolean value whether the self object coordinates are
325             equivalent to a second SeqFeature object.
326              
327             =item intersection
328              
329             my $check = $seqf1->intersection($seqf2);
330              
331             Returns a new SeqFeature object representing the intersection or
332             overlap area between the self object and a second SeqFeature
333             object.
334              
335             =item union($other)
336              
337             my $new_seqf = $seqf1->union($seqf2);
338              
339             Returns a new SeqFeature object representing the merged interval
340             between the self and a second SeqFeature object.
341              
342             =item subtract
343              
344             my $new_seqf = $seqf1->subtract($seqf2);
345              
346             Returns a new SeqFeature object representing the interval of the
347             self object after subtracting a second SeqFeature object.
348              
349             =back
350              
351             =head2 Export Strings
352              
353             These methods export the SeqFeature object as a text string in the
354             specified format. New line characters are included.
355              
356             =over 4
357              
358             =item gff_string($recurse)
359              
360             Exports the SeqFeature object as a GFF3 formatted string. Pass a
361             boolean value if you wish to recurse through the hierarchy and
362             print subfeatures as a multi-line string. Child-Parent ID attributes
363             are smartly handled, including multiple parentage.
364              
365             See L for methods for printing GTF transcript models.
366              
367             =item bed_string
368              
369             Exports the SeqFeature object as a simple BED6 formatted string.
370             See L for methods for printing BED12 transcript models.
371              
372             =back
373              
374             =head1 LIMITATIONS
375              
376             Because of their underlying array structure, Bio::ToolBox::SeqFeature objects
377             should generally not be used as a base class (unless you know the ramifications
378             of doing so). The following Bio classes and Interfaces are similar and their API
379             was used as a model. However, in most cases they are not likely to work with this
380             module because of object structure incompatibility, although this has not been
381             explicitly tested.
382              
383             =over 4
384              
385             =item Bio::AnnotationI
386              
387             =item Bio::LocationI
388              
389             =item Bio::RangeI
390              
391             =item Bio::SeqI
392              
393             =item Bio::Tools::GFF
394              
395             =item Bio::DB::SeqFeature::Store
396              
397             =item Bio::Graphics
398              
399             =back
400              
401             =head1 SEE ALSO
402              
403             L, L, L,
404             L, L, L
405              
406             =cut
407              
408 2     2   14 use strict;
  2         5  
  2         91  
409 2     2   14 use Carp qw(carp cluck croak confess);
  2         31  
  2         188  
410             use constant {
411 2         8441 SEQID => 0,
412             START => 1,
413             STOP => 2,
414             STRND => 3,
415             NAME => 4,
416             ID => 5,
417             TYPE => 6,
418             SRC => 7,
419             SCORE => 8,
420             PHASE => 9,
421             ATTRB => 10,
422             SUBF => 11,
423 2     2   15 };
  2         3  
424             our $IDCOUNT = 0;
425              
426             1;
427              
428             #### Aliases ####
429             # to maintain compatibility with Bio::SeqFeature::Lite and Bio::SeqFeatureI we
430             # put in plenty of aliases to some of the methods
431             *stop = \&end;
432             *name = \&display_name;
433             *id = \&primary_id;
434             *method = \&primary_tag;
435             *source = \&source_tag;
436             *add_segment = \&add_SeqFeature;
437             *get_all_SeqFeatures = *segments = \&get_SeqFeatures;
438             *each_tag_value = \&get_tag_values;
439             *get_all_tags = \&all_tags;
440             *gff3_string = \&gff_string;
441              
442              
443             #### METHODS ####
444             sub new {
445 1658     1658 1 2722 my $class = shift;
446 1658 100       3210 if (ref($class)) {
447 723         1037 $class = ref($class);
448             }
449 1658         7158 my %args = @_;
450 1658         3657 my $self = bless [], $class;
451             # bless ourselves early to take advantage of some methods
452             # but otherwise do what we can simply and quickly
453            
454             # primary options
455             $self->[SEQID] = $args{-seq_id} || $args{-seqid} || $args{'-ref'} ||
456 1658   0     4756 $args{chrom} || undef;
457 1658   50     3589 $self->[START] = $args{-start} || undef;
458 1658   0     3352 $self->[STOP] = $args{-end} || $args{-stop} || undef;
459 1658 50       4596 $self->strand($args{-strand}) if exists $args{-strand};
460 1658   100     5598 $self->[NAME] = $args{-display_name} || $args{-name} || undef;
461 1658   100     5575 $self->[ID] = $args{-primary_id} || $args{-id} || undef;
462            
463             # check orientation
464 1658 50 33     7647 if (defined $self->[START] and defined $self->[STOP] and
      33        
465             $self->[START] > $self->[STOP]
466             ) {
467             # flip the coordinates around
468 0         0 ($self->[START], $self->[STOP]) = ($self->[STOP], $self->[START]);
469 0         0 $self->[STRND] *= -1;
470             }
471            
472             # additional options
473 1658   50     6046 $args{-type} ||= $args{-primary_tag} || undef;
      66        
474 1658 50       3056 if (defined $args{-type}) {
475 1658         3146 $self->type($args{-type});
476             }
477 1658   100     3629 $args{-source} ||= $args{-source_tag} || undef;
      100        
478 1658 100       2944 if (defined $args{-source}) {
479 1638         2562 $self->[SRC] = $args{-source};
480             }
481 1658 100       2885 if (exists $args{-score}) {
482 32         60 $self->[SCORE] = $args{-score};
483             }
484 1658 100       2977 if (exists $args{-phase}) {
485 240         544 $self->phase($args{-phase});
486             }
487 1658 100 66     4769 if (exists $args{-attributes} or exists $args{-tags}) {
488 1   33     4 $args{-attributes} ||= $args{-tags};
489 1 50       4 if (ref($args{-attributes}) eq 'HASH') {
490 1         3 $self->[ATTRB] = $args{-attributes};
491             }
492             }
493 1658 100       3156 if (exists $args{-segments}) {
494 2         5 $self->[SUBF] = [];
495 2         3 foreach my $s (@{ $args{-segments} }) {
  2         8  
496 16 50       28 unless (ref($s) eq $class) {
497 0         0 croak "segments should be passed as $class objects!";
498             }
499 16         23 push @{$self->[SUBF]}, $s;
  16         29  
500             }
501             }
502            
503 1658         5281 return $self;
504             }
505              
506              
507             sub seq_id {
508 1671     1671 1 8941 my $self = shift;
509 1671 50       3003 if (@_) {
510 0         0 $self->[SEQID] = $_[0];
511             }
512 1671 50       4669 return defined $self->[SEQID] ? $self->[SEQID] : undef;
513             }
514              
515             sub start {
516 6490     6490 1 24149 my $self = shift;
517 6490 50       10540 if (@_) {
518 0         0 $self->[START] = $_[0];
519             }
520 6490 50       20247 return defined $self->[START] ? $self->[START] : undef;
521             }
522              
523             sub end {
524 2361     2361 1 3374 my $self = shift;
525 2361 100       3965 if (@_) {
526 9         23 $self->[STOP] = $_[0];
527             }
528 2361 50       6972 return defined $self->[STOP] ? $self->[STOP] : undef;
529             }
530              
531             sub strand {
532 4178     4178 1 5735 my $self = shift;
533 4178 100       7416 if (@_) {
534 1674         2349 my $str = $_[0];
535 1674 100       6921 if ($str eq '+') {
    100          
    100          
    50          
    0          
536 262         463 $self->[STRND] = 1;
537             }
538             elsif ($str eq '-') {
539 116         222 $self->[STRND] = -1;
540             }
541             elsif ($str eq '.') {
542 32         69 $self->[STRND] = 0;
543             }
544             elsif ($str =~ /^[\+\-]?1$/) {
545 1264         2424 $self->[STRND] = $str;
546             }
547             elsif ($str eq '0') {
548 0         0 $self->[STRND] = 0;
549             }
550             }
551 4178 50       9247 return defined $self->[STRND] ? $self->[STRND] : 0;
552             }
553              
554             sub display_name {
555 2188     2188 1 8094 my $self = shift;
556 2188 100       3733 if (@_) {
557 223         426 $self->[NAME] = $_[0];
558             }
559 2188 100       7395 return defined $self->[NAME] ? $self->[NAME] : $self->primary_id;
560             }
561              
562             sub primary_id {
563 2011     2011 1 2969 my $self = shift;
564 2011 100       4767 if (@_) {
    100          
565 235         504 $self->[ID] = $_[0];
566             }
567             elsif (not defined $self->[ID]) {
568             # automatically assign a new ID
569 205         410 $self->[ID] = sprintf("%s_%09d", $self->primary_tag, $IDCOUNT++);
570             }
571 2011         5902 return $self->[ID];
572             }
573              
574             sub primary_tag {
575 9333     9333 1 13121 my $self = shift;
576 9333 100       15681 if (@_) {
577 16         28 $self->[TYPE] = $_[0];
578             }
579 9333   50     14943 $self->[TYPE] ||= 'region';
580 9333         22397 return $self->[TYPE];
581             }
582              
583             sub source_tag {
584 1925     1925 1 2569 my $self = shift;
585 1925 50       3313 if (@_) {
586 0         0 $self->[SRC] = $_[0];
587             }
588 1925 50       4556 return defined $self->[SRC] ? $self->[SRC] : undef;
589             }
590              
591             sub type {
592 1901     1901 1 4488 my $self = shift;
593 1901 100       3437 if (@_) {
594 1658         2287 my $type = $_[0];
595 1658 50       3592 if ($type =~ /:/) {
596 0         0 my ($t, $s) = split /:/, $type;
597 0         0 $self->[TYPE] = $t;
598 0         0 $self->[SRC] = $s;
599             }
600             else {
601 1658         3473 $self->[TYPE] = $type;
602             }
603             }
604 1901   50     3573 $self->[TYPE] ||= undef;
605 1901   100     6328 $self->[SRC] ||= undef;
606 1901 100       3449 if (defined $self->[SRC]) {
607 243         1067 return join(':', $self->[TYPE], $self->[SRC]);
608             }
609 1658         2562 return $self->[TYPE];
610             }
611              
612             sub score {
613 119     119 1 203 my $self = shift;
614 119 100       228 if (@_) {
615 68         142 $self->[SCORE] = $_[0];
616             }
617 119 100       307 return defined $self->[SCORE] ? $self->[SCORE] : undef;
618             }
619              
620             sub phase {
621 420     420 1 713 my $self = shift;
622 420 100       802 if (@_) {
623 356         531 my $p = $_[0];
624 356 50       1079 unless ($p =~ /^[012\.]$/) {
625 0         0 cluck "phase must be 0, 1, 2 or .!";
626             }
627 356         838 $self->[PHASE] = $p;
628             }
629 420 100       1027 return defined $self->[PHASE] ? $self->[PHASE] : undef;
630             }
631              
632             sub duplicate {
633 471     471 1 646 my $self = shift;
634 471         917 my $n = $self->new(
635             -seq_id => $self->[SEQID],
636             -start => $self->[START],
637             -end => $self->[STOP],
638             -strand => $self->strand,
639             -type => $self->primary_tag,
640             -source => $self->source_tag,
641             -display_name => $self->name
642             );
643 471 50       1122 $n->score( $self->[SCORE] ) if defined $self->[SCORE];
644 471 50       861 $n->phase( $self->[PHASE] ) if defined $self->[PHASE];
645 471         936 return $n;
646             }
647              
648             sub add_SeqFeature {
649 773     773 1 1082 my $self = shift;
650 773   100     1989 $self->[SUBF] ||= [];
651 773         1163 my $count = 0;
652 773         1300 foreach my $s (@_) {
653 773 50       1683 if (ref($s) eq ref($self)) {
654 773 50       1515 $s->seq_id($self->seq_id) unless ($s->seq_id);
655 773 50       1557 $s->strand($self->strand) unless ($s->strand);
656 773 50       1384 $s->source_tag($self->source_tag) unless ($s->source_tag);
657 773         1071 push @{ $self->[SUBF] }, $s;
  773         1535  
658 773         1421 $count++;
659             }
660             else {
661 0         0 cluck "please use SeqFeature objects when adding sub features!";
662             }
663             }
664 773         1717 return $count;
665             }
666              
667             sub get_SeqFeatures {
668 1215     1215 1 1658 my $self = shift;
669 1215 100       2341 return unless $self->[SUBF];
670 1067         1414 my @children;
671 1067         1331 foreach (@{ $self->[SUBF] }) {
  1067         1849  
672 10867         14704 push @children, $_;
673             }
674 1067 50       3354 return wantarray ? @children : \@children;
675             }
676              
677             sub delete_SeqFeature {
678 0     0 1 0 my $self = shift;
679 0 0       0 return unless $self->[SUBF];
680 0         0 my $id = shift;
681 0         0 my $d;
682 0         0 my $i = 0;
683 0         0 foreach (@{ $self->[SUBF] }) {
  0         0  
684 0 0       0 if ($_->[ID] eq $id) {
685 0         0 $d = $i;
686 0         0 last;
687             }
688 0         0 $i++;
689             }
690 0 0       0 if (defined $d) {
691 0         0 splice(@{$self->[SUBF]}, $d, 1);
  0         0  
692 0         0 return 1;
693             }
694 0         0 return;
695             }
696              
697             sub add_tag_value {
698 518     518 1 1014 my ($self, $key, $value) = @_;
699 518 100 66     1814 return unless ($key and $value);
700 450   100     1595 $self->[ATTRB] ||= {};
701 450 50       879 if (exists $self->[ATTRB]->{$key}) {
702 0 0       0 if (ref($self->[ATTRB]->{$key}) eq 'ARRAY') {
703 0         0 push @{ $self->[ATTRB]->{$key} }, $value;
  0         0  
704             }
705             else {
706 0         0 my $current = $self->[ATTRB]->{$key};
707 0         0 $self->[ATTRB]->{$key} = [$current, $value];
708             }
709             }
710             else {
711 450         1289 $self->[ATTRB]->{$key} = $value;
712             }
713             }
714              
715             sub has_tag {
716 514     514 1 950 my ($self, $key) = @_;
717 514 100       1271 return unless $self->[ATTRB];
718 365         933 return exists $self->[ATTRB]->{$key};
719             }
720              
721             sub get_tag_values {
722 423     423 1 730 my ($self, $key) = @_;
723 423 100       780 return unless $self->[ATTRB];
724 422 100       747 if (exists $self->[ATTRB]->{$key}) {
725 409 50       754 if (ref($self->[ATTRB]->{$key}) eq 'ARRAY') {
726 0 0       0 return wantarray ? @{ $self->[ATTRB]->{$key} } : $self->[ATTRB]->{$key};
  0         0  
727             }
728             else {
729 409         1061 return $self->[ATTRB]->{$key};
730             }
731             }
732             else {
733 13         36 return;
734             }
735             }
736              
737             sub attributes {
738 1     1 1 2 my $self = shift;
739 1 50       5 return unless $self->[ATTRB];
740 0 0       0 return wantarray ? %{ $self->[ATTRB] } : $self->[ATTRB];
  0         0  
741             }
742              
743             sub all_tags {
744 49     49 1 78 my $self = shift;
745 49 100       124 return unless $self->[ATTRB];
746 15         23 my @k = keys %{ $self->[ATTRB] };
  15         79  
747 15 50       53 return wantarray ? @k : \@k;
748             }
749              
750             sub remove_tag {
751 0     0 1 0 my ($self, $key) = @_;
752 0 0       0 return unless $self->[ATTRB];
753 0 0       0 if (exists $self->[ATTRB]->{$key}) {
754 0         0 delete $self->[ATTRB]->{$key};
755             }
756             }
757              
758              
759              
760             ### Range Methods
761             # Borrowed from Bio::RangeI, but does not do strong/weak strand checks
762              
763             sub length {
764 232     232 1 305 my $self = shift;
765 232         362 return $self->end - $self->start + 1;
766             }
767              
768             sub overlaps {
769 0     0 1 0 my ($self, $other) = @_;
770 0 0 0     0 return unless ($other and ref($other));
771 0 0       0 return unless ($self->seq_id eq $other->seq_id);
772             return not (
773 0   0     0 $self->start > $other->end or
774             $self->end < $other->start
775             );
776             }
777              
778             sub contains {
779 0     0 1 0 my ($self, $other) = @_;
780 0 0 0     0 return unless ($other and ref($other));
781 0 0       0 return unless ($self->seq_id eq $other->seq_id);
782             return (
783 0   0     0 $other->start >= $self->start and
784             $other->end <= $self->end
785             );
786             }
787              
788             sub equals {
789 0     0 1 0 my ($self, $other) = @_;
790 0 0 0     0 return unless ($other and ref($other));
791 0 0       0 return unless ($self->seq_id eq $other->seq_id);
792             return (
793 0   0     0 $other->start == $self->start and
794             $other->end == $self->end
795             );
796             }
797              
798             sub intersection {
799 0     0 1 0 my ($self, $other) = @_;
800 0 0 0     0 return unless ($other and ref($other));
801 0 0       0 return unless ($self->seq_id eq $other->seq_id);
802 0         0 my ($start, $stop);
803 0 0       0 if ($self->start >= $other->start) {
804 0         0 $start = $self->start;
805             }
806             else {
807 0         0 $start = $other->start;
808             }
809 0 0       0 if ($self->end <= $other->end) {
810 0         0 $stop = $self->end;
811             }
812             else {
813 0         0 $stop = $other->end;
814             }
815 0 0       0 return if $start > $stop;
816 0         0 return $self->new(
817             -seq_id => $self->seq_id,
818             -start => $start,
819             -end => $stop,
820             );
821             }
822              
823             sub union {
824 0     0 1 0 my ($self, $other) = @_;
825 0 0 0     0 return unless ($other and ref($other));
826 0 0       0 return unless ($self->seq_id eq $other->seq_id);
827 0         0 my ($start, $stop);
828 0 0       0 if ($self->start <= $other->start) {
829 0         0 $start = $self->start;
830             }
831             else {
832 0         0 $start = $other->start;
833             }
834 0 0       0 if ($self->end >= $other->end) {
835 0         0 $stop = $self->end;
836             }
837             else {
838 0         0 $stop = $other->end;
839             }
840 0 0       0 return if $start > $stop;
841 0         0 return $self->new(
842             -seq_id => $self->seq_id,
843             -start => $start,
844             -end => $stop,
845             );
846             }
847              
848             sub subtract {
849 0     0 1 0 my ($self, $other) = @_;
850 0 0 0     0 return unless ($other and ref($other));
851 0 0       0 return unless ($self->seq_id eq $other->seq_id);
852 0 0       0 return if $self->equals($other);
853 0         0 my @pieces;
854 0 0       0 if ($self->overlaps($other)) {
855 0         0 my $int = $self->intersection($other);
856 0 0       0 if ($self->start < $int->start) {
857 0         0 push @pieces, $self->new(
858             -seq_id => $self->seq_id,
859             -start => $self->start,
860             -end => $int->start - 1,
861             );
862             }
863 0 0       0 if ($self->end > $int->end) {
864 0         0 push @pieces, $self->new(
865             -seq_id => $self->seq_id,
866             -start => $int->end + 1,
867             -end => $self->end,
868             );
869             }
870             }
871             else {
872 0         0 push @pieces, $self->new(
873             -seq_id => $self->seq_id,
874             -start => $self->start,
875             -end => $self->end,
876             );
877             }
878 0 0       0 return wantarray ? @pieces : \@pieces;
879             }
880              
881              
882             ### Export methods
883              
884             sub version {
885 2     2 0 7 my $self = shift;
886 2 50 33     20 if (@_ and $_[0] ne '3') {
887 0         0 carp "sorry, only GFF version 3 is currently supported!";
888             }
889 2         6 return 3;
890             }
891              
892             sub gff_string {
893             # exports version 3 GFF, sorry, no mechanism for others....
894 101     101 1 134 my $self = shift;
895 101   50     178 my $recurse = shift || 0;
896 101   100     191 my $childIDs = shift || undef;
897            
898             # skip myself if we've already printed myself
899             return if ($recurse and $childIDs and
900 101 100 66     712 not exists $childIDs->{ $self->primary_id } );
      100        
901            
902             # map all parent-child relationships
903 49 100 66     143 if ($recurse and not defined $childIDs) {
904             # we have a top level SeqFeature and we need to go spelunking for IDs
905 2         5 $childIDs = {};
906 2         6 foreach my $f ($self->get_SeqFeatures) {
907 14         29 $f->_spelunk($self->primary_id, $childIDs);
908             }
909             }
910            
911             # basics
912 49 50 50     96 my $string = join("\t", (
    0 50        
    50 50        
    100 50        
913             $self->seq_id || '.',
914             $self->source_tag || '.',
915             $self->primary_tag,
916             $self->start || 1,
917             $self->end || 1,
918             defined $self->score ? $self->score : '.',
919             $self->strand > 0 ? '+' : $self->strand < 0 ? '-' : '.',
920             defined $self->phase ? $self->phase : '.',
921             ) );
922            
923             # group attributes
924 49         97 my $attributes;
925 49         88 my $name = $self->display_name;
926 49 50       87 if ($name) {
927             # names are optional and may not exist
928 49         102 $attributes = sprintf "Name=%s;ID=%s", $self->_encode($name),
929             $self->_encode( $self->primary_id );
930             }
931             else {
932             # IDs always exist
933 0         0 $attributes = sprintf "ID=%s", $self->_encode( $self->primary_id );
934             }
935 49 50       117 if ($childIDs) {
936 49 100       82 if (exists $childIDs->{$self->primary_id}) {
937             $attributes .= ';Parent=' . join(',',
938 102         151 sort {$a cmp $b}
939 99         160 map { $self->_encode($_) }
940 47         73 keys %{ $childIDs->{$self->primary_id} });
  47         73  
941 47         106 delete $childIDs->{$self->primary_id};
942             }
943             }
944 49         109 foreach my $tag ($self->all_tags) {
945 41 50       82 next if $tag eq 'Name';
946 41 50       75 next if $tag eq 'ID';
947 41 100       78 next if $tag eq 'Parent';
948 27         49 my $value = $self->get_tag_values($tag);
949 27 50       51 if (ref($value) eq 'ARRAY') {
950 0         0 $value = join(",", map { $self->_encode($_) } @$value);
  0         0  
951             }
952             else {
953 27         58 $value = $self->_encode($value);
954             }
955 27         60 $attributes .= ";$tag=$value";
956             }
957 49         167 $string .= "\t$attributes\n";
958            
959             # recurse
960 49 50       90 if ($recurse) {
961 49         84 foreach my $f ($self->get_SeqFeatures) {
962 99         187 my $s = $f->gff_string(1, $childIDs);
963 99 100       250 $string .= $s if $s;
964             }
965             }
966 49         138 return $string;
967             }
968              
969             sub _encode {
970 224     224   360 my ($self, $value) = @_;
971 224         376 $value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
972 224         534 return $value;
973             }
974              
975             sub _spelunk {
976 99     99   167 my ($self, $parentID, $childIDs) = @_;
977 99         162 $childIDs->{$self->primary_id}{$parentID} += 1;
978 99         167 foreach my $f ($self->get_SeqFeatures) {
979 85         141 $f->_spelunk($self->primary_id, $childIDs);
980             }
981             }
982              
983             sub bed_string {
984 0     0 1   my $self = shift;
985 0 0 0       my $string = join("\t", (
    0 0        
      0        
      0        
986             $self->seq_id || '.',
987             ($self->start || 1) - 1,
988             $self->end || 1,
989             $self->display_name || $self->primary_id,
990             defined $self->score ? $self->score : 0,
991             $self->strand >= 0 ? '+' : '-',
992             ) );
993 0           return "$string\n";
994             }
995              
996              
997             __END__