File Coverage

blib/lib/Bio/ToolBox/SeqFeature.pm
Criterion Covered Total %
statement 191 288 66.3
branch 107 210 50.9
condition 43 105 40.9
subroutine 28 37 75.6
pod 31 32 96.8
total 400 672 59.5


line stmt bran cond sub pod time code
1             package Bio::ToolBox::SeqFeature;
2             our $VERSION = '1.67';
3              
4             =head1 NAME
5              
6             Bio::ToolBox::SeqFeature - Fast, simple SeqFeature implementation
7              
8             =head1 SYNOPSIS
9              
10             # create a transcript
11             my $transcript = Bio::ToolBox::SeqFeature->new(
12             -seq_id => chr1,
13             -start => 1001,
14             -stop => 1500,
15             -strand => '+',
16             );
17             $seqf->primary_tag('mRNA'); # set parameters individually
18            
19             # create an exon
20             my $exon = Bio::ToolBox::SeqFeature->new(
21             -start => 1001,
22             -end => 1200,
23             -type => 'exon',
24             );
25            
26             # associate exon with transcript
27             $transcript->add_SeqFeature($exon);
28             my $exon_strand = $exon->strand; # inherits from parent
29            
30             # walk through subfeatures
31             foreach my $f ($transcript->get_all_SeqFeatures) {
32             printf "%s is a %s\n", $f->display_name, $f->type;
33             }
34            
35             # add attribute
36             $transcript->add_tag_value('Status', $status);
37            
38             # get attribute
39             my $value = $transcript->get_tag_values($key);
40              
41             =head1 DESCRIPTION
42              
43             SeqFeature objects represent functional elements on a genomic or chromosomal
44             sequence, such as genes, transcripts, exons, etc. In many cases, especially
45             genes, they have a hierarchical structure, typically in this order
46              
47             gene
48             mRNA or transcript
49             exon
50             CDS
51              
52             SeqFeature objects have at a minimum coordinate information, including
53             chromosome, start, stop, and strand, and a name or unique identifier. They
54             often also have type or source information, which usually follows the
55             Sequence Ontology key words.
56              
57             This is a fast, efficient, simplified SeqFeature implementation that mostly
58             implements the L API, and could be substituted for other
59             implementations, such L and L.
60             Unlike the others, however, it inherits no classes or methods and uses an
61             unorthodox blessed array to store feature attributes, decreasing memory
62             requirements and overhead complexity.
63              
64             =head1 METHODS
65              
66             Refer to the L documentation for general implementation and
67             ideas, which this module tries to implement.
68              
69             =head2 Creating new SeqFeature objects
70              
71             =over 4
72              
73             =item new
74              
75             Generate a new SeqFeature object. Pass an array of key =E value pairs
76             with the feature parameters. At a minimum, location information (C,
77             C, and C) should be provided, with name and ID recommended.
78             Most of the accession methods may be used as key tags to the C method.
79             The following attribute keys are accepted.
80              
81             =over 4
82              
83             =item -seq_id
84              
85             =item -start
86              
87             =item -end
88              
89             =item -stop
90              
91             =item -strand
92              
93             =item -name
94              
95             =item -display_name
96              
97             =item -id
98              
99             =item -primary_id
100              
101             =item -type
102              
103             =item -source
104              
105             =item -source_tag
106              
107             =item -primary_tag
108              
109             =item -score
110              
111             =item -phase
112              
113             =item -attributes
114              
115             Provide an anonymous array of key value pairs representing attribute
116             keys and their values.
117              
118             =item -segments
119              
120             Provide an anonymous array of SeqFeature objects to add as child
121             objects.
122              
123             =back
124              
125             =item duplicate
126              
127             This method will duplicate a SeqFeature object as a new object, with
128             limitations. The C tag, attribute key/values, and subfeatures
129             are B duplicated. This basically limits the object to coordinates,
130             C, C, C, C, and C.
131             Remaining attributes should be explicitly set by the user.
132              
133             =back
134              
135             =head2 Accession methods
136              
137             These are methods to set and/or retrieve attribute values. Pass a
138             single value to set the attribute. The attribute value is always
139             returned.
140              
141             =over 4
142              
143             =item seq_id
144              
145             The name of the chromosome or reference sequence. If you are generating
146             a new child object, e.g. exon, then seq_id does not need to be provided.
147             In these cases, the parent's seq_id is inherited.
148              
149             =item start
150              
151             The start coordinate of the feature. SeqFeature objects use the 1-base
152             numbering system, following BioPerl convention. Start coordinates are
153             always less than the stop coordinate.
154              
155             =item end
156              
157             =item stop
158              
159             The end coordinate of the feature. Stop coordinates are always greater
160             than the start coordinate.
161              
162             =item strand
163              
164             The strand that the feature is on. The default value is always unstranded,
165             or 0. Any of the following may be supplied: "C<1 0 -1 + . ->". Numeric
166             integers 1, 0, and -1 are always returned.
167              
168             =item source
169              
170             =item source_tag
171              
172             A text string representing the source of the feature. This corresponds to
173             the second field in a GFF file. The source tag is optional. If not supplied,
174             the source tag can be inherited from a parent feature if present.
175              
176             =item primary_tag
177              
178             The type of feature. These usually follow Sequence Ontology terms, but are
179             not required. The default value is the generic term "region". Examples
180             include gene, mRNA, transcript, exon, CDS, etc. This corresponds to the
181             third field in a GFF file.
182              
183             =item type
184              
185             A shortcut method which can represent either "primary_tag:source_tag" or,
186             if no source_tag is defined, simply "primary_tag".
187              
188             =item name
189              
190             =item display_name
191              
192             A text string representing the name of the feature. The name is not
193             required to be a unique value, but generally is. The default name, if
194             none is provided, is the C.
195              
196             =item id
197              
198             =item primary_id
199              
200             A text string representing a unique identifier of the feature. If not
201             explicitly defined, a unique ID is automatically generated.
202              
203             =item score
204              
205             A numeric (integer or floating) value representing the feature.
206              
207             =item phase
208              
209             An integer (0,1,2) representing the coding frame. Only required for
210             CDS features.
211              
212             =back
213              
214             =head2 Special Attributes
215              
216             Special attributes are key value pairs that do not fall under the above
217             conventions. It is possible to have more than one value assigned to a
218             given key. In a GFF file, this corresponds to the attributes in the 9th
219             field, with the exception of special reserved attributes such as C,
220             C, and C.
221              
222             =over 4
223              
224             =item add_tag_value
225              
226             $seqf->add_tag_value($key, $value);
227              
228             Sets the special attribute C<$key> to C<$value>. If you have more than one
229             value, C<$value> should be an anonymous array of text values. Following
230             GFF convention, C<$key> should not comprise of special characters, including
231             ";,= ".
232              
233             =item all_tags
234              
235             =item get_all_tags
236              
237             Returns an array of all attribute keys.
238              
239             =item has_tag($key)
240              
241             Boolean method whether the SeqFeature object contains the attribute.
242              
243             =item get_tag_values($key)
244              
245             =item each_tag_value($key)
246              
247             Returns the value for attribute C<$key>. If multiple values are present,
248             it may return an array or array reference.
249              
250             =item attributes
251              
252             Returns an array or reference to the key value hash;
253              
254             =item remove_tag($key)
255              
256             Deletes the indicated attribute.
257              
258             =back
259              
260             =head2 Subfeature Hierarchy
261              
262             Following Sequence Ontology and GFF conventions, SeqFeatures can have
263             subfeatures (children) representing a hierarchical structure, for example
264             genes beget transcripts which beget exons.
265              
266             Child SeqFeature objects may have more than one parent, for example, shared
267             exons between alternate transcripts. In which case, only one exon
268             SeqFeature object is generated, but is added to both transcript objects.
269              
270             =over 4
271              
272             =item add_SeqFeature
273              
274             =item add_segment
275              
276             Pass one or more SeqFeature objects to be associated as children.
277              
278             =item get_SeqFeatures
279              
280             =item get_all_SeqFeatures
281              
282             =item segments
283              
284             Returns an array of all sub SeqFeature objects.
285              
286             =item delete_SeqFeature($id)
287              
288             This method will delete a specific child SeqFeature. Pass the method the
289             C of the SeqFeature. All SeqFeatures will have a C.
290              
291             =back
292              
293             =head2 Range Methods
294              
295             These are range methods for comparing one SeqFeature object to another.
296             They are analogous to L methods.
297              
298             They currently do not support strand checks or strand options.
299              
300             =over 4
301              
302             =item length
303              
304             Returns the length of the SeqFeature object.
305              
306             =item overlaps
307              
308             my $overlap_check = $seqf1->overlaps($other_seqf);
309              
310             Returns a boolean value whether a second SeqFeature object overlaps
311             with the self object.
312              
313             =item contains
314              
315             my $check = $seqf1->contains($seqf2);
316              
317             Returns a boolean value whether the self object completely
318             contains a second SeqFeature object.
319              
320             =item equals
321              
322             my $check = $seqf1->equals($seqf2);
323              
324             Returns a boolean value whether the self object coordinates are
325             equivalent to a second SeqFeature object.
326              
327             =item intersection
328              
329             my $check = $seqf1->intersection($seqf2);
330              
331             Returns a new SeqFeature object representing the intersection or
332             overlap area between the self object and a second SeqFeature
333             object.
334              
335             =item union($other)
336              
337             my $new_seqf = $seqf1->union($seqf2);
338              
339             Returns a new SeqFeature object representing the merged interval
340             between the self and a second SeqFeature object.
341              
342             =item subtract
343              
344             my $new_seqf = $seqf1->subtract($seqf2);
345              
346             Returns a new SeqFeature object representing the interval of the
347             self object after subtracting a second SeqFeature object.
348              
349             =back
350              
351             =head2 Export Strings
352              
353             These methods export the SeqFeature object as a text string in the
354             specified format. New line characters are included.
355              
356             =over 4
357              
358             =item gff_string($recurse)
359              
360             Exports the SeqFeature object as a GFF3 formatted string. Pass a
361             boolean value if you wish to recurse through the hierarchy and
362             print subfeatures as a multi-line string. Child-Parent ID attributes
363             are smartly handled, including multiple parentage.
364              
365             See L for methods for printing GTF transcript models.
366              
367             =item bed_string
368              
369             Exports the SeqFeature object as a simple BED6 formatted string.
370             See L for methods for printing BED12 transcript models.
371              
372             =back
373              
374             =head1 LIMITATIONS
375              
376             Because of their underlying array structure, Bio::ToolBox::SeqFeature objects
377             should generally not be used as a base class (unless you know the ramifications
378             of doing so). The following Bio classes and Interfaces are similar and their API
379             was used as a model. However, in most cases they are not likely to work with this
380             module because of object structure incompatibility, although this has not been
381             explicitly tested.
382              
383             =over 4
384              
385             =item Bio::AnnotationI
386              
387             =item Bio::LocationI
388              
389             =item Bio::RangeI
390              
391             =item Bio::SeqI
392              
393             =item Bio::Tools::GFF
394              
395             =item Bio::DB::SeqFeature::Store
396              
397             =item Bio::Graphics
398              
399             =back
400              
401             =head1 SEE ALSO
402              
403             L, L, L,
404             L, L, L
405              
406             =cut
407              
408 2     2   11 use strict;
  2         2  
  2         63  
409 2     2   9 use Carp qw(carp cluck croak confess);
  2         23  
  2         148  
410             use constant {
411 2         5828 SEQID => 0,
412             START => 1,
413             STOP => 2,
414             STRND => 3,
415             NAME => 4,
416             ID => 5,
417             TYPE => 6,
418             SRC => 7,
419             SCORE => 8,
420             PHASE => 9,
421             ATTRB => 10,
422             SUBF => 11,
423 2     2   9 };
  2         4  
424             our $IDCOUNT = 0;
425              
426             1;
427              
428             #### Aliases ####
429             # to maintain compatibility with Bio::SeqFeature::Lite and Bio::SeqFeatureI we
430             # put in plenty of aliases to some of the methods
431             *stop = \&end;
432             *name = \&display_name;
433             *id = \&primary_id;
434             *method = \&primary_tag;
435             *source = \&source_tag;
436             *add_segment = \&add_SeqFeature;
437             *get_all_SeqFeatures = *segments = \&get_SeqFeatures;
438             *each_tag_value = \&get_tag_values;
439             *get_all_tags = \&all_tags;
440             *gff3_string = \&gff_string;
441              
442              
443             #### METHODS ####
444             sub new {
445 1658     1658 1 1816 my $class = shift;
446 1658 100       2373 if (ref($class)) {
447 723         777 $class = ref($class);
448             }
449 1658         4792 my %args = @_;
450 1658         2739 my $self = bless [], $class;
451             # bless ourselves early to take advantage of some methods
452             # but otherwise do what we can simply and quickly
453            
454             # primary options
455             $self->[SEQID] = $args{-seq_id} || $args{-seqid} || $args{'-ref'} ||
456 1658   0     3185 $args{chrom} || undef;
457 1658   50     2647 $self->[START] = $args{-start} || undef;
458 1658   0     2452 $self->[STOP] = $args{-end} || $args{-stop} || undef;
459 1658 50       3528 $self->strand($args{-strand}) if exists $args{-strand};
460 1658   100     3928 $self->[NAME] = $args{-display_name} || $args{-name} || undef;
461 1658   100     4008 $self->[ID] = $args{-primary_id} || $args{-id} || undef;
462            
463             # check orientation
464 1658 50 33     5313 if (defined $self->[START] and defined $self->[STOP] and
      33        
465             $self->[START] > $self->[STOP]
466             ) {
467             # flip the coordinates around
468 0         0 ($self->[START], $self->[STOP]) = ($self->[STOP], $self->[START]);
469 0         0 $self->[STRND] *= -1;
470             }
471            
472             # additional options
473 1658   50     4355 $args{-type} ||= $args{-primary_tag} || undef;
      66        
474 1658 50       2182 if (defined $args{-type}) {
475 1658         2362 $self->type($args{-type});
476             }
477 1658   100     2433 $args{-source} ||= $args{-source_tag} || undef;
      100        
478 1658 100       2112 if (defined $args{-source}) {
479 1638         1793 $self->[SRC] = $args{-source};
480             }
481 1658 100       2101 if (exists $args{-score}) {
482 32         64 $self->[SCORE] = $args{-score};
483             }
484 1658 100       2159 if (exists $args{-phase}) {
485 240         371 $self->phase($args{-phase});
486             }
487 1658 100 66     3802 if (exists $args{-attributes} or exists $args{-tags}) {
488 1   33     5 $args{-attributes} ||= $args{-tags};
489 1 50       4 if (ref($args{-attributes}) eq 'HASH') {
490 1         3 $self->[ATTRB] = $args{-attributes};
491             }
492             }
493 1658 100       2082 if (exists $args{-segments}) {
494 2         9 $self->[SUBF] = [];
495 2         4 foreach my $s (@{ $args{-segments} }) {
  2         7  
496 16 50       25 unless (ref($s) eq $class) {
497 0         0 croak "segments should be passed as $class objects!";
498             }
499 16         15 push @{$self->[SUBF]}, $s;
  16         21  
500             }
501             }
502            
503 1658         3685 return $self;
504             }
505              
506              
507             sub seq_id {
508 1671     1671 1 6136 my $self = shift;
509 1671 50       2186 if (@_) {
510 0         0 $self->[SEQID] = $_[0];
511             }
512 1671 50       3504 return defined $self->[SEQID] ? $self->[SEQID] : undef;
513             }
514              
515             sub start {
516 6485     6485 1 17576 my $self = shift;
517 6485 50       7762 if (@_) {
518 0         0 $self->[START] = $_[0];
519             }
520 6485 50       14716 return defined $self->[START] ? $self->[START] : undef;
521             }
522              
523             sub end {
524 2362     2362 1 2415 my $self = shift;
525 2362 100       3090 if (@_) {
526 9         14 $self->[STOP] = $_[0];
527             }
528 2362 50       5097 return defined $self->[STOP] ? $self->[STOP] : undef;
529             }
530              
531             sub strand {
532 4245     4245 1 4084 my $self = shift;
533 4245 100       5317 if (@_) {
534 1699         1699 my $str = $_[0];
535 1699 100       5038 if ($str eq '+') {
    100          
    100          
    100          
    50          
536 262         325 $self->[STRND] = 1;
537             }
538             elsif ($str eq '-') {
539 116         159 $self->[STRND] = -1;
540             }
541             elsif ($str eq '.') {
542 16         26 $self->[STRND] = 0;
543             }
544             elsif ($str =~ /^[\+\-]?1$/) {
545 1207         1731 $self->[STRND] = $str;
546             }
547             elsif ($str eq '0') {
548 98         124 $self->[STRND] = 0;
549             }
550             }
551 4245 50       6851 return defined $self->[STRND] ? $self->[STRND] : 0;
552             }
553              
554             sub display_name {
555 2189     2189 1 5981 my $self = shift;
556 2189 100       2706 if (@_) {
557 223         279 $self->[NAME] = $_[0];
558             }
559 2189 100       4966 return defined $self->[NAME] ? $self->[NAME] : $self->primary_id;
560             }
561              
562             sub primary_id {
563 2011     2011 1 2029 my $self = shift;
564 2011 100       3361 if (@_) {
    100          
565 235         287 $self->[ID] = $_[0];
566             }
567             elsif (not defined $self->[ID]) {
568             # automatically assign a new ID
569 205         332 $self->[ID] = sprintf("%s_%09d", $self->primary_tag, $IDCOUNT++);
570             }
571 2011         4163 return $self->[ID];
572             }
573              
574             sub primary_tag {
575 9333     9333 1 9581 my $self = shift;
576 9333 100       10911 if (@_) {
577 16         18 $self->[TYPE] = $_[0];
578             }
579 9333   50     11410 $self->[TYPE] ||= 'region';
580 9333         16416 return $self->[TYPE];
581             }
582              
583             sub source_tag {
584 1925     1925 1 1961 my $self = shift;
585 1925 50       2360 if (@_) {
586 0         0 $self->[SRC] = $_[0];
587             }
588 1925 50       3335 return defined $self->[SRC] ? $self->[SRC] : undef;
589             }
590              
591             sub type {
592 1901     1901 1 3395 my $self = shift;
593 1901 100       2556 if (@_) {
594 1658         1636 my $type = $_[0];
595 1658 50       2532 if ($type =~ /:/) {
596 0         0 my ($t, $s) = split /:/, $type;
597 0         0 $self->[TYPE] = $t;
598 0         0 $self->[SRC] = $s;
599             }
600             else {
601 1658         2607 $self->[TYPE] = $type;
602             }
603             }
604 1901   50     2446 $self->[TYPE] ||= undef;
605 1901   100     4107 $self->[SRC] ||= undef;
606 1901 100       2509 if (defined $self->[SRC]) {
607 243         834 return join(':', $self->[TYPE], $self->[SRC]);
608             }
609 1658         1732 return $self->[TYPE];
610             }
611              
612             sub score {
613 119     119 1 126 my $self = shift;
614 119 100       190 if (@_) {
615 68         97 $self->[SCORE] = $_[0];
616             }
617 119 100       244 return defined $self->[SCORE] ? $self->[SCORE] : undef;
618             }
619              
620             sub phase {
621 420     420 1 497 my $self = shift;
622 420 100       578 if (@_) {
623 356         353 my $p = $_[0];
624 356 50       734 unless ($p =~ /^[012\.]$/) {
625 0         0 cluck "phase must be 0, 1, 2 or .!";
626             }
627 356         617 $self->[PHASE] = $p;
628             }
629 420 100       668 return defined $self->[PHASE] ? $self->[PHASE] : undef;
630             }
631              
632             sub duplicate {
633 471     471 1 464 my $self = shift;
634 471         621 my $n = $self->new(
635             -seq_id => $self->[SEQID],
636             -start => $self->[START],
637             -end => $self->[STOP],
638             -strand => $self->strand,
639             -type => $self->primary_tag,
640             -source => $self->source_tag,
641             -display_name => $self->name
642             );
643 471 50       851 $n->score( $self->[SCORE] ) if defined $self->[SCORE];
644 471 50       546 $n->phase( $self->[PHASE] ) if defined $self->[PHASE];
645 471         666 return $n;
646             }
647              
648             sub add_SeqFeature {
649 773     773 1 776 my $self = shift;
650 773   100     1414 $self->[SUBF] ||= [];
651 773         770 my $count = 0;
652 773         937 foreach my $s (@_) {
653 773 50       1130 if (ref($s) eq ref($self)) {
654 773 50       977 $s->seq_id($self->seq_id) unless ($s->seq_id);
655 773 100       1159 $s->strand($self->strand) unless ($s->strand);
656 773 50       1044 $s->source_tag($self->source_tag) unless ($s->source_tag);
657 773         743 push @{ $self->[SUBF] }, $s;
  773         1040  
658 773         975 $count++;
659             }
660             else {
661 0         0 cluck "please use SeqFeature objects when adding sub features!";
662             }
663             }
664 773         1540 return $count;
665             }
666              
667             sub get_SeqFeatures {
668 1215     1215 1 1331 my $self = shift;
669 1215 100       1677 return unless $self->[SUBF];
670 1067         989 my @children;
671 1067         964 foreach (@{ $self->[SUBF] }) {
  1067         1358  
672 10867         10690 push @children, $_;
673             }
674 1067 50       2498 return wantarray ? @children : \@children;
675             }
676              
677             sub delete_SeqFeature {
678 0     0 1 0 my $self = shift;
679 0 0       0 return unless $self->[SUBF];
680 0         0 my $id = shift;
681 0         0 my $d;
682 0         0 my $i = 0;
683 0         0 foreach (@{ $self->[SUBF] }) {
  0         0  
684 0 0       0 if ($_->[ID] eq $id) {
685 0         0 $d = $i;
686 0         0 last;
687             }
688 0         0 $i++;
689             }
690 0 0       0 if (defined $d) {
691 0         0 splice(@{$self->[SUBF]}, $d, 1);
  0         0  
692 0         0 return 1;
693             }
694 0         0 return;
695             }
696              
697             sub add_tag_value {
698 518     518 1 741 my ($self, $key, $value) = @_;
699 518 100 66     1157 return unless ($key and $value);
700 450   100     1119 $self->[ATTRB] ||= {};
701 450 50       675 if (exists $self->[ATTRB]->{$key}) {
702 0 0       0 if (ref($self->[ATTRB]->{$key}) eq 'ARRAY') {
703 0         0 push @{ $self->[ATTRB]->{$key} }, $value;
  0         0  
704             }
705             else {
706 0         0 my $current = $self->[ATTRB]->{$key};
707 0         0 $self->[ATTRB]->{$key} = [$current, $value];
708             }
709             }
710             else {
711 450         855 $self->[ATTRB]->{$key} = $value;
712             }
713             }
714              
715             sub has_tag {
716 516     516 1 647 my ($self, $key) = @_;
717 516 100       814 return unless $self->[ATTRB];
718 367         683 return exists $self->[ATTRB]->{$key};
719             }
720              
721             sub get_tag_values {
722 425     425 1 515 my ($self, $key) = @_;
723 425 100       586 return unless $self->[ATTRB];
724 424 100       566 if (exists $self->[ATTRB]->{$key}) {
725 411 50       531 if (ref($self->[ATTRB]->{$key}) eq 'ARRAY') {
726 0 0       0 return wantarray ? @{ $self->[ATTRB]->{$key} } : $self->[ATTRB]->{$key};
  0         0  
727             }
728             else {
729 411         757 return $self->[ATTRB]->{$key};
730             }
731             }
732             else {
733 13         28 return;
734             }
735             }
736              
737             sub attributes {
738 1     1 1 3 my $self = shift;
739 1 50       13 return unless $self->[ATTRB];
740 0 0       0 return wantarray ? %{ $self->[ATTRB] } : $self->[ATTRB];
  0         0  
741             }
742              
743             sub all_tags {
744 49     49 1 48 my $self = shift;
745 49 100       76 return unless $self->[ATTRB];
746 15         15 my @k = keys %{ $self->[ATTRB] };
  15         36  
747 15 50       33 return wantarray ? @k : \@k;
748             }
749              
750             sub remove_tag {
751 0     0 1 0 my ($self, $key) = @_;
752 0 0       0 return unless $self->[ATTRB];
753 0 0       0 if (exists $self->[ATTRB]->{$key}) {
754 0         0 delete $self->[ATTRB]->{$key};
755             }
756             }
757              
758              
759              
760             ### Range Methods
761             # Borrowed from Bio::RangeI, but does not do strong/weak strand checks
762              
763             sub length {
764 232     232 1 222 my $self = shift;
765 232         328 return $self->end - $self->start + 1;
766             }
767              
768             sub overlaps {
769 0     0 1 0 my ($self, $other) = @_;
770 0 0 0     0 return unless ($other and ref($other));
771 0 0       0 return unless ($self->seq_id eq $other->seq_id);
772             return not (
773 0   0     0 $self->start > $other->end or
774             $self->end < $other->start
775             );
776             }
777              
778             sub contains {
779 0     0 1 0 my ($self, $other) = @_;
780 0 0 0     0 return unless ($other and ref($other));
781 0 0       0 return unless ($self->seq_id eq $other->seq_id);
782             return (
783 0   0     0 $other->start >= $self->start and
784             $other->end <= $self->end
785             );
786             }
787              
788             sub equals {
789 0     0 1 0 my ($self, $other) = @_;
790 0 0 0     0 return unless ($other and ref($other));
791 0 0       0 return unless ($self->seq_id eq $other->seq_id);
792             return (
793 0   0     0 $other->start == $self->start and
794             $other->end == $self->end
795             );
796             }
797              
798             sub intersection {
799 0     0 1 0 my ($self, $other) = @_;
800 0 0 0     0 return unless ($other and ref($other));
801 0 0       0 return unless ($self->seq_id eq $other->seq_id);
802 0         0 my ($start, $stop);
803 0 0       0 if ($self->start >= $other->start) {
804 0         0 $start = $self->start;
805             }
806             else {
807 0         0 $start = $other->start;
808             }
809 0 0       0 if ($self->end <= $other->end) {
810 0         0 $stop = $self->end;
811             }
812             else {
813 0         0 $stop = $other->end;
814             }
815 0 0       0 return if $start > $stop;
816 0         0 return $self->new(
817             -seq_id => $self->seq_id,
818             -start => $start,
819             -end => $stop,
820             );
821             }
822              
823             sub union {
824 0     0 1 0 my ($self, $other) = @_;
825 0 0 0     0 return unless ($other and ref($other));
826 0 0       0 return unless ($self->seq_id eq $other->seq_id);
827 0         0 my ($start, $stop);
828 0 0       0 if ($self->start <= $other->start) {
829 0         0 $start = $self->start;
830             }
831             else {
832 0         0 $start = $other->start;
833             }
834 0 0       0 if ($self->end >= $other->end) {
835 0         0 $stop = $self->end;
836             }
837             else {
838 0         0 $stop = $other->end;
839             }
840 0 0       0 return if $start > $stop;
841 0         0 return $self->new(
842             -seq_id => $self->seq_id,
843             -start => $start,
844             -end => $stop,
845             );
846             }
847              
848             sub subtract {
849 0     0 1 0 my ($self, $other) = @_;
850 0 0 0     0 return unless ($other and ref($other));
851 0 0       0 return unless ($self->seq_id eq $other->seq_id);
852 0 0       0 return if $self->equals($other);
853 0         0 my @pieces;
854 0 0       0 if ($self->overlaps($other)) {
855 0         0 my $int = $self->intersection($other);
856 0 0       0 if ($self->start < $int->start) {
857 0         0 push @pieces, $self->new(
858             -seq_id => $self->seq_id,
859             -start => $self->start,
860             -end => $int->start - 1,
861             );
862             }
863 0 0       0 if ($self->end > $int->end) {
864 0         0 push @pieces, $self->new(
865             -seq_id => $self->seq_id,
866             -start => $int->end + 1,
867             -end => $self->end,
868             );
869             }
870             }
871             else {
872 0         0 push @pieces, $self->new(
873             -seq_id => $self->seq_id,
874             -start => $self->start,
875             -end => $self->end,
876             );
877             }
878 0 0       0 return wantarray ? @pieces : \@pieces;
879             }
880              
881              
882             ### Export methods
883              
884             sub version {
885 2     2 0 4 my $self = shift;
886 2 50 33     12 if (@_ and $_[0] ne '3') {
887 0         0 carp "sorry, only GFF version 3 is currently supported!";
888             }
889 2         3 return 3;
890             }
891              
892             sub gff_string {
893             # exports version 3 GFF, sorry, no mechanism for others....
894 101     101 1 102 my $self = shift;
895 101   50     131 my $recurse = shift || 0;
896 101   100     129 my $childIDs = shift || undef;
897            
898             # skip myself if we've already printed myself
899             return if ($recurse and $childIDs and
900 101 100 66     229 not exists $childIDs->{ $self->primary_id } );
      100        
901            
902             # map all parent-child relationships
903 49 100 66     100 if ($recurse and not defined $childIDs) {
904             # we have a top level SeqFeature and we need to go spelunking for IDs
905 2         4 $childIDs = {};
906 2         5 foreach my $f ($self->get_SeqFeatures) {
907 14         19 $f->_spelunk($self->primary_id, $childIDs);
908             }
909             }
910            
911             # basics
912 49 50 50     64 my $string = join("\t", (
    0 50        
    50 50        
    100 50        
913             $self->seq_id || '.',
914             $self->source_tag || '.',
915             $self->primary_tag,
916             $self->start || 1,
917             $self->end || 1,
918             defined $self->score ? $self->score : '.',
919             $self->strand > 0 ? '+' : $self->strand < 0 ? '-' : '.',
920             defined $self->phase ? $self->phase : '.',
921             ) );
922            
923             # group attributes
924 49         68 my $attributes;
925 49         57 my $name = $self->display_name;
926 49 50       55 if ($name) {
927             # names are optional and may not exist
928 49         68 $attributes = sprintf "Name=%s;ID=%s", $self->_encode($name),
929             $self->_encode( $self->primary_id );
930             }
931             else {
932             # IDs always exist
933 0         0 $attributes = sprintf "ID=%s", $self->_encode( $self->primary_id );
934             }
935 49 50       82 if ($childIDs) {
936 49 100       66 if (exists $childIDs->{$self->primary_id}) {
937             $attributes .= ';Parent=' . join(',',
938 95         106 sort {$a cmp $b}
939 99         118 map { $self->_encode($_) }
940 47         48 keys %{ $childIDs->{$self->primary_id} });
  47         55  
941 47         77 delete $childIDs->{$self->primary_id};
942             }
943             }
944 49         81 foreach my $tag ($self->all_tags) {
945 41 50       53 next if $tag eq 'Name';
946 41 50       49 next if $tag eq 'ID';
947 41 100       74 next if $tag eq 'Parent';
948 27         37 my $value = $self->get_tag_values($tag);
949 27 50       39 if (ref($value) eq 'ARRAY') {
950 0         0 $value = join(",", map { $self->_encode($_) } @$value);
  0         0  
951             }
952             else {
953 27         31 $value = $self->_encode($value);
954             }
955 27         47 $attributes .= ";$tag=$value";
956             }
957 49         89 $string .= "\t$attributes\n";
958            
959             # recurse
960 49 50       57 if ($recurse) {
961 49         77 foreach my $f ($self->get_SeqFeatures) {
962 99         137 my $s = $f->gff_string(1, $childIDs);
963 99 100       179 $string .= $s if $s;
964             }
965             }
966 49         92 return $string;
967             }
968              
969             sub _encode {
970 224     224   249 my ($self, $value) = @_;
971 224         270 $value =~ s/([\t\n\r%&\=;, ])/sprintf("%%%X",ord($1))/ge;
  0         0  
972 224         374 return $value;
973             }
974              
975             sub _spelunk {
976 99     99   142 my ($self, $parentID, $childIDs) = @_;
977 99         105 $childIDs->{$self->primary_id}{$parentID} += 1;
978 99         111 foreach my $f ($self->get_SeqFeatures) {
979 85         101 $f->_spelunk($self->primary_id, $childIDs);
980             }
981             }
982              
983             sub bed_string {
984 0     0 1   my $self = shift;
985 0 0 0       my $string = join("\t", (
    0 0        
      0        
      0        
986             $self->seq_id || '.',
987             ($self->start || 1) - 1,
988             $self->end || 1,
989             $self->display_name || $self->primary_id,
990             defined $self->score ? $self->score : 0,
991             $self->strand >= 0 ? '+' : '-',
992             ) );
993 0           return "$string\n";
994             }
995              
996              
997             __END__