File Coverage

blib/lib/Bio/ToolBox/parser/gff.pm
Criterion Covered Total %
statement 278 488 56.9
branch 149 306 48.6
condition 17 81 20.9
subroutine 25 38 65.7
pod 22 25 88.0
total 491 938 52.3


line stmt bran cond sub pod time code
1             package Bio::ToolBox::parser::gff;
2              
3             our $VERSION = '1.66';
4              
5             =head1 NAME
6              
7             Bio::ToolBox::parser::gff - parse GFF3, GTF, and GFF files
8              
9             =head1 SYNOPSIS
10              
11             use Bio::ToolBox::parser::gff;
12             my $filename = 'file.gff3';
13            
14             my $parser = Bio::ToolBox::parser::gff->new(
15             file => $filename,
16             do_gene => 1,
17             do_exon => 1,
18             ) or die "unable to open gff file!\n";
19            
20             while (my $feature = $parser->next_top_feature() ) {
21             # each $feature is a SeqFeature object
22             my @children = $feature->get_SeqFeatures();
23             }
24              
25             =head1 DESCRIPTION
26              
27             This module parses a GFF file into SeqFeature objects. It natively
28             handles GFF3, GTF, and general GFF files.
29              
30             For both GFF3 and GTF files, fully nested gene models, typically
31             gene =E transcript =E (exon, CDS, etc), may be built using the appropriate
32             attribute tags. For GFF3 files, these include ID and Parent tags; for GTF
33             these include C and C tags.
34              
35             For GFF3 files, any feature without a C tag is assumed to be a
36             parent. Children features referencing a parent feature that has not been
37             loaded are considered orphans. Orphans are attempted to be re-associated
38             with missing parents after the file is completely parsed. Any orphans left
39             may be collected. Files with orphans are considered poorly formatted or
40             incomplete and should be fixed. Multiple parentage, for example exons
41             shared between different transcripts of the same gene, are fully supported.
42              
43             Embedded Fasta sequences are ignored, as are most comment and pragma lines.
44              
45             The SeqFeature objects that are returned are L
46             objects. Refer to that documentation for more information.
47              
48             =head1 METHODS
49              
50             =head2 Initialize and modify the parser.
51              
52             These are class methods to initialize the parser with an annotation file
53             and modify the parsing behavior. Most parameters can be set either upon
54             initialization or as class methods on the object. Unpredictable behavior
55             may occur if you implement these in the midst of parsing a file.
56              
57             Do not open subsequent files with the same object. Always create a new
58             object to parse a new file
59              
60             =over 4
61              
62             =item new
63              
64             my $parser = Bio::ToolBox::parser::gff->new($filename);
65             my $parser = Bio::ToolBox::parser::gff->new(
66             file => 'file.gtf.gz',
67             do_gene => 1,
68             do_utr => 1,
69             );
70              
71             Initialize a new gff parser object. Pass a single value (a GFF file name)
72             to open a file. Alternatively, pass an array of key value pairs to control
73             how the file is parsed. Options include the following.
74              
75             =over 4
76              
77             =item file
78              
79             Provide a GFF file name to be parsed. It should have a gff, gtf, or gff3 file
80             extension. The file may be gzip compressed.
81              
82             =item version
83              
84             Specify the version. Normally this is not needed, as version can be determined
85             either from the file extension (in the case of gtf and gff3) or from the
86             C<##gff-version> pragma at the top of the file. Acceptable values include 1, 2,
87             2.5 (gtf), or 3.
88              
89             =item class
90              
91             Pass the name of a L compliant class that will be used to
92             create the SeqFeature objects. The default is to use L.
93              
94             =item simplify
95              
96             Pass a boolean value to simplify the SeqFeature objects parsed from the GFF
97             file and ignore extraneous attributes.
98              
99             =item do_gene
100              
101             Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene
102             name under a single gene object. Default is true.
103              
104             =item do_cds
105              
106             =item do_exon
107              
108             =item do_utr
109              
110             =item do_codon
111              
112             Pass a boolean (1 or 0) value to parse certain subfeatures. Exon subfeatures
113             are always parsed, but C, C, C, C,
114             and C features may be optionally parsed. Default is false.
115              
116             =back
117              
118             =item open_file
119              
120             $parser->open_file($file) or die "unable to open $file!";
121              
122             Pass the name of a GFF file to be parsed. The file may optionally be gzipped
123             (F<.gz> extension). Do not open a new file when one has already opened a file.
124             Create a new object for a new file, or concatenate the GFF files.
125              
126             =item version
127              
128             Set or get the GFF version of the current file. Acceptable values include 1, 2,
129             2.5 (gtf), or 3. Normally this is determined by file extension or
130             C pragma on the first line, and should not need to be set by the
131             user in most circumstances.
132              
133             =item simplify
134              
135             Pass a boolean true value to simplify the attributes of GFF3 and GTF files
136             that may have considerable numbers of tags, e.g. Ensembl files. Only
137             essential information, including name, ID, and parentage, is retained.
138             Useful if you're trying to quickly parse annotation files for basic
139             information.
140              
141             =back
142              
143             =head2 Feature retrieval
144              
145             The following methods parse the GFF file lines into SeqFeature objects.
146             It is best if these methods are not mixed; unexpected results may occur.
147              
148             =over 4
149              
150             =item next_top_feature
151              
152             This method will return a top level parent SeqFeature object
153             assembled with child features as sub-features. For example, a gene
154             object with mRNA subfeatures, which in turn may have exon and/or CDS
155             subfeatures. Child features are assembled based on the existence of
156             proper Parent attributes in child features. If no Parent attributes are
157             included in the GFF file, then this will behave as L.
158              
159             Child features (those containing a C attribute)
160             are associated with the parent feature. A warning will be issued about lost
161             children (orphans). Shared subfeatures, for example exons common to
162             multiple transcripts, are associated properly with each parent. An opportunity
163             to rescue orphans is available using the L method.
164              
165             Note that subfeatures may not necessarily be in ascending genomic order
166             when associated with the feature, depending on their order in the GFF3
167             file and whether shared subfeatures are present or not. When calling
168             subfeatures in your program, you may want to sort the subfeatures. For
169             example
170              
171             my @subfeatures = map { $_->[0] }
172             sort { $a->[1] <=> $b->[1] }
173             map { [$_, $_->start] }
174             $parent->get_SeqFeatures;
175              
176             =item top_features
177              
178             This method will return an array of the top (parent) features defined in
179             the GFF file. This is similar to the L method except that
180             all features are returned at once.
181              
182             =item next_feature
183              
184             This method will return a SeqFeature object representation of
185             the next feature (line) in the file. Parent - child relationships are
186             NOT assembled; however, undefined parents in a GTF file may still be
187             generated, just not returned.
188              
189             This method is best used with simple GFF files with no hierarchies
190             present. This may be used in a while loop until the end of the file
191             is reached. Pragmas are ignored and comment lines and sequence are
192             automatically skipped.
193              
194             =back
195              
196             =head2 Other methods
197              
198             Additional methods for working with the parser object and the parsed
199             SeqFeature objects.
200              
201             =over 4
202              
203             =item fh
204              
205             This method returns the L object of the opened GFF file.
206              
207             =item parse_file
208              
209             Parses the entire file into memory. This is automatically called when
210             either L or L is called.
211              
212             =item find_gene
213              
214             my $gene = $parser->find_gene(
215             name => $display_name,
216             id => $primary_id,
217             ) or warn "gene $display_name can not be found!";
218              
219             Pass a gene name, or an array of key =E values (C, C,
220             C, C, and/or coordinate information), that can be used
221             to find a gene already loaded into memory. Only useful after
222             is called. Genes with a matching name are confirmed by a matching ID or
223             overlapping coordinates, if available. Otherwise the first match is returned.
224              
225             =item orphans
226              
227             my @orphans = $parser->orphans;
228             printf "we have %d orphans left over!", scalar @orpans;
229              
230             This method will return an array of orphan SeqFeature objects that indicated
231             they had a parent but said parent could not be found. Typically, this is an
232             indication of an incomplete or malformed GFF3 file. Nevertheless, it might
233             be a good idea to check this after retrieving all top features.
234              
235             =item comments
236              
237             This method will return an array of the comment or pragma lines that may have
238             been in the parsed file. These may or may not be useful.
239              
240             =item typelist
241              
242             Returns a comma-delimited string of the GFF primary tags (column 3)
243             observed in the first 1000 lines of an opened file. Useful for
244             checking what is in the GFF file. See
245             L.
246              
247             =item from_gff_string
248              
249             my $seqfeature = $parser->from_gff_string($string);
250              
251             This method will parse a single GFF, GTF, or GFF3 formatted string or line
252             of text and return a SeqFeature object.
253              
254             =item unescape
255              
256             This method will unescape special characters in a text string. Certain
257             characters, including ";" and "=", are reserved for GFF3 formatting and
258             are not allowed, thus requiring them to be escaped.
259              
260             =item seq_ids
261              
262             Returns an array or array reference of the names of the chromosomes or
263             reference sequences present in the file. These may be defined by GFF3
264             sequence-region pragmas or inferred from the features.
265              
266             =item seq_id_lengths
267              
268             my $seq2len = $parser->seq_id_lengths;
269             foreach (keys %$seq2len) {
270             printf "chromosome %s is %d bp long\n", $_, $seq2len->{$_};
271             }
272              
273             Returns a hash reference to the chromosomes or reference sequences and
274             their corresponding lengths. In this case, the length is either defined
275             by the C pragma or inferred by the greatest end position of
276             the top features.
277              
278             =back
279              
280             =head1 SEE ALSO
281              
282             L, L, L
283              
284             =cut
285              
286 1     1   107454 use strict;
  1         3  
  1         37  
287 1     1   7 use Carp qw(carp cluck croak);
  1         5  
  1         94  
288 1     1   788 use Bio::ToolBox::Data::file;
  1         3  
  1         6661  
289             my $SFCLASS = 'Bio::ToolBox::SeqFeature'; # alternative to Bio::SeqFeature::Lite
290             eval "require $SFCLASS" or croak $@;
291              
292             1;
293              
294             sub new {
295 11     11 1 15553 my $class = shift;
296 11         228 my $self = {
297             'fh' => undef,
298             'top_features' => [],
299             'orphans' => [],
300             'duplicate_ids' => {},
301             'loaded' => {},
302             'eof' => 0,
303             'do_gene' => 1,
304             'do_exon' => 0,
305             'do_cds' => 0,
306             'do_utr' => 0,
307             'do_codon' => 0,
308             'gff3' => 0,
309             'gtf' => 0,
310             'comments' => [],
311             'seq_ids' => {},
312             'simplify' => 0,
313             'typelist' => '',
314             'convertor_sub' => undef,
315             };
316 11         37 bless $self, $class;
317            
318             # check for options
319 11 100       41 if (@_) {
320 8 50       30 if (scalar @_ == 1) {
321 0 0       0 $self->open_file($_[0]) or croak "unable to open file!";
322             }
323             else {
324 8         42 my %options = @_;
325 8 100       33 if (exists $options{simplify}) {
326 4         18 $self->simplify( $options{simplify} );
327             }
328 8 100       29 if (exists $options{do_gene}) {
329 4         16 $self->do_gene($options{do_gene});
330             }
331 8 100       22 if (exists $options{do_exon}) {
332 2         39 $self->do_exon($options{do_exon});
333             }
334 8 100       24 if (exists $options{do_cds}) {
335 4         21 $self->do_cds($options{do_cds});
336             }
337 8 50       23 if (exists $options{do_utr}) {
338 0         0 $self->do_utr($options{do_utr});
339             }
340 8 50       23 if (exists $options{do_codon}) {
341 0         0 $self->do_codon($options{do_codon});
342             }
343 8 50       24 if (exists $options{version}) {
344 0         0 $self->version($options{version});
345             }
346 8 50 33     29 if (exists $options{file} or $options{table}) {
347 8   33     29 $options{file} ||= $options{table};
348 8 50       32 $self->open_file( $options{file} ) or
349             croak "unable to open file!";
350             }
351 8 50       38 if (exists $options{class}) {
352 8         22 my $class = $options{class};
353 8 50       571 if (eval "require $class; 1") {
354 8         36 $SFCLASS = $class;
355             }
356             else {
357 0         0 croak $@;
358             }
359             }
360             }
361             }
362            
363             # done
364 11         51 return $self;
365             }
366              
367             sub do_gene {
368 593     593 1 1851 my $self = shift;
369 593 100       1207 if (@_) {
370 7         15 $self->{'do_gene'} = shift;
371             }
372 593         1873 return $self->{'do_gene'};
373             }
374              
375             sub do_exon {
376 188     188 1 305 my $self = shift;
377 188 100       445 if (@_) {
378 6         13 $self->{'do_exon'} = shift;
379             }
380 188         409 return $self->{'do_exon'};
381             }
382              
383             sub do_cds {
384 272     272 1 443 my $self = shift;
385 272 100       584 if (@_) {
386 5         15 $self->{'do_cds'} = shift;
387             }
388 272         601 return $self->{'do_cds'};
389             }
390              
391             sub do_utr {
392 0     0 1 0 my $self = shift;
393 0 0       0 if (@_) {
394 0         0 $self->{'do_utr'} = shift;
395             }
396 0         0 return $self->{'do_utr'};
397             }
398              
399             sub do_codon {
400 34     34 1 66 my $self = shift;
401 34 50       79 if (@_) {
402 0         0 $self->{'do_codon'} = shift;
403             }
404 34         81 return $self->{'do_codon'};
405             }
406              
407             sub do_name {
408             # this does nothing other than maintain compatibility with ucsc parser
409 0     0 0 0 return 0;
410             }
411              
412             sub share {
413             # this does nothing other than maintain compatibility with ucsc parser
414 0     0 0 0 return 1;
415             }
416              
417             sub simplify {
418 85     85 1 141 my $self = shift;
419 85 100       226 if (defined $_[0]) {
420 7         25 $self->{simplify} = shift;
421             }
422 85         1197 return $self->{simplify};
423             }
424              
425             sub version {
426 24     24 1 1675 my $self = shift;
427 24 100       58 if (@_) {
428 11         33 my $v = shift;
429 11 100 33     53 if ($v eq '3') {
    50 0        
    0          
430 7         22 $self->{version} = $v;
431 7         25 $self->{gff3} = 1;
432             }
433             elsif ($v eq '2.5' or $v eq '2.2') {
434 4         13 $self->{version} = '2.5';
435 4         10 $self->{gtf} = 1;
436             }
437             elsif ($v eq '2' or $v eq '1') {
438 0         0 $self->{version} = $v;
439             }
440             else {
441 0         0 warn "unrecognized GFF version '$v'!\n";
442             }
443             }
444 24         383 return $self->{version};
445             }
446              
447             sub open_file {
448 11     11 1 22 my $self = shift;
449            
450             # check file
451 11         22 my $filename = shift;
452 11 50       32 unless ($filename) {
453 0         0 cluck("no file name passed!\n");
454 0         0 return;
455             }
456            
457             # check type list
458 11         77 my $typelist = Bio::ToolBox::Data::file->sample_gff_type_list($filename);
459 11 50       77 if ($typelist !~ /\w+/) {
460 0         0 print "GFF file has no evident types!? $filename may not be a valid GFF file\n";
461 0         0 return;
462             }
463 11         37 $self->{typelist} = $typelist;
464            
465             # Open filehandle object
466 11 50       88 my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or
467             croak " cannot open file '$filename'!\n";
468            
469             # check gff version pragma
470 11         272 my $first = $fh->getline;
471 11 50       483 if ($first =~ /^##gff.version\s+([\d\.]+)\s*$/i) {
472             # override any version that may have been inferred from the extension
473             # based on the assumption that this pragma is correct
474 11         56 $self->version($1);
475             }
476             else {
477             # no pragma, reopen the file
478 0         0 $fh->close;
479 0         0 $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename);
480             # set version based on file type extension????
481 0 0       0 if ($filename =~ /\.gtf.*$/i) {
    0          
482 0         0 $self->version('2.5');
483 0         0 $self->{gtf} = 1;
484             }
485             elsif ($filename =~ /\.gff3.*$/i) {
486 0         0 $self->version('3');
487 0         0 $self->{gff3} = 1;
488             }
489             }
490 11         30 $self->{fh} = $fh;
491 11         37 return 1;
492             }
493              
494             sub fh {
495 668     668 1 2588 return shift->{fh};
496             }
497              
498             sub typelist {
499 13     13 1 81 return shift->{typelist};
500             }
501              
502             sub next_feature {
503 587     587 1 4234 my $self = shift;
504            
505             # check that we have an open filehandle
506 587 50       1185 unless ($self->fh) {
507 0         0 croak("no GFF file loaded to parse!");
508             }
509 587 50       1192 return if $self->{'eof'};
510            
511             # check convertor
512 587 100       1178 unless (defined $self->{convertor_sub}) {
513 11         44 $self->_assign_gff_converter;
514             }
515            
516             # look for the next feature line
517 587         12953 while (my $line = $self->{fh}->getline) {
518            
519             # check first character
520 770         18655 my $firstchar = substr($line, 0, 1);
521            
522             # skip any comment and pragma lines that we might encounter
523 770 100       1977 if ($firstchar eq '#') {
    50          
524 25 50       102 if ($line =~ /^###$/) {
    50          
525             # a close pragma, all we can do is check for orphans
526             # a properly written shouldn't have any orphans, but just in case
527 0         0 $self->check_orphanage;
528 0         0 next;
529             }
530             elsif ($line =~ /^##sequence.region/i) {
531             # sequence region pragma
532 0         0 my ($pragma, $seq_id, $start, $stop) = split /\s+/, $line;
533 0 0 0     0 if (defined $seq_id and $start =~ /^\d+$/ and $stop =~ /^\d+$/) {
      0        
534             # we're actually only concerned with the stop coordinate
535 0         0 $self->{seq_ids}{$seq_id} = $stop;
536             }
537             else {
538 0         0 warn "malformed sequence-region pragma! $line\n";
539             }
540 0         0 next;
541             }
542             else {
543             # must be some sort of pragma or a comment line, may be useful, keep it
544 25         43 push @{$self->{comments}}, $line;
  25         58  
545 25         415 next;
546             }
547             }
548             elsif ($firstchar eq '>') {
549             # fasta header line
550             # this is almost always at the end of the file, and rarely is sequence put
551             # into GFF files anyway, so let's assume it's the end of the file
552 0         0 $self->{'eof'} = 1;
553 0         0 return;
554             }
555            
556             # line must be a GFF feature
557 745         1195 chomp $line;
558 745         4272 my @fields = split('\t', $line);
559 745 50       1784 next unless scalar(@fields) == 9;
560            
561             # check the primary_tag and generate the SeqFeature object for known types
562 745         1525 my $type = lc $fields[2];
563 745 100       2718 if ($type eq 'cds') {
    100          
    50          
    100          
    100          
    100          
    100          
564 265 100       663 if ($self->do_cds) {
565 199         350 return &{$self->{convertor_sub}}($self, \@fields);
  199         436  
566             } else {
567 66         1264 next;
568             }
569             }
570             elsif ($type eq 'exon') {
571 178 50       406 if ($self->do_exon) {
572 178         349 return &{$self->{convertor_sub}}($self, \@fields);
  178         379  
573             } else {
574 0         0 next;
575             }
576             }
577             elsif ($type =~ /utr|untranslated/) {
578 0 0       0 if ($self->do_utr) {
579 0         0 return &{$self->{convertor_sub}}($self, \@fields);
  0         0  
580             } else {
581 0         0 next;
582             }
583             }
584             elsif ($type =~ /codon/) {
585 32 50       86 if ($self->do_codon) {
586 0         0 return &{$self->{convertor_sub}}($self, \@fields);
  0         0  
587             } else {
588 32         614 next;
589             }
590             }
591             elsif ($type =~ /gene$/) {
592 165 50       385 if ($self->do_gene) {
593 165         296 return &{$self->{convertor_sub}}($self, \@fields);
  165         331  
594             } else {
595 0         0 next;
596             }
597             }
598             elsif ($type =~ /transcript|rna/) {
599 36         69 return &{$self->{convertor_sub}}($self, \@fields);
  36         96  
600             }
601             elsif ($type =~ /chromosome|contig|scaffold/) {
602             # gff3 files can record the chromosome as a gff record
603             # process this as a region
604 7         34 $self->{seq_ids}{$fields[0]} = $fields[4];
605 7         128 next;
606             }
607             else {
608             # everything else must be some non-standard weird element
609             # only process this if the user wants everything
610 62 100       161 return &{$self->{convertor_sub}}($self, \@fields) unless $self->simplify;
  2         11  
611             }
612             }
613            
614             # presumably reached the end of the file
615 7         263 $self->{'eof'} = 1;
616 7         33 return;
617             }
618              
619             sub next_top_feature {
620 70     70 1 111 my $self = shift;
621             # check that we have an open filehandle
622 70 50       199 unless ($self->fh) {
623 0         0 croak("no GFF3 file loaded to parse!");
624             }
625 70 50       167 unless ($self->{'eof'}) {
626 0 0       0 $self->parse_file or croak "unable to parse file!";
627             }
628 70         101 return shift @{ $self->{top_features} };
  70         199  
629             }
630              
631             sub top_features {
632 4     4 1 6378 my $self = shift;
633 4 50       20 unless ($self->{'eof'}) {
634 0         0 $self->parse_file;
635             }
636 4         11 my @features = @{ $self->{top_features} };
  4         21  
637 4 50       26 return wantarray ? @features : \@features;
638             }
639              
640             *parse_table = \&parse_file;
641              
642             sub parse_file {
643 7     7 1 50 my $self = shift;
644             # check that we have an open filehandle
645 7 50       28 unless ($self->fh) {
646 0         0 confess("no file loaded to parse!");
647             }
648 7 50       30 return 1 if $self->{'eof'};
649            
650             # Each line will be processed into a SeqFeature object, and then checked
651             # for parentage. Child objects with a Parent tag will be appropriately
652             # associated with its parent, or put into an orphanage. Any orphans
653             # left will be checked a final time for parents; if a parent can't be
654             # found, it will be lost. Features without a parent are assumed to be
655             # top-level features.
656            
657 7 50       25 printf " Parsing %s format file....\n",
    100          
658             $self->version eq '3' ? 'GFF3' :
659             $self->version =~ /2\../ ? 'GTF' : 'GFF';
660            
661            
662             TOP_FEATURE_LOOP:
663 7         91 while (my $feature = $self->next_feature) {
664            
665             ### Process the feature
666             # check the ID
667 576         1501 my $id = $feature->primary_id;
668             # if the seqfeature didn't have an ID specified from the file, then
669             # Bio::ToolBox::SeqFeature will autogenerate one, but Bio::SeqFeature::Lite
670             # will not - so we will likely lose that feature
671 576 100       2344 if ($id) {
672             # remember this feature since we have an ID
673 404 50       866 if (exists $self->{loaded}{$id}) {
674             # this ID should be unique in the GFF file
675             # otherwise it might be a shared duplicate or a malformed GFF file
676             # generally only a concern for top level features
677 0 0       0 if ($feature->primary_tag =~ /gene|rna|transcript/) {
678 0         0 $self->{duplicate_ids}{$id} += 1;
679             }
680            
681             # store all of the features as an array
682 0 0       0 if (ref($self->{loaded}{$id}) eq 'ARRAY') {
683             # there's more than two duplicates! pile it on!
684 0         0 push @{ $self->{loaded}{$id} }, $feature;
  0         0  
685             }
686             else {
687 0         0 my $existing = $self->{loaded}{$id};
688 0         0 $self->{loaded}{$id} = [$existing, $feature];
689             }
690             }
691             else {
692             # unique ID, so remember it
693 404         1166 $self->{loaded}{$id} = $feature;
694             }
695             }
696             # if the feature didn't have an ID, we'll just assume it is
697             # a child of another feature, otherwise it may get lost
698            
699             # look for parents and children
700 576 100       1353 if ($feature->has_tag('Parent')) {
701             # must be a child
702             # there may be more than one parent, per the GFF3 specification
703 411         1771 foreach my $parent_id ( $feature->get_tag_values('Parent') ) {
704 411 50       2536 if (exists $self->{loaded}{$parent_id}) {
705             # we've seen this id
706             # associate the child with the parent
707 411         674 my $parent = $self->{loaded}{$parent_id};
708 411         1109 $parent->add_SeqFeature($feature);
709            
710             # check boundaries for gtf genes
711             # gtf genes may not be explicitly defined so must correct as necessary
712             # gff3 files shouldn't have this issue
713 411 100       18701 if ($self->{gtf}) {
714 312 50       638 if ($feature->start < $parent->start) {
715 0         0 $parent->start( $feature->start );
716             }
717 312 100       2612 if ($feature->end > $parent->end) {
718 1         5 $parent->end( $feature->end );
719             }
720             # check parent's parent too
721 312 100       2560 if ($parent->has_tag('Parent')) {
722             # in all likelihood parent is a transcript and there is a
723             # gene that probably also needs fixin'
724 278         1215 my ($grandparent_id) = $parent->get_tag_values('Parent');
725 278         1756 my $grandparent = $self->{loaded}{$grandparent_id};
726 278 50       534 if ($feature->start < $grandparent->start) {
727 0         0 $grandparent->start( $feature->start );
728             }
729 278 50       2297 if ($feature->end > $grandparent->end) {
730 0         0 $grandparent->end( $feature->end );
731             }
732             }
733             }
734             }
735             else {
736             # can't find the parent, maybe not loaded yet?
737             # put 'em in the orphanage
738 0         0 $self->_add_orphan($feature);
739             }
740             }
741             }
742             else {
743             # must be a parent
744 165         416 push @{ $self->{top_features} }, $feature;
  165         553  
745             }
746             }
747             # Finished loading the GFF lines
748            
749             # check for orphans
750 7 50       14 if (scalar @{ $self->{orphans} }) {
  7         34  
751 0         0 $self->check_orphanage;
752             # report
753 0 0       0 if (scalar @{ $self->{orphans} }) {
  0         0  
754             printf " GFF errors: %d features could not be associated with reported parents!\n",
755 0         0 scalar @{ $self->{orphans} };
  0         0  
756             }
757             }
758            
759             # report on duplicate IDs
760 7 50       19 if (keys %{ $self->{duplicate_ids} }) {
  7         46  
761             printf " GFF errors: %d IDs were duplicated\n",
762 0         0 scalar(keys %{ $self->{duplicate_ids} });
  0         0  
763             }
764            
765 7         55 return 1;
766             }
767              
768              
769             sub _make_gene_parent {
770             # for generating GTF gene parent features
771 12     12   29 my ($self, $fields, $gene_id) = @_;
772 12         92 my $gene = $SFCLASS->new(
773             -seq_id => $fields->[0],
774             -primary_tag => 'gene',
775             -start => $fields->[3],
776             -end => $fields->[4],
777             -strand => $fields->[6],
778             -primary_id => $gene_id,
779             );
780            
781 12 50       437 if ($fields->[8] =~ /gene_name "([^"]+)";?/) {
782 12         38 $gene->display_name($1);
783             }
784             else {
785 0         0 $gene->display_name($gene_id);
786             }
787            
788             # add extra information if possible
789 12 100       82 unless ($self->simplify) {
790 2 50       18 if ($fields->[8] =~ /gene_biotype "([^"]+)";?/) {
    50          
791 0         0 $gene->add_tag_value('gene_biotype', $1);
792             }
793             elsif ($fields->[8] =~ /gene_type "([^"]+)";?/) {
794 0         0 $gene->add_tag_value('gene_type', $1);
795             }
796 2 50       10 if ($fields->[8] =~ /gene_source "([^"]+)";?/) {
797 0         0 $gene->source($1);
798             }
799             }
800 12         32 return $gene;
801             }
802              
803              
804             sub _make_rna_parent {
805             # for generating GTF gene parent features
806 0     0   0 my ($self, $fields, $transcript_id) = @_;
807 0         0 my $rna = $SFCLASS->new(
808             -seq_id => $fields->[0],
809             -primary_tag => 'transcript',
810             -start => $fields->[3],
811             -end => $fields->[4],
812             -strand => $fields->[6],
813             -primary_id => $transcript_id,
814             );
815            
816 0 0       0 if ($fields->[8] =~ /transcript_name "([^"]+)";?/) {
817 0         0 $rna->display_name($1);
818             }
819             else {
820 0         0 $rna->display_name($transcript_id);
821             }
822            
823             # add extra information if possible
824 0 0       0 unless ($self->simplify) {
825 0 0       0 if ($fields->[8] =~ /transcript_biotype "([^"]+)";?/) {
    0          
826 0         0 $rna->add_tag_value('transcript_biotype', $1);
827             }
828             elsif ($fields->[8] =~ /transcript_type "([^"]+)";?/) {
829 0         0 $rna->add_tag_value('transcript_type', $1);
830             }
831 0 0       0 if ($fields->[8] =~ /transcript_source "([^"]+)";?/) {
832 0         0 $rna->source($1);
833             }
834             }
835 0         0 return $rna;
836             }
837              
838              
839             sub find_gene {
840 37     37 1 15664 my $self = shift;
841            
842             # check that we have gene2seqf table
843 37 100       92 unless (exists $self->{gene2seqf}) {
844 5 50       21 croak "must parse file first!" unless $self->{'eof'};
845 5         23 $self->{gene2seqf} = {};
846 5         11 foreach (@{ $self->{top_features} }) {
  5         22  
847 107         209 my $name = lc $_->display_name;
848 107 50       422 if (exists $self->{gene2seqf}->{$name}) {
849 0         0 push @{ $self->{gene2seqf}->{$name} }, $_;
  0         0  
850             }
851             else {
852 107         318 $self->{gene2seqf}->{$name} = [$_];
853             }
854             }
855             }
856            
857             # get the name and coordinates from arguments
858 37         65 my ($name, $id, $chrom, $start, $end, $strand);
859 37 50       110 if (scalar @_ == 0) {
    100          
860 0         0 carp "must provide information to find_gene method!";
861 0         0 return;
862             }
863             elsif (scalar @_ == 1) {
864 4         14 $name = $_[0];
865             }
866             else {
867 33         73 my %opt = @_;
868 33   0     66 $name = $opt{name} || $opt{display_name} || undef;
869 33   0     63 $id = $opt{id} || $opt{primary_id} || undef;
870 33   50     114 $chrom = $opt{chrom} || $opt{seq_id} || undef;
871 33   50     94 $start = $opt{start} || undef;
872 33   50     106 $end = $opt{stop} || $opt{end} || undef;
873 33   50     114 $strand = $opt{strand} || 0;
874             }
875 37 50       78 unless ($name) {
876 0         0 carp "name is required for find_gene!";
877 0         0 return;
878             }
879            
880             # check if a gene with this name exists
881 37 50       140 if (exists $self->{gene2seqf}->{lc $name} ) {
882             # we found a matching gene
883            
884             # pull out the gene seqfeature(s) array reference
885             # there may be more than one gene
886 37         1229 my $genes = $self->{gene2seqf}->{ lc $name };
887            
888             # go through a series of checks to find the appropriate
889 37 100       76 if ($id) {
890 33         56 foreach my $g (@$genes) {
891 33 50       80 if ($g->primary_id eq $id) {
892 33         100 return $g;
893             }
894             }
895 0         0 return; # none of these matched despite having an ID
896             }
897 4 0 33     14 if ($chrom and $start and $end) {
      33        
898 0         0 foreach my $g (@$genes) {
899 0 0 0     0 if (
      0        
900             # overlap method borrowed from Bio::RangeI
901             ($g->strand == $strand) and not (
902             $g->start > $end or
903             $g->end < $start
904             )
905             ) {
906             # gene and transcript overlap on the same strand
907             # we found the intersecting gene
908 0         0 return $g;
909             }
910             }
911 0         0 return; # none of these matched despite having coordinate info
912             }
913 4 50       16 if (scalar @$genes == 1) {
    0          
914             # going on trust here that this is the one
915 4         16 return $genes->[0];
916             }
917             elsif (scalar @$genes > 1) {
918 0         0 carp "more than one gene named $name found!";
919 0         0 return $genes->[0];
920             }
921            
922             # nothing suitable found
923 0         0 return;
924             }
925             }
926              
927              
928             sub from_gff_string {
929 0     0 1 0 my ($self, $string) = @_;
930            
931             # check string
932 0         0 chomp $string;
933 0 0       0 unless ($string) {
934 0         0 cluck("must pass a string!\n");
935 0         0 return;
936             }
937 0         0 my @fields = split('\t', $string);
938            
939             # check convertor
940 0 0       0 unless (defined $self->{convertor_sub}) {
941 0         0 $self->_assign_gff_converter;
942             }
943            
944             # parse appropriately
945 0         0 return &{$self->{convertor_sub}}($self, \@fields);
  0         0  
946             }
947              
948              
949             sub _assign_gff_converter {
950 11     11   26 my $self = shift;
951 11 100       39 if ($self->{gff3}) {
    50          
952 7         26 $self->{convertor_sub} = \&_gff3_to_seqf;
953             }
954             elsif ($self->{gtf}) {
955 4 100       14 if ($self->simplify) {
956 2         9 $self->{convertor_sub} = \&_gtf_to_seqf_simple;
957             }
958             else {
959 2         7 $self->{convertor_sub} = \&_gtf_to_seqf_full;
960             }
961             # double check we have transcript information
962 4 50       12 if ($self->typelist !~ /transcript|rna/i) {
963             # we will have to rely on exon and/or cds information to get transcript
964 0 0 0     0 unless ($self->do_exon or $self->do_cds) {
965 0         0 $self->do_exon(1);
966             }
967             }
968 4 50 33     13 if ($self->do_gene and $self->typelist !~ /gene/i) {
969 4         9 $self->do_exon(1);
970             }
971             }
972             else {
973 0         0 $self->{convertor_sub} = \&_gff2_to_seqf;
974             }
975             }
976              
977              
978             sub _gff3_to_seqf {
979 266     266   496 my ($self, $fields) = @_;
980 266         457 my $group = $fields->[8];
981 266         589 my $feature = $self->_gff_to_seqf($fields);
982            
983             # process groups
984 266         2246 foreach my $g (split(/\s*;\s*/, $group)) {
985 1382         4656 my ($tag, $value) = split /=/, $g;
986 1382         2672 $tag = $self->unescape($tag);
987 1382         2972 my @values = map { $self->unescape($_) } split(/,/, $value);
  1470         2381  
988            
989             # determine the appropriate attribute based on tag name
990 1382 100       5058 if ($tag eq 'Name') {
    100          
    100          
    50          
    100          
991 266         679 $feature->display_name($values[0]);
992             }
993             elsif ($tag eq 'ID') {
994 167         481 $feature->primary_id($values[0]);
995             }
996             elsif ($tag eq 'Parent') {
997             # always record Parent except for transcripts when genes are not wanted
998 99 50 33     222 unless (not $self->do_gene and $feature->primary_tag =~ /rna|transcript/i) {
999 99         208 foreach (@values) {
1000 99         239 $feature->add_tag_value($tag, $_);
1001             }
1002             }
1003             }
1004             elsif (lc $tag eq 'exon_id') {
1005             # ensembl GFF3 store the exon id but doesn't record it as the ID, why?
1006 0         0 $feature->primary_id($values[0]);
1007             }
1008             elsif (not $self->{simplify}) {
1009 2         7 foreach (@values) {
1010 2         22 $feature->add_tag_value($tag, $_);
1011             }
1012             }
1013             }
1014            
1015 266         1404 return $feature;
1016             }
1017              
1018              
1019             sub _gtf_to_seqf_simple {
1020              
1021 312     312   564 my ($self, $fields) = @_;
1022 312         557 my $group = $fields->[8];
1023 312         653 my $feature = $self->_gff_to_seqf($fields);
1024            
1025             # extract essential tags
1026 312         502 my ($gene_id, $transcript_id);
1027 312 50       1629 if ($group =~ /gene_id "([^"]+)";?/i) {
1028 312         764 $gene_id = $1;
1029             }
1030 312 50       1151 if ($group =~ /transcript_id "([^"]+)";?/i) {
1031 312         626 $transcript_id = $1;
1032             }
1033 312 50 33     677 unless ($gene_id or $transcript_id) {
1034             # improperly formatted GTF file without one of these two items, nothing more to do
1035 0         0 return $feature;
1036             }
1037            
1038             # common subfeatures including exon, CDS, UTR, and codons
1039 312 100       1456 if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) {
    50          
    0          
1040 278         954 $feature->add_tag_value('Parent', $transcript_id);
1041            
1042             # exon id if present
1043 278 50 66     2106 if ($fields->[2] eq 'exon' and $group =~ /exon_id "([^"]+)";?/i) {
1044 0         0 $feature->primary_id($1);
1045             }
1046            
1047             # check gene parent
1048 278 50 33     604 if ($self->do_gene and not exists $self->{loaded}{$gene_id}) {
1049 0         0 my $gene = $self->_make_gene_parent($fields, $gene_id);
1050 0         0 $self->{loaded}{$gene_id} = $gene;
1051 0         0 push @{ $self->{top_features} }, $gene;
  0         0  
1052             }
1053            
1054             # check transcript parent
1055 278 50       670 unless (exists $self->{loaded}{$transcript_id}) {
1056 0         0 my $rna = $self->_make_rna_parent($fields, $transcript_id);
1057 0         0 $self->{loaded}{$transcript_id} = $rna;
1058 0 0       0 if ($self->do_gene) {
1059 0         0 $rna->add_tag_value('Parent', $gene_id);
1060 0         0 $self->{loaded}{$gene_id}->add_SeqFeature($rna);
1061             }
1062             else {
1063 0         0 push @{ $self->{top_features} }, $rna;
  0         0  
1064             }
1065             }
1066             }
1067            
1068             # a transcript feature
1069             elsif ($fields->[2] =~ /rna|transcript/i) {
1070             # these are sometimes present in GTF files, such as from Ensembl
1071             # but are not required and often absent
1072 34         121 $feature->primary_id($transcript_id);
1073            
1074             # transcript information
1075 34 50       271 if ($group =~ /transcript_name "([^"]+)";?/i) {
1076 34         102 $feature->display_name($1);
1077             }
1078            
1079             # check whether parent was loaded and add gene information if not
1080 34 50       223 if ($self->do_gene) {
1081 34         96 $feature->add_tag_value('Parent', $gene_id);
1082 34 100       240 if (not exists $self->{loaded}{$gene_id}) {
1083 10         45 my $gene = $self->_make_gene_parent($fields, $gene_id);
1084 10         31 $self->{loaded}{$gene_id} = $gene;
1085 10         19 push @{ $self->{top_features} }, $gene;
  10         31  
1086             }
1087             }
1088             }
1089            
1090             # a gene feature
1091             elsif ($fields->[2] eq 'gene') {
1092             # these are sometimes present in GTF files, such as from Ensembl
1093             # but are not required and often absent
1094 0         0 $feature->primary_id($gene_id);
1095 0 0       0 if ($group =~ /gene_name "([^"]+)";?/i) {
1096 0         0 $feature->display_name($1);
1097             }
1098             }
1099            
1100             # anything else, like CNS (conserved noncoding sequence) doesn't get any
1101             # further special attributes, as far as I can tell
1102 312         1473 return $feature;
1103             }
1104              
1105              
1106             sub _gtf_to_seqf_full {
1107            
1108 2     2   9 my ($self, $fields) = @_;
1109 2         6 my $group = $fields->[8];
1110 2         8 my $feature = $self->_gff_to_seqf($fields);
1111            
1112             # process the group tags
1113 2         4 my %attributes;
1114 2         12 foreach my $g (split('; ', $group)) { # supposed to be "; " as delimiter
1115 10         28 my ($tag, $value, @bits) = split ' ', $g;
1116 10         37 $value =~ s/[";]//g; # remove the flanking double quotes, assume no internal quotes
1117 10         27 $attributes{$tag} = $value;
1118             }
1119            
1120             # assign special tags based on the feature type
1121 2 50       23 if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) {
    50          
    0          
1122 0         0 $feature->add_tag_value('Parent', $attributes{'transcript_id'});
1123            
1124             # exon id if present
1125 0 0 0     0 if ($fields->[2] eq 'exon' and exists $attributes{'exon_id'}) {
1126 0         0 $feature->primary_id($attributes{'exon_id'});
1127             }
1128            
1129             # check gene parent
1130 0 0       0 if ($self->do_gene) {
1131 0   0     0 my $gene_id = $attributes{gene_id} || undef;
1132 0 0 0     0 if ($gene_id and not exists $self->{loaded}{$gene_id}) {
1133 0         0 my $gene = $self->_make_gene_parent($fields, $gene_id);
1134 0         0 $self->{loaded}{$gene_id} = $gene;
1135 0         0 push @{ $self->{top_features} }, $gene;
  0         0  
1136             }
1137             }
1138            
1139             # check transcript parent
1140 0   0     0 my $transcript_id = $attributes{transcript_id} || undef;
1141 0 0 0     0 if ($transcript_id and not exists $self->{loaded}{$transcript_id}) {
1142 0         0 my $rna = $self->_make_rna_parent($fields, $transcript_id);
1143 0         0 $self->{loaded}{$transcript_id} = $rna;
1144 0 0       0 if ($self->do_gene) {
1145 0         0 $rna->add_tag_value('Parent', $attributes{gene_id});
1146 0         0 $self->{loaded}{ $attributes{gene_id} }->add_SeqFeature($rna);
1147             }
1148             else {
1149 0         0 push @{ $self->{top_features} }, $rna;
  0         0  
1150             }
1151             }
1152             }
1153            
1154             # transcript
1155             elsif ($fields->[2] =~ /transcript|rna/i) {
1156             # these are sometimes present in GTF files, such as from Ensembl
1157            
1158             # transcript information
1159 2         11 $feature->primary_id($attributes{transcript_id});
1160 2         10 delete $attributes{transcript_id};
1161 2 50       6 if (exists $attributes{transcript_name}) {
1162 2         9 $feature->display_name($attributes{transcript_name});
1163 2         11 delete $attributes{transcript_name};
1164             }
1165            
1166             # check gene parent
1167 2 50       8 if ($self->do_gene) {
1168 2   50     7 my $gene_id = $attributes{gene_id} || undef;
1169 2 50 33     12 if ($gene_id and not exists $self->{loaded}{$gene_id}) {
1170 2         13 my $gene = $self->_make_gene_parent($fields, $gene_id);
1171 2         6 $self->{loaded}{$gene_id} = $gene;
1172 2         3 push @{ $self->{top_features} }, $gene;
  2         7  
1173             }
1174 2         10 $feature->add_tag_value('Parent', $gene_id);
1175             }
1176             }
1177            
1178             # gene
1179             elsif ($fields->[2] eq 'gene') {
1180             # these are sometimes present in GTF files, such as from Ensembl
1181             # but are not required and often absent
1182 0         0 $feature->primary_id($attributes{gene_id});
1183 0         0 delete $attributes{gene_id};
1184 0 0       0 if (exists $attributes{gene_name}) {
1185 0         0 $feature->display_name($attributes{gene_name});
1186 0         0 delete $attributes{gene_name};
1187             }
1188             }
1189            
1190             # store remaining attributes, if any
1191 2         18 foreach my $key (keys %attributes) {
1192 6         27 $feature->add_tag_value($key, $attributes{$key});
1193             }
1194 2         20 return $feature;
1195             }
1196              
1197              
1198             sub _gff2_to_seqf {
1199             # generic gff1 or gff2 format or poorly defined gff3 or gtf file
1200             # hope for the best!
1201 0     0   0 my ($self, $fields) = @_;
1202 0         0 my $feature = $self->_gff_to_seqf($fields);
1203            
1204             # process groups
1205             # we have no uniform method of combining features, so we'll leave the tags
1206             # as is and hope for the best
1207 0         0 foreach my $g (split(/\s*;\s*/, $fields->[8])) {
1208 0         0 my ($tag, $value) = split /\s+/, $g;
1209 0 0 0     0 next unless ($tag and $value);
1210 0         0 $feature->add_tag_value($tag, $value);
1211             }
1212 0         0 return $feature;
1213             }
1214              
1215              
1216             sub _gff_to_seqf {
1217 580     580   923 my ($self, $fields) = @_;
1218            
1219             # generate the basic SeqFeature
1220 580         2773 my $feature = $SFCLASS->new(
1221             -seq_id => $fields->[0],
1222             -source => $fields->[1],
1223             -primary_tag => $fields->[2],
1224             -start => $fields->[3],
1225             -end => $fields->[4],
1226             -strand => $fields->[6],
1227             );
1228            
1229             # add more attributes if they're not null
1230 580 50       14045 if ($fields->[5] ne '.') {
1231 0         0 $feature->score($fields->[5]);
1232             }
1233 580 100       1242 if ($fields->[7] ne '.') {
1234 199         526 $feature->phase($fields->[7]);
1235             }
1236            
1237             # finished
1238 580         1471 return $feature;
1239             }
1240              
1241              
1242             sub unescape {
1243             # Borrowed unashamedly from bioperl Bio::Tools::GFF
1244             # which in turn was borrowed from Bio::DB::GFF
1245 2852     2852 1 3773 my $self = shift;
1246 2852         3773 my $v = shift;
1247 2852         4301 $v =~ tr/+/ /;
1248 2852         5233 $v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge;
  5574         15214  
1249 2852         5996 return $v;
1250             }
1251              
1252              
1253             sub _add_orphan {
1254 0     0     my ($self, $feature) = @_;
1255 0           push @{ $self->{orphans} }, $feature;
  0            
1256 0           return 1;
1257             }
1258              
1259              
1260             sub check_orphanage {
1261 0     0 0   my $self = shift;
1262 0 0         return unless scalar @{ $self->{orphans} };
  0            
1263            
1264             # go through the list of orphans
1265 0           my @reunited; # list of indices to delete after reuniting orphan with parent
1266 0           for (my $i = 0; $i < scalar @{ $self->{orphans} }; $i++) {
  0            
1267 0           my $orphan = $self->{orphans}->[$i];
1268 0           my $success = 0;
1269            
1270             # find the parent
1271 0           foreach my $parent ($orphan->get_tag_values('Parent') ) {
1272 0 0         if (exists $self->{loaded}{$parent}) {
1273             # we have loaded the parent
1274             # associate each orphan feature with the parent
1275 0           $self->{loaded}{$parent}->add_SeqFeature($orphan);
1276 0           $success++;
1277             }
1278             }
1279             # delete the orphan from the array if it found it's long lost parent
1280 0 0         push @reunited, $i if $success;
1281             }
1282            
1283             # clean up the orphanage
1284 0           while (@reunited) {
1285 0           my $i = pop @reunited;
1286 0           splice(@{ $self->{orphans} }, $i, 1);
  0            
1287             }
1288             }
1289              
1290             sub orphans {
1291 0     0 1   my $self = shift;
1292 0           my @orphans;
1293 0           foreach (@{ $self->{orphans} }) {
  0            
1294 0           push @orphans, $_;
1295             }
1296 0 0         return wantarray ? @orphans : \@orphans;
1297             }
1298              
1299              
1300             sub comments {
1301 0     0 1   my $self = shift;
1302 0           my @comments;
1303 0           foreach (@{ $self->{comments} }) {
  0            
1304 0           push @comments, $_;
1305             }
1306 0 0         return wantarray ? @comments : \@comments;
1307             }
1308              
1309              
1310             sub seq_ids {
1311 0     0 1   my $self = shift;
1312 0 0         unless (scalar keys %{$self->{seq_ids}}) {
  0            
1313 0           $self->_get_seq_ids;
1314             }
1315 0           my @s = keys %{$self->{seq_ids}};
  0            
1316 0 0         return wantarray ? @s : \@s;
1317             }
1318              
1319              
1320             sub seq_id_lengths {
1321 0     0 1   my $self = shift;
1322 0 0         unless (scalar keys %{$self->{seq_ids}}) {
  0            
1323 0           $self->_get_seq_ids;
1324             }
1325 0           return $self->{seq_ids};
1326             }
1327              
1328             sub _get_seq_ids {
1329 0     0     my $self = shift;
1330 0 0         return unless $self->{'eof'};
1331 0           foreach (@{ $self->{top_features} }) {
  0            
1332 0           my $s = $_->seq_id;
1333 0 0         unless (exists $self->{seq_ids}{$s}) {
1334 0           $self->{seq_ids}{$s} = 1;
1335             }
1336 0 0         $self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s};
1337             }
1338             }
1339              
1340             __END__