File Coverage

Bio/SeqIO/gbxml.pm
Criterion Covered Total %
statement 146 160 91.2
branch 71 100 71.0
condition 6 17 35.2
subroutine 19 19 100.0
pod 6 6 100.0
total 248 302 82.1


line stmt bran cond sub pod time code
1             # $Id: gbxml.pm
2             #
3             # BioPerl module for Bio::SeqIO::gbxml
4             #
5             # Cared for by Ryan Golhar
6             # NOTE: This module is implemented on an as needed basis. As features
7             # are needed, they are implemented. Its very bare-bones.
8             #
9             # Based off http://www.insdc.org/page.php?page=documents&sid=105a8b52b69db9c36c82a2e0d923ca69
10             #
11             # I tried to follow the genbank module to keep things as consistent as possible
12             # Right now, I'm not respecting the want_slot parameters. This will need to be added.
13              
14             =head1 NAME
15              
16             Bio::SeqIO::gbxml - GenBank sequence input/output stream using SAX
17              
18             =head1 SYNOPSIS
19              
20             It is probably best not to use this object directly, but rather go
21             through the SeqIO handler system. To read a GenBank XML file:
22              
23             $stream = Bio::SeqIO->new( -file => $filename, -format => 'gbxml');
24              
25             while ( my $bioSeqObj = $stream->next_seq() ) {
26             # do something with $bioSeqObj
27             }
28              
29             To write a Seq object to the current file handle in GenBank XML format:
30              
31             $stream->write_seq( -seq => $seqObj);
32              
33             If instead you would like a XML::DOM object containing the GBXML, use:
34              
35             my $newXmlObject = $stream->to_bsml( -seq => $seqObj);
36              
37             =head1 DEPENDENCIES
38              
39             In addition to parts of the Bio:: hierarchy, this module uses:
40              
41             XML::SAX
42              
43             =head1 DESCRIPTION
44              
45             This object can transform Bio::Seq objects to and from GenBank XML
46             flatfiles.
47              
48             =head1 FEEDBACK
49              
50             =head2 Mailing Lists
51              
52             User feedback is an integral part of the evolution of this and other
53             Bioperl modules. Send your comments and suggestions preferably to one
54             of the Bioperl mailing lists. Your participation is much appreciated.
55              
56             bioperl-l@bioperl.org - General discussion
57             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
58              
59             =head2 Reporting Bugs
60              
61             Report bugs to the Bioperl bug tracking system to help us keep track
62             the bugs and their resolution. Bug reports can be submitted via the
63             web:
64              
65             https://github.com/bioperl/bioperl-live/issues
66              
67             =head1 AUTHOR - Ryan Golhar
68              
69             Email golharam-at-umdnj-dot-edu
70              
71             =cut
72              
73             package Bio::SeqIO::gbxml;
74 1     1   385 use vars qw($Default_Source);
  1         1  
  1         33  
75 1     1   3 use strict;
  1         1  
  1         13  
76              
77 1     1   263 use Bio::SeqIO::FTHelper;
  1         1  
  1         22  
78 1     1   6 use Bio::SeqFeature::Generic;
  1         1  
  1         13  
79 1     1   291 use Bio::Species;
  1         1  
  1         27  
80 1     1   4 use XML::SAX;
  1         1  
  1         36  
81 1     1   4 use Bio::Seq::SeqFactory;
  1         2  
  1         17  
82 1     1   3 use Bio::Annotation::Collection;
  1         1  
  1         14  
83 1     1   269 use Bio::Annotation::Comment;
  1         1  
  1         19  
84 1     1   265 use Bio::Annotation::Reference;
  1         1  
  1         22  
85 1     1   3 use Bio::Annotation::DBLink;
  1         2  
  1         15  
86              
87 1     1   3 use base qw(Bio::SeqIO XML::SAX::Base);
  1         2  
  1         330  
88              
89             $Default_Source = 'GBXML';
90              
91             sub _initialize {
92 1     1   2 my ($self) = shift;
93 1         5 $self->SUPER::_initialize(@_);
94 1         7 $self->{'_parser'} = XML::SAX::ParserFactory->parser('Handler' => $self);
95 1 50       27358 if( ! defined $self->sequence_factory ) {
96 1         7 $self->sequence_factory(Bio::Seq::SeqFactory->new
97             (-verbose => $self->verbose(),
98             -type => 'Bio::Seq::RichSeq'));
99             }
100 1         3 return;
101             }
102              
103             =head1 METHODS
104              
105             =cut
106              
107             =head2 next_seq
108              
109             Title : next_seq
110             Usage : my $bioSeqObj = $stream->next_seq
111             Function: Retrieves the next sequence from a SeqIO::gbxml stream.
112             Returns : A reference to a Bio::Seq::RichSeq object
113             Args :
114              
115             =cut
116              
117             sub next_seq {
118 1     1 1 697 my $self = shift;
119 1 50 33     2 if( @{$self->{'_seendata'}->{'_seqs'} || []} || eof($self->_fh)) {
  1 50       12  
120 0         0 return shift @{$self->{'_seendata'}->{'_seqs'}};
  0         0  
121             }
122 1         5 $self->{'_parser'}->parse_file($self->_fh);
123 1         35 return shift @{$self->{'_seendata'}->{'_seqs'}};
  1         5  
124             }
125              
126             # XML::SAX::Base methods
127              
128             sub start_document {
129 1     1 1 249 my ($self,$doc) = @_;
130 1         3 $self->{'_seendata'} = {'_seqs' => [] #,
131             # '_authors' => [],
132             # '_feats' => []
133             };
134 1         10 $self->SUPER::start_document($doc);
135             }
136              
137             sub end_document {
138 1     1 1 114 my ($self,$doc) = @_;
139 1         18 $self->SUPER::end_document($doc);
140             }
141              
142              
143             sub start_element {
144 60     60 1 9208 my ($self,$ele) = @_;
145 60         77 my $name = uc($ele->{'LocalName'});
146              
147             # my $attr = $ele->{'Attributes'};
148             # my $seqid = defined $self->{'_seendata'}->{'_seqs'}->[-1] ?
149             # $self->{'_seendata'}->{'_seqs'}->[-1]->display_id : undef;
150              
151             # for my $k ( keys %$attr ) {
152             # $attr->{uc $k} = $attr->{$k};
153             # delete $attr->{$k};
154             # }
155            
156 60 100       163 if( $name eq 'GBSET' ) {
    100          
    100          
157            
158             } elsif( $name eq 'GBSEQ' ) {
159             # Initialize, we are starting a new sequence.
160 1         1 push @{$self->{'_seendata'}->{'_seqs'}},
  1         5  
161             $self->sequence_factory->create();
162             } elsif( $name eq 'GBFEATURE' ) {
163 1         2 my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1];
164 1         9 my $fthelper = Bio::SeqIO::FTHelper->new();
165 1         45 $fthelper->verbose($self->verbose());
166 1         3 push @{$self->{'_seendata'}->{'_feats'}}, $fthelper;
  1         3  
167             }
168            
169             # } elsif( $name eq 'FEATURE-TABLES' ) {
170             # } elsif( $name eq 'database-xref' ) {
171             # my ($db,$id) = split(/:/,$content);
172             # $curseq->annotation->add_Annotation('dblink',
173             # Bio::Annotation::DBLink->new
174             # ( -database => $db,
175             # -primary_id=> $id));
176             # } elsif( $name eq 'INTERVAL-LOC' ) {
177             # my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1];
178             # my ($start,$end,$strand) =
179             # map { $attr->{'{}'.$_}->{'Value'} } qw(STARTPOS
180             # ENDPOS
181             # COMPLEMENT);
182              
183             # $curfeat->start($start);
184             # $curfeat->end($end);
185             # $curfeat->strand(-1) if($strand);
186             # } elsif( $name eq 'REFERENCE' ) {
187             # push @{$self->{'_seendata'}->{'_annot'}},
188             # Bio::Annotation::Reference->new();
189             # }
190 60         64 $self->{'_characters'} = '';
191            
192 60         45 push @{$self->{'_state'}}, $name;
  60         86  
193 60         113 $self->SUPER::start_element($ele);
194             }
195              
196             sub end_element {
197 60     60 1 4068 my ($self,$ele) = @_;
198 60         43 pop @{$self->{'_state'}};
  60         61  
199 60         73 my $name = uc $ele->{'LocalName'};
200 60         77 my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1];
201 60         57 my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1];
202            
203 60 100       475 if ($name eq 'GBSEQ_LOCUS') {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
    100          
204 1         6 $curseq->display_id($self->{'_characters'});
205            
206             } elsif ($name eq 'GBSEQ_LENGTH' ) {
207 1         7 $curseq->length($self->{'_characters'});
208            
209             } elsif ($name eq 'GBSEQ_MOLTYPE' ) {
210 1 50       10 if ($self->{'_characters'} =~ /mRNA|dna/) {
211 1         7 $curseq->alphabet('dna');
212             } else {
213 0         0 $curseq->alphabet('protein');
214             }
215 1         5 $curseq->molecule($self->{'_characters'});
216            
217             } elsif ($name eq 'GBSEQ_TOPOLOGY' ) {
218 1 50       9 $curseq->is_circular(($self->{'_characters'} =~ /^linear$/i) ? 0 : 1);
219              
220             } elsif ($name eq 'GBSEQ_DIVISION' ) {
221 1         3 $curseq->division($self->{'_characters'});
222              
223             } elsif ($name =~ m/GBSEQ_UPDATE-DATE|GBSEQ_CREATE-DATE/ ) {
224 2         3 my $date = $self->{'_characters'};
225             # This code was taken from genbank.pm
226 2 50       12 if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
227 2 50       6 if( length($date) < 11 ) { # improperly formatted date
228             # But we'll be nice and fix it for them
229 0         0 my ($d,$m,$y) = ($2,$3,$4);
230 0 0       0 $d = "0$d" if( length($d) == 1 );
231             # guess the century here
232 0 0       0 if( length($y) == 2 ) {
233             # arbitrarily guess that '60' means 1960
234 0 0       0 $y = ($y > 60) ? "19$y" : "20$y";
235 0         0 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
236             }
237 0         0 $date = [join('-',$d,$m,$y)];
238             }
239 2         6 $curseq->add_date($date);
240             }
241              
242             } elsif ($name eq 'GBSEQ_DEFINITION' ) {
243 1         8 $curseq->description($self->{'_characters'});
244              
245             } elsif ($name eq 'GBSEQ_PRIMARY-ACCESSION' ) {
246 1         7 $curseq->accession_number($self->{'_characters'});
247              
248             } elsif ($name eq 'GBSEQ_ACCESSION-VERSION' ) {
249             # also taken from genbank.pm
250 1         4 $self->{'_characters'} =~ m/^\w+\.(\d+)/;
251 1 50       3 if ($1) {
252 1         6 $curseq->version($1);
253 1         3 $curseq->seq_version($1);
254             }
255              
256             } elsif ($name eq 'GBSEQID' ) {
257 3 100       10 if ($self->{'_characters'} =~ m/gi\|(\d+)/) {
258 1         8 $curseq->primary_id($1);
259             }
260              
261             } elsif ($name eq 'GBSEQ_SOURCE') {
262 1         3 $self->{'_taxa'}->{'_common'} = $self->{'_characters'};
263              
264             } elsif ($name eq 'GBSEQ_ORGANISM' ) {
265             # taken from genbank.pm
266 1         2 my @organell_names = ("chloroplast", "mitochondr");
267 1         4 my @spflds = split(' ', $self->{'_characters'});
268            
269 1         2 $_ = $self->{'_characters'};
270 1 50       2 if (grep { $_ =~ /^$spflds[0]/i; } @organell_names) {
  2         23  
271 0         0 $self->{'_taxa'}->{'_organelle'} = shift(@spflds);
272             }
273 1         3 $self->{'_taxa'}->{'_genus'} = shift(@spflds);
274 1 50       4 $self->{'_taxa'}->{'_species'} = shift(@spflds) if (@spflds);
275 1 50       3 $self->{'_taxa'}->{'_sub_species'} = shift(@spflds) if (@spflds);
276 1         2 $self->{'_taxa'}->{'_ns_name'} = $self->{'_characters'};
277            
278             } elsif ($name eq 'GBSEQ_TAXONOMY' ) {
279             # taken from genbank.pm
280 1         2 $_ = $self->{'_characters'};
281 1         1 my @class;
282 1         8 push (@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $_);
  9         14  
  9         7  
  9         16  
283            
284 1 50 33     18 next unless $self->{'_taxa'}->{'_genus'} and $self->{'_taxa'}->{'_genus'} !~ /^(unknown|None)$/oi;
285 1 50       6 if ($class[0] eq 'Viruses') {
    50          
286 0         0 push( @class, $self->{'_taxa'}->{'_ns_name'} );
287             }
288             elsif ($class[$#class] eq $self->{'_taxa'}->{'_genus'}) {
289 1         2 push( @class, $self->{'_taxa'}->{'_species'} );
290             } else {
291 0         0 push( @class, $self->{'_taxa'}->{'_genus'}, $self->{'_taxa'}->{'_species'} );
292             }
293 1         3 @class = reverse @class;
294            
295 1         10 my $make = Bio::Species->new();
296 1         4 $make->classification( \@class, "FORCE");
297 1 50       17 $make->common_name($self->{'_taxa'}->{'_common'}) if $self->{'_taxa'}->{'_common'};
298 1 50       6 unless ($class[-1] eq 'Viruses') {
299 1 50       5 $make->sub_species( $self->{'_taxa'}->{'_sub_species'} ) if $self->{'_taxa'}->{'_sub_species'};
300             }
301 1 50       3 $make->organelle( $self->{'_taxa'}->{'_organelle'} ) if $self->{'_taxa'}->{'_organelle'};
302 1         15 $curseq->species($make);
303 1         5 delete $self->{'_taxa'};
304              
305             } elsif( $name eq 'GBSEQ_COMMENT' ) {
306 1 50       7 $curseq->annotation->add_Annotation('comment', Bio::Annotation::Comment->new(-text => $self->{'_characters'} )) if ($self->{'_characters'});
307            
308             } elsif ($name eq 'GBFEATURE_KEY' ) {
309 1         6 $curfeat->key($self->{'_characters'});
310            
311             } elsif ($name eq 'GBFEATURE_LOCATION' ) {
312 1         4 $curfeat->loc($self->{'_characters'});
313            
314             } elsif ($name eq 'GBQUALIFIER_NAME' ) {
315 6         9 $self->{'_feature'}->{"_qualifer_name"} = $self->{'_characters'};
316              
317             } elsif ($name eq 'GBQUALIFIER_VALUE' ) {
318 6         9 my $qualifier = $self->{'_feature'}->{"_qualifer_name"};
319 6         6 delete $self->{'_feature'}->{"_qualifer_name"};
320            
321 6   50     12 $curfeat->field->{$qualifier} ||= [];
322 6         4 push(@{$curfeat->field->{$qualifier}}, $self->{'_characters'});
  6         10  
323            
324             } elsif ($name eq 'GBSEQ_SEQUENCE' ) {
325 1         8 $curseq->seq($self->{'_characters'});
326            
327             } elsif( $name eq 'GBFEATURE' ) {
328 1         1 shift @{$self->{'_seendata'}->{'_feats'}};
  1         3  
329             # copied from genbank.pm
330 1 50       3 if (!defined($curfeat)) {
331 0         0 $self->warn("Unexpected error in feature table for ".$curseq->display_id." Skipping feature, attempting to recover");
332             } else {
333 1         6 my $feat = $curfeat->_generic_seqfeature($self->location_factory(), $curseq->display_id);
334 1 50 33     4 if ($curseq->species && ($feat->primary_tag eq 'source') &&
      33        
      33        
335             $feat->has_tag('db_xref') && (! $curseq->species->ncbi_taxid())) {
336 1         3 foreach my $tagval ($feat->get_tag_values('db_xref')) {
337 1 50       25 if (index($tagval,"taxon:") == 0) {
338 1         3 $curseq->species->ncbi_taxid(substr($tagval,6));
339             }
340             }
341             }
342 1         10 $curseq->add_SeqFeature($feat);
343             }
344             }
345            
346             # if( $name eq 'REFERENCE') {
347             # my $ref = pop @{$self->{'_seendata'}->{'_annot'}};
348             # $curseq->annotation->add_Annotation('reference',$ref);
349             # }
350 60         130 $self->SUPER::end_element($ele);
351             }
352              
353             # Characters should be buffered because we may not always get the entire string. Once the entire string is read
354             # process it in end_element.
355             sub characters {
356 122     122 1 3987 my ($self,$data) = @_;
357 122 50       79 if( ! @{$self->{'_state'}} ) {
  122         178  
358 0         0 $self->warn("Calling characters with no previous start_element call. Ignoring data");
359             } else {
360             # my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1];
361             # my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1];
362             # my $curannot = $self->{'_seendata'}->{'_annot'}->[-1];
363             # my $name = $self->{'_state'}->[-1];
364              
365             # if ($name eq 'GBSEQ_LOCUS' ) {
366 122         137 $self->{'_characters'} .= $data->{'Data'};
367            
368             # } elsif ($name eq 'GBSEQ_LENGTH' ) {
369             # $self->{'_characters'} .= $data->{'Data'};
370              
371             # } elsif ($name eq 'GBSEQ_MOLTYPE' ) {
372             # $self->{'_characters'} .= $data->{'Data'};
373              
374             # } elsif ($name eq 'GBSEQ_TOPOLOGY' ) {
375             # $self->{'_characters'} .= $data->{'Data'};
376              
377             # } elsif ($name eq 'GBSEQ_DIVISION' ) {
378             # $self->{'_characters'} .= $data->{'Data'};
379              
380             # } elsif ($name =~ m/GBSEQ_UPDATE-DATE|GBSEQ_CREATE-DATE/ ) {
381             # $self->{'_characters'} .= $data->{'Data'};
382              
383             # } elsif ($name eq 'GBSEQ_DEFINITION' ) {
384             # $self->{'_characters'} .= $data->{'Data'};
385              
386             # } elsif ($name eq 'GBSEQ_PRIMARY-ACCESSION' ) {
387             # $self->{'_characters'} .= $data->{'Data'};
388            
389             # } elsif ($name eq 'GBSEQ_ACCESSION-VERSION' ) {
390             # $self->{'_characters'} .= $data->{'Data'};
391            
392             # } elsif ($name eq 'GBSEQID' ) {
393             # $self->{'_characters'} .= $data->{'Data'};
394            
395             # } elsif ($name eq 'GBSEQ_SOURCE') {
396             # $self->{'_characters'} .= $data->{'Data'};
397            
398             # } elsif ($name eq 'GBSEQ_ORGANISM' ) {
399             # $self->{'_characters'} .= $data->{'Data'};
400            
401             # } elsif ($name eq 'GBSEQ_TAXONOMY' ) {
402             # $self->{'_characters'} .= $data->{'Data'};
403            
404             # } elsif ($name eq 'GBSEQ_COMMENT' ) {
405             # $self->{'_characters'} .= $data->{'Data'};
406              
407             # } elsif ($name eq 'GBFEATURE_KEY' ) {
408             # $self->{'_characters'} .= $data->{'Data'};
409              
410             # } elsif ($name eq 'GBFEATURE_LOCATION' ) {
411             # $self->{'_characters'} .= $data->{'Data'};
412              
413             # } elsif ($name eq 'GBQUALIFIER_NAME' ) {
414             # $self->{'_characters'} .= $data->{'Data'};
415              
416             # } elsif ($name eq 'GBQUALIFIER_VALUE' ) {
417             # $self->{'_characters'} .= $data->{'Data'};
418            
419             # } elsif ($name eq 'GBINTERVAL_FROM' ) {
420             # $self->{'_feature'}->{'_interval_from'} = $data->{'Data'};
421              
422             # } elsif ($name eq 'GBINTERVAL_TO' ) {
423             # $self->{'_feature'}->{'_interval_to'} = $data->{'Data'};
424              
425             # } elsif ($name eq 'GBINTERVAL_ACCESSION' ) {
426             # $self->{'_feature'}->{'_interval_accession'} = $data->{'Data'};
427              
428             # } elsif ($name eq 'GBSEQ_SEQUENCE' ) {
429             # $self->{'_characters'} .= $data->{'Data'};
430             # }
431             }
432 122         187 $self->SUPER::characters($data);
433             }
434              
435             1;
436