line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: gbxml.pm |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO::gbxml |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Cared for by Ryan Golhar |
6
|
|
|
|
|
|
|
# NOTE: This module is implemented on an as needed basis. As features |
7
|
|
|
|
|
|
|
# are needed, they are implemented. Its very bare-bones. |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# Based off http://www.insdc.org/page.php?page=documents&sid=105a8b52b69db9c36c82a2e0d923ca69 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# I tried to follow the genbank module to keep things as consistent as possible |
12
|
|
|
|
|
|
|
# Right now, I'm not respecting the want_slot parameters. This will need to be added. |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::SeqIO::gbxml - GenBank sequence input/output stream using SAX |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
It is probably best not to use this object directly, but rather go |
21
|
|
|
|
|
|
|
through the SeqIO handler system. To read a GenBank XML file: |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
$stream = Bio::SeqIO->new( -file => $filename, -format => 'gbxml'); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
while ( my $bioSeqObj = $stream->next_seq() ) { |
26
|
|
|
|
|
|
|
# do something with $bioSeqObj |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
To write a Seq object to the current file handle in GenBank XML format: |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
$stream->write_seq( -seq => $seqObj); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
If instead you would like a XML::DOM object containing the GBXML, use: |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
my $newXmlObject = $stream->to_bsml( -seq => $seqObj); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
In addition to parts of the Bio:: hierarchy, this module uses: |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
XML::SAX |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 DESCRIPTION |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This object can transform Bio::Seq objects to and from GenBank XML |
46
|
|
|
|
|
|
|
flatfiles. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=head1 FEEDBACK |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 Mailing Lists |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
53
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
54
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
57
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head2 Reporting Bugs |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
62
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
63
|
|
|
|
|
|
|
web: |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 AUTHOR - Ryan Golhar |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
Email golharam-at-umdnj-dot-edu |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=cut |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
package Bio::SeqIO::gbxml; |
74
|
1
|
|
|
1
|
|
543
|
use vars qw($Default_Source); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
49
|
|
75
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
26
|
|
76
|
|
|
|
|
|
|
|
77
|
1
|
|
|
1
|
|
368
|
use Bio::SeqIO::FTHelper; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
78
|
1
|
|
|
1
|
|
4
|
use Bio::SeqFeature::Generic; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
14
|
|
79
|
1
|
|
|
1
|
|
282
|
use Bio::Species; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
21
|
|
80
|
1
|
|
|
1
|
|
4
|
use XML::SAX; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
81
|
1
|
|
|
1
|
|
4
|
use Bio::Seq::SeqFactory; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
16
|
|
82
|
1
|
|
|
1
|
|
3
|
use Bio::Annotation::Collection; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
14
|
|
83
|
1
|
|
|
1
|
|
247
|
use Bio::Annotation::Comment; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
20
|
|
84
|
1
|
|
|
1
|
|
273
|
use Bio::Annotation::Reference; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
22
|
|
85
|
1
|
|
|
1
|
|
4
|
use Bio::Annotation::DBLink; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
17
|
|
86
|
|
|
|
|
|
|
|
87
|
1
|
|
|
1
|
|
3
|
use base qw(Bio::SeqIO XML::SAX::Base); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
328
|
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
$Default_Source = 'GBXML'; |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
sub _initialize { |
92
|
1
|
|
|
1
|
|
2
|
my ($self) = shift; |
93
|
1
|
|
|
|
|
5
|
$self->SUPER::_initialize(@_); |
94
|
1
|
|
|
|
|
9
|
$self->{'_parser'} = XML::SAX::ParserFactory->parser('Handler' => $self); |
95
|
1
|
50
|
|
|
|
27428
|
if( ! defined $self->sequence_factory ) { |
96
|
1
|
|
|
|
|
9
|
$self->sequence_factory(Bio::Seq::SeqFactory->new |
97
|
|
|
|
|
|
|
(-verbose => $self->verbose(), |
98
|
|
|
|
|
|
|
-type => 'Bio::Seq::RichSeq')); |
99
|
|
|
|
|
|
|
} |
100
|
1
|
|
|
|
|
3
|
return; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head1 METHODS |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=cut |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head2 next_seq |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
Title : next_seq |
110
|
|
|
|
|
|
|
Usage : my $bioSeqObj = $stream->next_seq |
111
|
|
|
|
|
|
|
Function: Retrieves the next sequence from a SeqIO::gbxml stream. |
112
|
|
|
|
|
|
|
Returns : A reference to a Bio::Seq::RichSeq object |
113
|
|
|
|
|
|
|
Args : |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=cut |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub next_seq { |
118
|
1
|
|
|
1
|
1
|
717
|
my $self = shift; |
119
|
1
|
50
|
33
|
|
|
1
|
if( @{$self->{'_seendata'}->{'_seqs'} || []} || eof($self->_fh)) { |
|
1
|
50
|
|
|
|
13
|
|
120
|
0
|
|
|
|
|
0
|
return shift @{$self->{'_seendata'}->{'_seqs'}}; |
|
0
|
|
|
|
|
0
|
|
121
|
|
|
|
|
|
|
} |
122
|
1
|
|
|
|
|
5
|
$self->{'_parser'}->parse_file($self->_fh); |
123
|
1
|
|
|
|
|
33
|
return shift @{$self->{'_seendata'}->{'_seqs'}}; |
|
1
|
|
|
|
|
5
|
|
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# XML::SAX::Base methods |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub start_document { |
129
|
1
|
|
|
1
|
1
|
229
|
my ($self,$doc) = @_; |
130
|
1
|
|
|
|
|
4
|
$self->{'_seendata'} = {'_seqs' => [] #, |
131
|
|
|
|
|
|
|
# '_authors' => [], |
132
|
|
|
|
|
|
|
# '_feats' => [] |
133
|
|
|
|
|
|
|
}; |
134
|
1
|
|
|
|
|
11
|
$self->SUPER::start_document($doc); |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub end_document { |
138
|
1
|
|
|
1
|
1
|
117
|
my ($self,$doc) = @_; |
139
|
1
|
|
|
|
|
18
|
$self->SUPER::end_document($doc); |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
sub start_element { |
144
|
60
|
|
|
60
|
1
|
9175
|
my ($self,$ele) = @_; |
145
|
60
|
|
|
|
|
83
|
my $name = uc($ele->{'LocalName'}); |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# my $attr = $ele->{'Attributes'}; |
148
|
|
|
|
|
|
|
# my $seqid = defined $self->{'_seendata'}->{'_seqs'}->[-1] ? |
149
|
|
|
|
|
|
|
# $self->{'_seendata'}->{'_seqs'}->[-1]->display_id : undef; |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# for my $k ( keys %$attr ) { |
152
|
|
|
|
|
|
|
# $attr->{uc $k} = $attr->{$k}; |
153
|
|
|
|
|
|
|
# delete $attr->{$k}; |
154
|
|
|
|
|
|
|
# } |
155
|
|
|
|
|
|
|
|
156
|
60
|
100
|
|
|
|
163
|
if( $name eq 'GBSET' ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
} elsif( $name eq 'GBSEQ' ) { |
159
|
|
|
|
|
|
|
# Initialize, we are starting a new sequence. |
160
|
1
|
|
|
|
|
1
|
push @{$self->{'_seendata'}->{'_seqs'}}, |
|
1
|
|
|
|
|
5
|
|
161
|
|
|
|
|
|
|
$self->sequence_factory->create(); |
162
|
|
|
|
|
|
|
} elsif( $name eq 'GBFEATURE' ) { |
163
|
1
|
|
|
|
|
2
|
my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1]; |
164
|
1
|
|
|
|
|
8
|
my $fthelper = Bio::SeqIO::FTHelper->new(); |
165
|
1
|
|
|
|
|
39
|
$fthelper->verbose($self->verbose()); |
166
|
1
|
|
|
|
|
1
|
push @{$self->{'_seendata'}->{'_feats'}}, $fthelper; |
|
1
|
|
|
|
|
4
|
|
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# } elsif( $name eq 'FEATURE-TABLES' ) { |
170
|
|
|
|
|
|
|
# } elsif( $name eq 'database-xref' ) { |
171
|
|
|
|
|
|
|
# my ($db,$id) = split(/:/,$content); |
172
|
|
|
|
|
|
|
# $curseq->annotation->add_Annotation('dblink', |
173
|
|
|
|
|
|
|
# Bio::Annotation::DBLink->new |
174
|
|
|
|
|
|
|
# ( -database => $db, |
175
|
|
|
|
|
|
|
# -primary_id=> $id)); |
176
|
|
|
|
|
|
|
# } elsif( $name eq 'INTERVAL-LOC' ) { |
177
|
|
|
|
|
|
|
# my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1]; |
178
|
|
|
|
|
|
|
# my ($start,$end,$strand) = |
179
|
|
|
|
|
|
|
# map { $attr->{'{}'.$_}->{'Value'} } qw(STARTPOS |
180
|
|
|
|
|
|
|
# ENDPOS |
181
|
|
|
|
|
|
|
# COMPLEMENT); |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# $curfeat->start($start); |
184
|
|
|
|
|
|
|
# $curfeat->end($end); |
185
|
|
|
|
|
|
|
# $curfeat->strand(-1) if($strand); |
186
|
|
|
|
|
|
|
# } elsif( $name eq 'REFERENCE' ) { |
187
|
|
|
|
|
|
|
# push @{$self->{'_seendata'}->{'_annot'}}, |
188
|
|
|
|
|
|
|
# Bio::Annotation::Reference->new(); |
189
|
|
|
|
|
|
|
# } |
190
|
60
|
|
|
|
|
59
|
$self->{'_characters'} = ''; |
191
|
|
|
|
|
|
|
|
192
|
60
|
|
|
|
|
39
|
push @{$self->{'_state'}}, $name; |
|
60
|
|
|
|
|
79
|
|
193
|
60
|
|
|
|
|
113
|
$self->SUPER::start_element($ele); |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub end_element { |
197
|
60
|
|
|
60
|
1
|
4070
|
my ($self,$ele) = @_; |
198
|
60
|
|
|
|
|
46
|
pop @{$self->{'_state'}}; |
|
60
|
|
|
|
|
68
|
|
199
|
60
|
|
|
|
|
75
|
my $name = uc $ele->{'LocalName'}; |
200
|
60
|
|
|
|
|
62
|
my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1]; |
201
|
60
|
|
|
|
|
48
|
my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1]; |
202
|
|
|
|
|
|
|
|
203
|
60
|
100
|
|
|
|
462
|
if ($name eq 'GBSEQ_LOCUS') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
204
|
1
|
|
|
|
|
7
|
$curseq->display_id($self->{'_characters'}); |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_LENGTH' ) { |
207
|
1
|
|
|
|
|
6
|
$curseq->length($self->{'_characters'}); |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_MOLTYPE' ) { |
210
|
1
|
50
|
|
|
|
7
|
if ($self->{'_characters'} =~ /mRNA|dna/) { |
211
|
1
|
|
|
|
|
8
|
$curseq->alphabet('dna'); |
212
|
|
|
|
|
|
|
} else { |
213
|
0
|
|
|
|
|
0
|
$curseq->alphabet('protein'); |
214
|
|
|
|
|
|
|
} |
215
|
1
|
|
|
|
|
4
|
$curseq->molecule($self->{'_characters'}); |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_TOPOLOGY' ) { |
218
|
1
|
50
|
|
|
|
10
|
$curseq->is_circular(($self->{'_characters'} =~ /^linear$/i) ? 0 : 1); |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_DIVISION' ) { |
221
|
1
|
|
|
|
|
4
|
$curseq->division($self->{'_characters'}); |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
} elsif ($name =~ m/GBSEQ_UPDATE-DATE|GBSEQ_CREATE-DATE/ ) { |
224
|
2
|
|
|
|
|
4
|
my $date = $self->{'_characters'}; |
225
|
|
|
|
|
|
|
# This code was taken from genbank.pm |
226
|
2
|
50
|
|
|
|
15
|
if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) { |
227
|
2
|
50
|
|
|
|
5
|
if( length($date) < 11 ) { # improperly formatted date |
228
|
|
|
|
|
|
|
# But we'll be nice and fix it for them |
229
|
0
|
|
|
|
|
0
|
my ($d,$m,$y) = ($2,$3,$4); |
230
|
0
|
0
|
|
|
|
0
|
$d = "0$d" if( length($d) == 1 ); |
231
|
|
|
|
|
|
|
# guess the century here |
232
|
0
|
0
|
|
|
|
0
|
if( length($y) == 2 ) { |
233
|
|
|
|
|
|
|
# arbitrarily guess that '60' means 1960 |
234
|
0
|
0
|
|
|
|
0
|
$y = ($y > 60) ? "19$y" : "20$y"; |
235
|
0
|
|
|
|
|
0
|
$self->warn("Date was malformed, guessing the century for $date to be $y\n"); |
236
|
|
|
|
|
|
|
} |
237
|
0
|
|
|
|
|
0
|
$date = [join('-',$d,$m,$y)]; |
238
|
|
|
|
|
|
|
} |
239
|
2
|
|
|
|
|
5
|
$curseq->add_date($date); |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_DEFINITION' ) { |
243
|
1
|
|
|
|
|
7
|
$curseq->description($self->{'_characters'}); |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_PRIMARY-ACCESSION' ) { |
246
|
1
|
|
|
|
|
7
|
$curseq->accession_number($self->{'_characters'}); |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_ACCESSION-VERSION' ) { |
249
|
|
|
|
|
|
|
# also taken from genbank.pm |
250
|
1
|
|
|
|
|
5
|
$self->{'_characters'} =~ m/^\w+\.(\d+)/; |
251
|
1
|
50
|
|
|
|
4
|
if ($1) { |
252
|
1
|
|
|
|
|
6
|
$curseq->version($1); |
253
|
1
|
|
|
|
|
3
|
$curseq->seq_version($1); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQID' ) { |
257
|
3
|
100
|
|
|
|
10
|
if ($self->{'_characters'} =~ m/gi\|(\d+)/) { |
258
|
1
|
|
|
|
|
7
|
$curseq->primary_id($1); |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_SOURCE') { |
262
|
1
|
|
|
|
|
4
|
$self->{'_taxa'}->{'_common'} = $self->{'_characters'}; |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_ORGANISM' ) { |
265
|
|
|
|
|
|
|
# taken from genbank.pm |
266
|
1
|
|
|
|
|
3
|
my @organell_names = ("chloroplast", "mitochondr"); |
267
|
1
|
|
|
|
|
3
|
my @spflds = split(' ', $self->{'_characters'}); |
268
|
|
|
|
|
|
|
|
269
|
1
|
|
|
|
|
2
|
$_ = $self->{'_characters'}; |
270
|
1
|
50
|
|
|
|
1
|
if (grep { $_ =~ /^$spflds[0]/i; } @organell_names) { |
|
2
|
|
|
|
|
22
|
|
271
|
0
|
|
|
|
|
0
|
$self->{'_taxa'}->{'_organelle'} = shift(@spflds); |
272
|
|
|
|
|
|
|
} |
273
|
1
|
|
|
|
|
2
|
$self->{'_taxa'}->{'_genus'} = shift(@spflds); |
274
|
1
|
50
|
|
|
|
4
|
$self->{'_taxa'}->{'_species'} = shift(@spflds) if (@spflds); |
275
|
1
|
50
|
|
|
|
3
|
$self->{'_taxa'}->{'_sub_species'} = shift(@spflds) if (@spflds); |
276
|
1
|
|
|
|
|
3
|
$self->{'_taxa'}->{'_ns_name'} = $self->{'_characters'}; |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_TAXONOMY' ) { |
279
|
|
|
|
|
|
|
# taken from genbank.pm |
280
|
1
|
|
|
|
|
2
|
$_ = $self->{'_characters'}; |
281
|
1
|
|
|
|
|
1
|
my @class; |
282
|
1
|
|
|
|
|
8
|
push (@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $_); |
|
9
|
|
|
|
|
12
|
|
|
9
|
|
|
|
|
7
|
|
|
9
|
|
|
|
|
10
|
|
283
|
|
|
|
|
|
|
|
284
|
1
|
50
|
33
|
|
|
12
|
next unless $self->{'_taxa'}->{'_genus'} and $self->{'_taxa'}->{'_genus'} !~ /^(unknown|None)$/oi; |
285
|
1
|
50
|
|
|
|
5
|
if ($class[0] eq 'Viruses') { |
|
|
50
|
|
|
|
|
|
286
|
0
|
|
|
|
|
0
|
push( @class, $self->{'_taxa'}->{'_ns_name'} ); |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
elsif ($class[$#class] eq $self->{'_taxa'}->{'_genus'}) { |
289
|
1
|
|
|
|
|
2
|
push( @class, $self->{'_taxa'}->{'_species'} ); |
290
|
|
|
|
|
|
|
} else { |
291
|
0
|
|
|
|
|
0
|
push( @class, $self->{'_taxa'}->{'_genus'}, $self->{'_taxa'}->{'_species'} ); |
292
|
|
|
|
|
|
|
} |
293
|
1
|
|
|
|
|
2
|
@class = reverse @class; |
294
|
|
|
|
|
|
|
|
295
|
1
|
|
|
|
|
8
|
my $make = Bio::Species->new(); |
296
|
1
|
|
|
|
|
3
|
$make->classification( \@class, "FORCE"); |
297
|
1
|
50
|
|
|
|
8
|
$make->common_name($self->{'_taxa'}->{'_common'}) if $self->{'_taxa'}->{'_common'}; |
298
|
1
|
50
|
|
|
|
3
|
unless ($class[-1] eq 'Viruses') { |
299
|
1
|
50
|
|
|
|
3
|
$make->sub_species( $self->{'_taxa'}->{'_sub_species'} ) if $self->{'_taxa'}->{'_sub_species'}; |
300
|
|
|
|
|
|
|
} |
301
|
1
|
50
|
|
|
|
7
|
$make->organelle( $self->{'_taxa'}->{'_organelle'} ) if $self->{'_taxa'}->{'_organelle'}; |
302
|
1
|
|
|
|
|
8
|
$curseq->species($make); |
303
|
1
|
|
|
|
|
3
|
delete $self->{'_taxa'}; |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
} elsif( $name eq 'GBSEQ_COMMENT' ) { |
306
|
1
|
50
|
|
|
|
7
|
$curseq->annotation->add_Annotation('comment', Bio::Annotation::Comment->new(-text => $self->{'_characters'} )) if ($self->{'_characters'}); |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
} elsif ($name eq 'GBFEATURE_KEY' ) { |
309
|
1
|
|
|
|
|
6
|
$curfeat->key($self->{'_characters'}); |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
} elsif ($name eq 'GBFEATURE_LOCATION' ) { |
312
|
1
|
|
|
|
|
5
|
$curfeat->loc($self->{'_characters'}); |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
} elsif ($name eq 'GBQUALIFIER_NAME' ) { |
315
|
6
|
|
|
|
|
10
|
$self->{'_feature'}->{"_qualifer_name"} = $self->{'_characters'}; |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
} elsif ($name eq 'GBQUALIFIER_VALUE' ) { |
318
|
6
|
|
|
|
|
7
|
my $qualifier = $self->{'_feature'}->{"_qualifer_name"}; |
319
|
6
|
|
|
|
|
7
|
delete $self->{'_feature'}->{"_qualifer_name"}; |
320
|
|
|
|
|
|
|
|
321
|
6
|
|
50
|
|
|
13
|
$curfeat->field->{$qualifier} ||= []; |
322
|
6
|
|
|
|
|
3
|
push(@{$curfeat->field->{$qualifier}}, $self->{'_characters'}); |
|
6
|
|
|
|
|
8
|
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
} elsif ($name eq 'GBSEQ_SEQUENCE' ) { |
325
|
1
|
|
|
|
|
7
|
$curseq->seq($self->{'_characters'}); |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
} elsif( $name eq 'GBFEATURE' ) { |
328
|
1
|
|
|
|
|
2
|
shift @{$self->{'_seendata'}->{'_feats'}}; |
|
1
|
|
|
|
|
3
|
|
329
|
|
|
|
|
|
|
# copied from genbank.pm |
330
|
1
|
50
|
|
|
|
2
|
if (!defined($curfeat)) { |
331
|
0
|
|
|
|
|
0
|
$self->warn("Unexpected error in feature table for ".$curseq->display_id." Skipping feature, attempting to recover"); |
332
|
|
|
|
|
|
|
} else { |
333
|
1
|
|
|
|
|
6
|
my $feat = $curfeat->_generic_seqfeature($self->location_factory(), $curseq->display_id); |
334
|
1
|
50
|
33
|
|
|
3
|
if ($curseq->species && ($feat->primary_tag eq 'source') && |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
335
|
|
|
|
|
|
|
$feat->has_tag('db_xref') && (! $curseq->species->ncbi_taxid())) { |
336
|
1
|
|
|
|
|
3
|
foreach my $tagval ($feat->get_tag_values('db_xref')) { |
337
|
1
|
50
|
|
|
|
4
|
if (index($tagval,"taxon:") == 0) { |
338
|
1
|
|
|
|
|
2
|
$curseq->species->ncbi_taxid(substr($tagval,6)); |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
} |
342
|
1
|
|
|
|
|
8
|
$curseq->add_SeqFeature($feat); |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# if( $name eq 'REFERENCE') { |
347
|
|
|
|
|
|
|
# my $ref = pop @{$self->{'_seendata'}->{'_annot'}}; |
348
|
|
|
|
|
|
|
# $curseq->annotation->add_Annotation('reference',$ref); |
349
|
|
|
|
|
|
|
# } |
350
|
60
|
|
|
|
|
112
|
$self->SUPER::end_element($ele); |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# Characters should be buffered because we may not always get the entire string. Once the entire string is read |
354
|
|
|
|
|
|
|
# process it in end_element. |
355
|
|
|
|
|
|
|
sub characters { |
356
|
122
|
|
|
122
|
1
|
3865
|
my ($self,$data) = @_; |
357
|
122
|
50
|
|
|
|
85
|
if( ! @{$self->{'_state'}} ) { |
|
122
|
|
|
|
|
176
|
|
358
|
0
|
|
|
|
|
0
|
$self->warn("Calling characters with no previous start_element call. Ignoring data"); |
359
|
|
|
|
|
|
|
} else { |
360
|
|
|
|
|
|
|
# my $curseq = $self->{'_seendata'}->{'_seqs'}->[-1]; |
361
|
|
|
|
|
|
|
# my $curfeat = $self->{'_seendata'}->{'_feats'}->[-1]; |
362
|
|
|
|
|
|
|
# my $curannot = $self->{'_seendata'}->{'_annot'}->[-1]; |
363
|
|
|
|
|
|
|
# my $name = $self->{'_state'}->[-1]; |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# if ($name eq 'GBSEQ_LOCUS' ) { |
366
|
122
|
|
|
|
|
147
|
$self->{'_characters'} .= $data->{'Data'}; |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_LENGTH' ) { |
369
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_MOLTYPE' ) { |
372
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_TOPOLOGY' ) { |
375
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_DIVISION' ) { |
378
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# } elsif ($name =~ m/GBSEQ_UPDATE-DATE|GBSEQ_CREATE-DATE/ ) { |
381
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_DEFINITION' ) { |
384
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_PRIMARY-ACCESSION' ) { |
387
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_ACCESSION-VERSION' ) { |
390
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQID' ) { |
393
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_SOURCE') { |
396
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_ORGANISM' ) { |
399
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_TAXONOMY' ) { |
402
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_COMMENT' ) { |
405
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# } elsif ($name eq 'GBFEATURE_KEY' ) { |
408
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# } elsif ($name eq 'GBFEATURE_LOCATION' ) { |
411
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
# } elsif ($name eq 'GBQUALIFIER_NAME' ) { |
414
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
# } elsif ($name eq 'GBQUALIFIER_VALUE' ) { |
417
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
# } elsif ($name eq 'GBINTERVAL_FROM' ) { |
420
|
|
|
|
|
|
|
# $self->{'_feature'}->{'_interval_from'} = $data->{'Data'}; |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# } elsif ($name eq 'GBINTERVAL_TO' ) { |
423
|
|
|
|
|
|
|
# $self->{'_feature'}->{'_interval_to'} = $data->{'Data'}; |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# } elsif ($name eq 'GBINTERVAL_ACCESSION' ) { |
426
|
|
|
|
|
|
|
# $self->{'_feature'}->{'_interval_accession'} = $data->{'Data'}; |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# } elsif ($name eq 'GBSEQ_SEQUENCE' ) { |
429
|
|
|
|
|
|
|
# $self->{'_characters'} .= $data->{'Data'}; |
430
|
|
|
|
|
|
|
# } |
431
|
|
|
|
|
|
|
} |
432
|
122
|
|
|
|
|
185
|
$self->SUPER::characters($data); |
433
|
|
|
|
|
|
|
} |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
1; |
436
|
|
|
|
|
|
|
|