| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO::genbank |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Cared for by Bioperl project bioperl-l(at)bioperl.org |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Copyright Elia Stupka and contributors see AUTHORS section |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::SeqIO::genbank - GenBank sequence input/output stream |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
It is probably best not to use this object directly, but |
|
21
|
|
|
|
|
|
|
rather go through the SeqIO handler: |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
$stream = Bio::SeqIO->new(-file => $filename, |
|
24
|
|
|
|
|
|
|
-format => 'GenBank'); |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
while ( my $seq = $stream->next_seq ) { |
|
27
|
|
|
|
|
|
|
# do something with $seq |
|
28
|
|
|
|
|
|
|
} |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
This object can transform Bio::Seq objects to and from GenBank flat |
|
34
|
|
|
|
|
|
|
file databases. |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
There is some flexibility here about how to write GenBank output |
|
37
|
|
|
|
|
|
|
that is not fully documented. |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head2 Optional functions |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=over 3 |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=item _show_dna() |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
(output only) shows the dna or not |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=item _post_sort() |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
(output only) provides a sorting func which is applied to the FTHelpers |
|
50
|
|
|
|
|
|
|
before printing |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=item _id_generation_func() |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
This is function which is called as |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
print "ID ", $func($seq), "\n"; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
To generate the ID line. If it is not there, it generates a sensible ID |
|
59
|
|
|
|
|
|
|
line using a number of tools. |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
If you want to output annotations in Genbank format they need to be |
|
62
|
|
|
|
|
|
|
stored in a Bio::Annotation::Collection object which is accessible |
|
63
|
|
|
|
|
|
|
through the Bio::SeqI interface method L. |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
The following are the names of the keys which are pulled from a |
|
66
|
|
|
|
|
|
|
L object: |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
reference - Should contain Bio::Annotation::Reference objects |
|
69
|
|
|
|
|
|
|
comment - Should contain Bio::Annotation::Comment objects |
|
70
|
|
|
|
|
|
|
dblink - Should contain a Bio::Annotation::DBLink object |
|
71
|
|
|
|
|
|
|
segment - Should contain a Bio::Annotation::SimpleValue object |
|
72
|
|
|
|
|
|
|
origin - Should contain a Bio::Annotation::SimpleValue object |
|
73
|
|
|
|
|
|
|
wgs - Should contain a Bio::Annotation::SimpleValue object |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=back |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head1 Where does the data go? |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Data parsed in Bio::SeqIO::genbank is stored in a variety of data |
|
80
|
|
|
|
|
|
|
fields in the sequence object that is returned. Here is a partial list |
|
81
|
|
|
|
|
|
|
of fields. |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you |
|
84
|
|
|
|
|
|
|
the top level object which defines a function called NAME() which |
|
85
|
|
|
|
|
|
|
stores this information. |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
Items listed as Annotation 'NAME' tell you the data is stored the |
|
88
|
|
|
|
|
|
|
associated Bio::AnnotationCollectionI object which is associated with |
|
89
|
|
|
|
|
|
|
Bio::Seq objects. If it is explicitly requested that no annotations |
|
90
|
|
|
|
|
|
|
should be stored when parsing a record of course they will not be |
|
91
|
|
|
|
|
|
|
available when you try and get them. If you are having this problem |
|
92
|
|
|
|
|
|
|
look at the type of SeqBuilder that is being used to contruct your |
|
93
|
|
|
|
|
|
|
sequence object. |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Comments Annotation 'comment' |
|
96
|
|
|
|
|
|
|
References Annotation 'reference' |
|
97
|
|
|
|
|
|
|
Segment Annotation 'segment' |
|
98
|
|
|
|
|
|
|
Origin Annotation 'origin' |
|
99
|
|
|
|
|
|
|
Dbsource Annotation 'dblink' |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Accessions PrimarySeq accession_number() |
|
102
|
|
|
|
|
|
|
Secondary accessions RichSeq get_secondary_accessions() |
|
103
|
|
|
|
|
|
|
GI number PrimarySeq primary_id() |
|
104
|
|
|
|
|
|
|
LOCUS PrimarySeq display_id() |
|
105
|
|
|
|
|
|
|
Keywords RichSeq get_keywords() |
|
106
|
|
|
|
|
|
|
Dates RichSeq get_dates() |
|
107
|
|
|
|
|
|
|
Molecule RichSeq molecule() |
|
108
|
|
|
|
|
|
|
Seq Version RichSeq seq_version() |
|
109
|
|
|
|
|
|
|
PID RichSeq pid() |
|
110
|
|
|
|
|
|
|
Division RichSeq division() |
|
111
|
|
|
|
|
|
|
Features Seq get_SeqFeatures() |
|
112
|
|
|
|
|
|
|
Alphabet PrimarySeq alphabet() |
|
113
|
|
|
|
|
|
|
Definition PrimarySeq description() or desc() |
|
114
|
|
|
|
|
|
|
Version PrimarySeq version() |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Sequence PrimarySeq seq() |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
There is more information in the Feature-Annotation HOWTO about each |
|
119
|
|
|
|
|
|
|
field and how it is mapped to the Sequence object |
|
120
|
|
|
|
|
|
|
L. |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head2 Mailing Lists |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
|
127
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
|
128
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
|
131
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=head2 Support |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
I |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
|
140
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
|
141
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
|
142
|
|
|
|
|
|
|
with code and data examples if at all possible. |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
147
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the web: |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head1 AUTHOR - Bioperl Project |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
bioperl-l at bioperl.org |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Original author Elia Stupka, elia -at- tigem.it |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Ewan Birney birney at ebi.ac.uk |
|
160
|
|
|
|
|
|
|
Jason Stajich jason at bioperl.org |
|
161
|
|
|
|
|
|
|
Chris Mungall cjm at fruitfly.bdgp.berkeley.edu |
|
162
|
|
|
|
|
|
|
Lincoln Stein lstein at cshl.org |
|
163
|
|
|
|
|
|
|
Heikki Lehvaslaiho, heikki at ebi.ac.uk |
|
164
|
|
|
|
|
|
|
Hilmar Lapp, hlapp at gmx.net |
|
165
|
|
|
|
|
|
|
Donald G. Jackson, donald.jackson at bms.com |
|
166
|
|
|
|
|
|
|
James Wasmuth, james.wasmuth at ed.ac.uk |
|
167
|
|
|
|
|
|
|
Brian Osborne, bosborne at alum.mit.edu |
|
168
|
|
|
|
|
|
|
Chris Fields, cjfields at bioperl dot org |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=head1 APPENDIX |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
|
173
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=cut |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# Let the code begin... |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
package Bio::SeqIO::genbank; |
|
180
|
16
|
|
|
16
|
|
602
|
use strict; |
|
|
16
|
|
|
|
|
21
|
|
|
|
16
|
|
|
|
|
455
|
|
|
181
|
|
|
|
|
|
|
|
|
182
|
16
|
|
|
16
|
|
4065
|
use Bio::SeqIO::FTHelper; |
|
|
16
|
|
|
|
|
26
|
|
|
|
16
|
|
|
|
|
322
|
|
|
183
|
16
|
|
|
16
|
|
66
|
use Bio::SeqFeature::Generic; |
|
|
16
|
|
|
|
|
18
|
|
|
|
16
|
|
|
|
|
276
|
|
|
184
|
16
|
|
|
16
|
|
4378
|
use Bio::Species; |
|
|
16
|
|
|
|
|
26
|
|
|
|
16
|
|
|
|
|
329
|
|
|
185
|
16
|
|
|
16
|
|
74
|
use Bio::Seq::SeqFactory; |
|
|
16
|
|
|
|
|
15
|
|
|
|
16
|
|
|
|
|
511
|
|
|
186
|
16
|
|
|
16
|
|
53
|
use Bio::Annotation::Collection; |
|
|
16
|
|
|
|
|
17
|
|
|
|
16
|
|
|
|
|
264
|
|
|
187
|
16
|
|
|
16
|
|
4642
|
use Bio::Annotation::Comment; |
|
|
16
|
|
|
|
|
24
|
|
|
|
16
|
|
|
|
|
321
|
|
|
188
|
16
|
|
|
16
|
|
4630
|
use Bio::Annotation::Reference; |
|
|
16
|
|
|
|
|
29
|
|
|
|
16
|
|
|
|
|
376
|
|
|
189
|
16
|
|
|
16
|
|
70
|
use Bio::Annotation::DBLink; |
|
|
16
|
|
|
|
|
13
|
|
|
|
16
|
|
|
|
|
267
|
|
|
190
|
|
|
|
|
|
|
|
|
191
|
16
|
|
|
16
|
|
49
|
use base qw(Bio::SeqIO); |
|
|
16
|
|
|
|
|
17
|
|
|
|
16
|
|
|
|
|
106724
|
|
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# Note that a qualifier that exceeds one line (i.e. a long label) will |
|
194
|
|
|
|
|
|
|
# automatically be quoted regardless: |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
our $FTQUAL_LINE_LENGTH = 60; |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
our %FTQUAL_NO_QUOTE = map {$_ => 1} qw( |
|
199
|
|
|
|
|
|
|
anticodon citation |
|
200
|
|
|
|
|
|
|
codon codon_start |
|
201
|
|
|
|
|
|
|
cons_splice direction |
|
202
|
|
|
|
|
|
|
evidence label |
|
203
|
|
|
|
|
|
|
mod_base number |
|
204
|
|
|
|
|
|
|
rpt_type rpt_unit |
|
205
|
|
|
|
|
|
|
transl_except transl_table |
|
206
|
|
|
|
|
|
|
usedin |
|
207
|
|
|
|
|
|
|
); |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
our %DBSOURCE = map {$_ => 1} qw( |
|
210
|
|
|
|
|
|
|
EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs |
|
211
|
|
|
|
|
|
|
TIGR GO InterPro Pfam PROSITE SGD GermOnline |
|
212
|
|
|
|
|
|
|
HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG |
|
213
|
|
|
|
|
|
|
H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR |
|
214
|
|
|
|
|
|
|
MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome |
|
215
|
|
|
|
|
|
|
UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE |
|
216
|
|
|
|
|
|
|
PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE |
|
217
|
|
|
|
|
|
|
Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase |
|
218
|
|
|
|
|
|
|
TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP |
|
219
|
|
|
|
|
|
|
PhotoList Gramene WormBase WormPep Genew ZFIN |
|
220
|
|
|
|
|
|
|
PeroxiBase MaizeDB TAIR DrugBank REBASE HPA |
|
221
|
|
|
|
|
|
|
swissprot GenBank GenPept REFSEQ embl PDB UniProtKB |
|
222
|
|
|
|
|
|
|
DIP PeptideAtlas PRIDE CYGD HOGENOME Gene3D |
|
223
|
|
|
|
|
|
|
Project); |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
our %VALID_MOLTYPE = map {$_ => 1} qw(NA DNA RNA tRNA rRNA cDNA cRNA ms-DNA |
|
226
|
|
|
|
|
|
|
mRNA uRNA ss-RNA ss-DNA snRNA snoRNA PRT); |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
our %VALID_ALPHABET = ( |
|
229
|
|
|
|
|
|
|
'bp' => 'dna', |
|
230
|
|
|
|
|
|
|
'aa' => 'protein', |
|
231
|
|
|
|
|
|
|
'rc' => '' # rc = release candidate; file has no sequences |
|
232
|
|
|
|
|
|
|
); |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
sub _initialize { |
|
235
|
115
|
|
|
115
|
|
280
|
my($self, @args) = @_; |
|
236
|
|
|
|
|
|
|
|
|
237
|
115
|
|
|
|
|
477
|
$self->SUPER::_initialize(@args); |
|
238
|
|
|
|
|
|
|
# hash for functions for decoding keys. |
|
239
|
115
|
|
|
|
|
439
|
$self->{'_func_ftunit_hash'} = {}; |
|
240
|
115
|
|
|
|
|
451
|
$self->_show_dna(1); # sets this to one by default. People can change it |
|
241
|
115
|
50
|
|
|
|
423
|
if ( not defined $self->sequence_factory ) { |
|
242
|
115
|
|
|
|
|
510
|
$self->sequence_factory |
|
243
|
|
|
|
|
|
|
(Bio::Seq::SeqFactory->new(-verbose => $self->verbose, |
|
244
|
|
|
|
|
|
|
-type => 'Bio::Seq::RichSeq')); |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
} |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
=head2 next_seq |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
Title : next_seq |
|
251
|
|
|
|
|
|
|
Usage : $seq = $stream->next_seq() |
|
252
|
|
|
|
|
|
|
Function: returns the next sequence in the stream |
|
253
|
|
|
|
|
|
|
Returns : Bio::Seq object |
|
254
|
|
|
|
|
|
|
Args : |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=cut |
|
257
|
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
sub next_seq { |
|
259
|
117
|
|
|
117
|
1
|
6491
|
my ($self, @args) = @_; |
|
260
|
117
|
|
|
|
|
302
|
my %args = @args; |
|
261
|
117
|
|
|
|
|
332
|
my $builder = $self->sequence_builder; |
|
262
|
117
|
|
|
|
|
183
|
my $seq; |
|
263
|
|
|
|
|
|
|
my %params; |
|
264
|
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
RECORDSTART: |
|
266
|
117
|
|
|
|
|
134
|
while (1) { |
|
267
|
119
|
|
|
|
|
141
|
my $buffer; |
|
268
|
119
|
|
|
|
|
127
|
my ( @acc, @features ); |
|
269
|
0
|
|
|
|
|
0
|
my ( $display_id, $annotation ); |
|
270
|
0
|
|
|
|
|
0
|
my $species; |
|
271
|
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# initialize; we may come here because of starting over |
|
273
|
119
|
|
|
|
|
220
|
@features = (); |
|
274
|
119
|
|
|
|
|
158
|
$annotation = undef; |
|
275
|
119
|
|
|
|
|
450
|
@acc = (); |
|
276
|
119
|
|
|
|
|
152
|
$species = undef; |
|
277
|
119
|
|
|
|
|
454
|
%params = ( -verbose => $self->verbose ); # reset hash |
|
278
|
119
|
|
|
|
|
576
|
local ($/) = "\n"; |
|
279
|
119
|
|
|
|
|
497
|
while ( defined( $buffer = $self->_readline ) ) { |
|
280
|
189
|
100
|
|
|
|
742
|
last if index( $buffer, 'LOCUS ' ) == 0; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
119
|
100
|
|
|
|
327
|
return unless defined $buffer; # end of file |
|
283
|
113
|
50
|
|
|
|
663
|
$buffer =~ /^LOCUS\s+(\S.*)$/o |
|
284
|
|
|
|
|
|
|
or $self->throw( "GenBank stream with bad LOCUS line. " |
|
285
|
|
|
|
|
|
|
. "Not GenBank in my book. Got '$buffer'"); |
|
286
|
113
|
|
|
|
|
621
|
my @tokens = split( ' ', $1 ); |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# this is important to have the id for display in e.g. FTHelper, |
|
289
|
|
|
|
|
|
|
# otherwise you won't know which entry caused an error |
|
290
|
113
|
|
|
|
|
200
|
$display_id = shift @tokens; |
|
291
|
113
|
|
|
|
|
261
|
$params{'-display_id'} = $display_id; |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# may still be useful if we don't want the seq |
|
294
|
113
|
|
|
|
|
169
|
my $seqlength = shift @tokens; |
|
295
|
113
|
50
|
|
|
|
282
|
if ( exists $VALID_ALPHABET{$seqlength} ) { |
|
296
|
|
|
|
|
|
|
# moved one token too far. No locus name? |
|
297
|
0
|
|
|
|
|
0
|
$self->warn( "Bad LOCUS name? Changing [$params{'-display_id'}] " |
|
298
|
|
|
|
|
|
|
. "to 'unknown' and length to '$display_id'" |
|
299
|
|
|
|
|
|
|
); |
|
300
|
0
|
|
|
|
|
0
|
$params{'-display_id'} = 'unknown'; |
|
301
|
0
|
|
|
|
|
0
|
$params{'-length'} = $display_id; |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
# add token back... |
|
304
|
0
|
|
|
|
|
0
|
unshift @tokens, $seqlength; |
|
305
|
|
|
|
|
|
|
} |
|
306
|
|
|
|
|
|
|
else { |
|
307
|
113
|
|
|
|
|
234
|
$params{'-length'} = $seqlength; |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# the alphabet of the entry |
|
311
|
|
|
|
|
|
|
# shouldn't assign alphabet unless one |
|
312
|
|
|
|
|
|
|
# is specifically designated (such as for rc files) |
|
313
|
113
|
|
|
|
|
238
|
my $alphabet = lc( shift @tokens ); |
|
314
|
|
|
|
|
|
|
$params{'-alphabet'} = |
|
315
|
|
|
|
|
|
|
( exists $VALID_ALPHABET{$alphabet} ) |
|
316
|
113
|
50
|
|
|
|
383
|
? $VALID_ALPHABET{$alphabet} |
|
317
|
|
|
|
|
|
|
: $self->warn("Unknown alphabet: $alphabet"); |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
# for aa there is usually no 'molecule' (mRNA etc) |
|
320
|
113
|
100
|
|
|
|
301
|
if ( $params{'-alphabet'} eq 'protein' ) { |
|
321
|
10
|
|
|
|
|
25
|
$params{'-molecule'} = 'PRT'; |
|
322
|
|
|
|
|
|
|
} |
|
323
|
|
|
|
|
|
|
else { |
|
324
|
103
|
|
|
|
|
198
|
$params{'-molecule'} = shift(@tokens); |
|
325
|
|
|
|
|
|
|
} |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
# take care of lower case issues |
|
328
|
113
|
100
|
66
|
|
|
761
|
if ( $params{'-molecule'} eq 'dna' or $params{'-molecule'} eq 'rna' ) { |
|
329
|
4
|
|
|
|
|
10
|
$params{'-molecule'} = uc $params{'-molecule'}; |
|
330
|
|
|
|
|
|
|
} |
|
331
|
|
|
|
|
|
|
$self->debug( "Unrecognized molecule type: " . $params{'-molecule'} ) |
|
332
|
113
|
50
|
|
|
|
496
|
if not exists( $VALID_MOLTYPE{ $params{'-molecule'} } ); |
|
333
|
|
|
|
|
|
|
|
|
334
|
113
|
|
|
|
|
200
|
my $circ = shift @tokens; |
|
335
|
113
|
100
|
|
|
|
270
|
if ( $circ eq 'circular' ) { |
|
336
|
8
|
|
|
|
|
21
|
$params{'-is_circular'} = 1; |
|
337
|
8
|
|
|
|
|
24
|
$params{'-division'} = shift @tokens; |
|
338
|
|
|
|
|
|
|
} |
|
339
|
|
|
|
|
|
|
else { |
|
340
|
|
|
|
|
|
|
# 'linear' or 'circular' may actually be omitted altogether |
|
341
|
105
|
100
|
|
|
|
408
|
$params{'-division'} = |
|
342
|
|
|
|
|
|
|
( CORE::length($circ) == 3 ) ? $circ : shift @tokens; |
|
343
|
|
|
|
|
|
|
} |
|
344
|
113
|
|
|
|
|
284
|
my $date = join( ' ', @tokens ); # we lump together the rest |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# this is per request bug #1513 |
|
347
|
|
|
|
|
|
|
# we can handle: |
|
348
|
|
|
|
|
|
|
# 9-10-2003 |
|
349
|
|
|
|
|
|
|
# 9-10-03 |
|
350
|
|
|
|
|
|
|
# 09-10-2003 |
|
351
|
|
|
|
|
|
|
# 09-10-03 |
|
352
|
113
|
100
|
|
|
|
882
|
if ( $date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/ ) { |
|
353
|
107
|
50
|
|
|
|
252
|
if ( length($date) < 11 ) { |
|
354
|
|
|
|
|
|
|
# improperly formatted date |
|
355
|
|
|
|
|
|
|
# But we'll be nice and fix it for them |
|
356
|
0
|
|
|
|
|
0
|
my ( $d, $m, $y ) = ( $2, $3, $4 ); |
|
357
|
0
|
0
|
|
|
|
0
|
if ( length($d) == 1 ) { |
|
358
|
0
|
|
|
|
|
0
|
$d = "0$d"; |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# guess the century here |
|
362
|
0
|
0
|
|
|
|
0
|
if ( length($y) == 2 ) { |
|
363
|
0
|
0
|
|
|
|
0
|
if ( $y > 60 ) { # arbitrarily guess that '60' means 1960 |
|
364
|
0
|
|
|
|
|
0
|
$y = "19$y"; |
|
365
|
|
|
|
|
|
|
} |
|
366
|
|
|
|
|
|
|
else { |
|
367
|
0
|
|
|
|
|
0
|
$y = "20$y"; |
|
368
|
|
|
|
|
|
|
} |
|
369
|
0
|
|
|
|
|
0
|
$self->warn( "Date was malformed, guessing the " |
|
370
|
|
|
|
|
|
|
. "century for $date to be $y\n" |
|
371
|
|
|
|
|
|
|
); |
|
372
|
|
|
|
|
|
|
} |
|
373
|
0
|
|
|
|
|
0
|
$params{'-dates'} = [ join( '-', $d, $m, $y ) ]; |
|
374
|
|
|
|
|
|
|
} |
|
375
|
|
|
|
|
|
|
else { |
|
376
|
107
|
|
|
|
|
320
|
$params{'-dates'} = [$date]; |
|
377
|
|
|
|
|
|
|
} |
|
378
|
|
|
|
|
|
|
} |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# set them all at once |
|
381
|
113
|
|
|
|
|
667
|
$builder->add_slot_value(%params); |
|
382
|
113
|
|
|
|
|
315
|
%params = (); |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# parse the rest if desired, otherwise start over |
|
385
|
113
|
100
|
|
|
|
312
|
if ( not $builder->want_object ) { |
|
386
|
2
|
|
|
|
|
7
|
$builder->make_object; |
|
387
|
2
|
|
|
|
|
11
|
next RECORDSTART; |
|
388
|
|
|
|
|
|
|
} |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
# set up annotation depending on what the builder wants |
|
391
|
111
|
100
|
|
|
|
318
|
if ( $builder->want_slot('annotation') ) { |
|
392
|
106
|
|
|
|
|
778
|
$annotation = Bio::Annotation::Collection->new; |
|
393
|
|
|
|
|
|
|
} |
|
394
|
|
|
|
|
|
|
|
|
395
|
111
|
|
|
|
|
314
|
$buffer = $self->_readline; |
|
396
|
111
|
|
|
|
|
385
|
while ( defined( my $line = $buffer ) ) { |
|
397
|
|
|
|
|
|
|
# Description line(s) |
|
398
|
739
|
100
|
|
|
|
1952
|
if ($line =~ /^DEFINITION\s+(\S.*\S)/) { |
|
399
|
110
|
|
|
|
|
318
|
my @desc = ($1); |
|
400
|
110
|
|
|
|
|
307
|
while ( defined( $line = $self->_readline ) ) { |
|
401
|
172
|
100
|
|
|
|
573
|
if ($line =~ /^\s+(.*)/) { |
|
402
|
62
|
|
|
|
|
130
|
push( @desc, $1 ); |
|
403
|
62
|
|
|
|
|
149
|
next; |
|
404
|
|
|
|
|
|
|
} |
|
405
|
110
|
|
|
|
|
142
|
last; |
|
406
|
|
|
|
|
|
|
} |
|
407
|
110
|
|
|
|
|
605
|
$builder->add_slot_value( -desc => join( ' ', @desc ) ); |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
# we'll continue right here because DEFINITION |
|
410
|
|
|
|
|
|
|
# always comes at the top of the entry |
|
411
|
110
|
|
|
|
|
212
|
$buffer = $line; |
|
412
|
|
|
|
|
|
|
} |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# accession number (there can be multiple accessions) |
|
415
|
739
|
100
|
|
|
|
8817
|
if ($line =~ /^ACCESSION\s+(\S.*\S)/) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
416
|
110
|
|
|
|
|
511
|
push( @acc, split( /\s+/, $1 ) ); |
|
417
|
110
|
|
|
|
|
500
|
while ( defined( $line = $self->_readline ) ) { |
|
418
|
113
|
100
|
|
|
|
401
|
if ($line =~ /^\s+(.*)/) { |
|
419
|
3
|
|
|
|
|
15
|
push( @acc, split( /\s+/, $1 ) ); |
|
420
|
3
|
|
|
|
|
10
|
next; |
|
421
|
|
|
|
|
|
|
} |
|
422
|
110
|
|
|
|
|
135
|
last; |
|
423
|
|
|
|
|
|
|
} |
|
424
|
110
|
|
|
|
|
180
|
$buffer = $line; |
|
425
|
110
|
|
|
|
|
262
|
next; |
|
426
|
|
|
|
|
|
|
} |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# PID |
|
429
|
|
|
|
|
|
|
elsif ($line =~ /^PID\s+(\S+)/) { |
|
430
|
1
|
|
|
|
|
3
|
$params{'-pid'} = $1; |
|
431
|
|
|
|
|
|
|
} |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# Version number |
|
434
|
|
|
|
|
|
|
elsif ($line =~ /^VERSION\s+(\S.+)$/) { |
|
435
|
91
|
|
|
|
|
384
|
my ( $acc, $gi ) = split( ' ', $1 ); |
|
436
|
91
|
100
|
|
|
|
413
|
if ( $acc =~ /^\w+\.(\d+)/ ) { |
|
437
|
87
|
|
|
|
|
231
|
$params{'-version'} = $1; |
|
438
|
87
|
|
|
|
|
193
|
$params{'-seq_version'} = $1; |
|
439
|
|
|
|
|
|
|
} |
|
440
|
91
|
100
|
66
|
|
|
531
|
if ( $gi && ( index( $gi, "GI:" ) == 0 ) ) { |
|
441
|
89
|
|
|
|
|
301
|
$params{'-primary_id'} = substr( $gi, 3 ); |
|
442
|
|
|
|
|
|
|
} |
|
443
|
|
|
|
|
|
|
} |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# Keywords |
|
446
|
|
|
|
|
|
|
elsif ($line =~ /^KEYWORDS\s+(\S.*)/) { |
|
447
|
107
|
|
|
|
|
474
|
my @kw = split( /\s*\;\s*/, $1 ); |
|
448
|
107
|
|
|
|
|
294
|
while ( defined( $line = $self->_readline ) ) { |
|
449
|
138
|
|
|
|
|
241
|
chomp $line; |
|
450
|
138
|
100
|
|
|
|
398
|
if ($line =~ /^\s+(.*)/) { |
|
451
|
31
|
|
|
|
|
218
|
push( @kw, split( /\s*\;\s*/, $1 ) ); |
|
452
|
31
|
|
|
|
|
58
|
next; |
|
453
|
|
|
|
|
|
|
} |
|
454
|
107
|
|
|
|
|
350
|
last; |
|
455
|
|
|
|
|
|
|
} |
|
456
|
|
|
|
|
|
|
|
|
457
|
107
|
50
|
|
|
|
524
|
@kw && $kw[-1] =~ s/\.$//; |
|
458
|
107
|
|
|
|
|
251
|
$params{'-keywords'} = \@kw; |
|
459
|
107
|
|
|
|
|
152
|
$buffer = $line; |
|
460
|
107
|
|
|
|
|
309
|
next; |
|
461
|
|
|
|
|
|
|
} |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# Organism name and phylogenetic information |
|
464
|
|
|
|
|
|
|
elsif ($line =~ /^SOURCE\s+\S/) { |
|
465
|
106
|
100
|
|
|
|
334
|
if ( $builder->want_slot('species') ) { |
|
466
|
101
|
|
|
|
|
360
|
$species = $self->_read_GenBank_Species( \$buffer ); |
|
467
|
101
|
|
|
|
|
720
|
$builder->add_slot_value( -species => $species ); |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
else { |
|
470
|
5
|
|
|
|
|
10
|
while ( defined( $buffer = $self->_readline ) ) { |
|
471
|
18
|
100
|
|
|
|
35
|
last if substr( $buffer, 0, 1 ) ne ' '; |
|
472
|
|
|
|
|
|
|
} |
|
473
|
|
|
|
|
|
|
} |
|
474
|
106
|
|
|
|
|
493
|
next; |
|
475
|
|
|
|
|
|
|
} |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# References |
|
478
|
|
|
|
|
|
|
elsif ($line =~ /^REFERENCE\s+\S/) { |
|
479
|
109
|
100
|
|
|
|
276
|
if ($annotation) { |
|
480
|
100
|
|
|
|
|
440
|
my @refs = $self->_read_GenBank_References( \$buffer ); |
|
481
|
100
|
|
|
|
|
226
|
foreach my $ref (@refs) { |
|
482
|
417
|
|
|
|
|
989
|
$annotation->add_Annotation( 'reference', $ref ); |
|
483
|
|
|
|
|
|
|
} |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
else { |
|
486
|
9
|
|
|
|
|
14
|
while ( defined( $buffer = $self->_readline ) ) { |
|
487
|
47
|
100
|
|
|
|
92
|
last if substr( $buffer, 0, 1 ) ne ' '; |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
} |
|
490
|
109
|
|
|
|
|
337
|
next; |
|
491
|
|
|
|
|
|
|
} |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# Project |
|
494
|
|
|
|
|
|
|
elsif ($line =~ /^PROJECT\s+(\S.*)/) { |
|
495
|
2
|
50
|
|
|
|
7
|
if ($annotation) { |
|
496
|
2
|
|
|
|
|
22
|
my $project = |
|
497
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new( -value => $1 ); |
|
498
|
2
|
|
|
|
|
10
|
$annotation->add_Annotation( 'project', $project ); |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
} |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# Comments may be plain text or Structured Comments. |
|
503
|
|
|
|
|
|
|
# Structured Comments are made up of tag/value pairs and have beginning |
|
504
|
|
|
|
|
|
|
# and end delimiters like ##*-Data-START## and ##*-Data-END## |
|
505
|
|
|
|
|
|
|
elsif ($line =~ /^COMMENT\s+(\S.*)/) { |
|
506
|
60
|
100
|
|
|
|
123
|
if ($annotation) { |
|
507
|
58
|
|
|
|
|
159
|
my $comment = $1; |
|
508
|
58
|
|
|
|
|
180
|
while ( defined( $line = $self->_readline ) ) { |
|
509
|
481
|
100
|
|
|
|
1032
|
last if ($line =~ /^\S/); |
|
510
|
423
|
|
|
|
|
960
|
$comment .= $line; |
|
511
|
|
|
|
|
|
|
} |
|
512
|
58
|
|
|
|
|
635
|
$comment =~ s/ +/ /g; |
|
513
|
|
|
|
|
|
|
# Structured Comment, do not remove returns in the tabular section |
|
514
|
58
|
100
|
|
|
|
246
|
if ( my ( $text, $table )= $comment |
|
515
|
|
|
|
|
|
|
=~ /([^#]*)(##\S+Data-START##.+?##\S+Data-END##)/is |
|
516
|
|
|
|
|
|
|
) { |
|
517
|
2
|
100
|
|
|
|
13
|
$text =~ s/\n/ /g if $text; |
|
518
|
2
|
|
|
|
|
11
|
$table =~ s/START##/START##\n/; |
|
519
|
2
|
|
|
|
|
16
|
$table =~ s/^\s+//gm; |
|
520
|
2
|
|
|
|
|
10
|
$comment = $text . "\n" . $table; |
|
521
|
|
|
|
|
|
|
} |
|
522
|
|
|
|
|
|
|
# Plain text, remove returns |
|
523
|
|
|
|
|
|
|
else { |
|
524
|
56
|
|
|
|
|
317
|
$comment =~ s/\n/ /g; |
|
525
|
|
|
|
|
|
|
} |
|
526
|
58
|
|
|
|
|
614
|
$annotation->add_Annotation( |
|
527
|
|
|
|
|
|
|
'comment', |
|
528
|
|
|
|
|
|
|
Bio::Annotation::Comment->new( |
|
529
|
|
|
|
|
|
|
-text => $comment, |
|
530
|
|
|
|
|
|
|
-tagname => 'comment' |
|
531
|
|
|
|
|
|
|
) |
|
532
|
|
|
|
|
|
|
); |
|
533
|
58
|
|
|
|
|
343
|
$buffer = $line; |
|
534
|
|
|
|
|
|
|
} |
|
535
|
|
|
|
|
|
|
else { |
|
536
|
2
|
|
|
|
|
6
|
while ( defined( $buffer = $self->_readline ) ) { |
|
537
|
7
|
100
|
|
|
|
16
|
last if substr( $buffer, 0, 1 ) ne ' '; |
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
} |
|
540
|
60
|
|
|
|
|
145
|
next; |
|
541
|
|
|
|
|
|
|
} |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
# Corresponding Genbank nucleotide id, Genpept only |
|
544
|
|
|
|
|
|
|
elsif ($line =~ /^DB(?:SOURCE|LINK)\s+(\S.+)/) { |
|
545
|
20
|
50
|
|
|
|
59
|
if ($annotation) { |
|
546
|
20
|
|
|
|
|
49
|
my $dbsource = $1; |
|
547
|
20
|
|
|
|
|
64
|
while ( defined( $line = $self->_readline ) ) { |
|
548
|
85
|
100
|
|
|
|
196
|
last if ($line =~ /^\S/); |
|
549
|
65
|
|
|
|
|
138
|
$dbsource .= $line; |
|
550
|
|
|
|
|
|
|
} |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# deal with UniProKB dbsources |
|
553
|
20
|
100
|
|
|
|
99
|
if ( $dbsource =~ |
|
554
|
|
|
|
|
|
|
s/(UniProt(?:KB)?|swissprot):\s+locus\s+(\S+)\,.+\n// |
|
555
|
|
|
|
|
|
|
) { |
|
556
|
5
|
|
|
|
|
51
|
$annotation->add_Annotation( |
|
557
|
|
|
|
|
|
|
'dblink', |
|
558
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
559
|
|
|
|
|
|
|
-primary_id => $2, |
|
560
|
|
|
|
|
|
|
-database => $1, |
|
561
|
|
|
|
|
|
|
-tagname => 'dblink' |
|
562
|
|
|
|
|
|
|
) |
|
563
|
|
|
|
|
|
|
); |
|
564
|
5
|
50
|
|
|
|
47
|
if ( $dbsource =~ s/\s+created:\s+([^\.]+)\.\n// ) { |
|
565
|
5
|
|
|
|
|
43
|
$annotation->add_Annotation( |
|
566
|
|
|
|
|
|
|
'swissprot_dates', |
|
567
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new( |
|
568
|
|
|
|
|
|
|
-tagname => 'date_created', |
|
569
|
|
|
|
|
|
|
-value => $1 |
|
570
|
|
|
|
|
|
|
) |
|
571
|
|
|
|
|
|
|
); |
|
572
|
|
|
|
|
|
|
} |
|
573
|
5
|
|
|
|
|
63
|
while ( $dbsource =~ |
|
574
|
|
|
|
|
|
|
s/\s+(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g |
|
575
|
|
|
|
|
|
|
) { |
|
576
|
5
|
|
|
|
|
25
|
$annotation->add_Annotation( |
|
577
|
|
|
|
|
|
|
'swissprot_dates', |
|
578
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new( |
|
579
|
|
|
|
|
|
|
-tagname => 'date_updated', |
|
580
|
|
|
|
|
|
|
-value => $2 |
|
581
|
|
|
|
|
|
|
) |
|
582
|
|
|
|
|
|
|
); |
|
583
|
|
|
|
|
|
|
} |
|
584
|
5
|
|
|
|
|
39
|
$dbsource =~ s/\n/ /g; |
|
585
|
5
|
50
|
|
|
|
59
|
if ( $dbsource =~ |
|
|
|
0
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
s/\s+xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ |
|
587
|
|
|
|
|
|
|
) { |
|
588
|
|
|
|
|
|
|
# will use $i to determine even or odd |
|
589
|
|
|
|
|
|
|
# for swissprot the accessions are paired |
|
590
|
5
|
|
|
|
|
9
|
my $i = 0; |
|
591
|
5
|
|
|
|
|
44
|
for my $dbsrc ( split( /,\s+/, $1 ) ) { |
|
592
|
49
|
50
|
66
|
|
|
244
|
if ( $dbsrc =~ /(\S+)\.(\d+)/ |
|
593
|
|
|
|
|
|
|
or $dbsrc =~ /(\S+)/ |
|
594
|
|
|
|
|
|
|
) { |
|
595
|
49
|
|
|
|
|
108
|
my ( $id, $version ) = ( $1, $2 ); |
|
596
|
49
|
100
|
|
|
|
85
|
$version = '' unless defined $version; |
|
597
|
49
|
100
|
|
|
|
140
|
my $db = ( $id =~ /^\d\S{3}/ ) ? 'PDB' |
|
|
|
100
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
: ( $i++ % 2 ) ? 'GenPept' |
|
599
|
|
|
|
|
|
|
: 'GenBank'; |
|
600
|
|
|
|
|
|
|
|
|
601
|
49
|
|
|
|
|
184
|
$annotation->add_Annotation( |
|
602
|
|
|
|
|
|
|
'dblink', |
|
603
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
604
|
|
|
|
|
|
|
-primary_id => $id, |
|
605
|
|
|
|
|
|
|
-version => $version, |
|
606
|
|
|
|
|
|
|
-database => $db, |
|
607
|
|
|
|
|
|
|
-tagname => 'dblink' |
|
608
|
|
|
|
|
|
|
) |
|
609
|
|
|
|
|
|
|
); |
|
610
|
|
|
|
|
|
|
} |
|
611
|
|
|
|
|
|
|
} |
|
612
|
|
|
|
|
|
|
} |
|
613
|
|
|
|
|
|
|
elsif ( $dbsource =~ s/\s+xrefs:\s+(.+)\s+xrefs/xrefs/i ) { |
|
614
|
|
|
|
|
|
|
# download screwed up and ncbi didn't put acc in for gi numbers |
|
615
|
0
|
|
|
|
|
0
|
my $i = 0; |
|
616
|
0
|
|
|
|
|
0
|
for my $id ( split( /\,\s+/, $1 ) ) { |
|
617
|
0
|
|
|
|
|
0
|
my ( $acc, $db ); |
|
618
|
0
|
0
|
|
|
|
0
|
if ( $id =~ /gi:\s+(\d+)/ ) { |
|
|
|
0
|
|
|
|
|
|
|
619
|
0
|
|
|
|
|
0
|
$acc = $1; |
|
620
|
0
|
0
|
|
|
|
0
|
$db = ( $i++ % 2 ) ? 'GenPept' : 'GenBank'; |
|
621
|
|
|
|
|
|
|
} |
|
622
|
|
|
|
|
|
|
elsif ( $id =~ /pdb\s+accession\s+(\S+)/ ) { |
|
623
|
0
|
|
|
|
|
0
|
$acc = $1; |
|
624
|
0
|
|
|
|
|
0
|
$db = 'PDB'; |
|
625
|
|
|
|
|
|
|
} |
|
626
|
|
|
|
|
|
|
else { |
|
627
|
0
|
|
|
|
|
0
|
$acc = $id; |
|
628
|
0
|
|
|
|
|
0
|
$db = ''; |
|
629
|
|
|
|
|
|
|
} |
|
630
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation( |
|
631
|
|
|
|
|
|
|
'dblink', |
|
632
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
633
|
|
|
|
|
|
|
-primary_id => $acc, |
|
634
|
|
|
|
|
|
|
-database => $db, |
|
635
|
|
|
|
|
|
|
-tagname => 'dblink' |
|
636
|
|
|
|
|
|
|
) |
|
637
|
|
|
|
|
|
|
); |
|
638
|
|
|
|
|
|
|
} |
|
639
|
|
|
|
|
|
|
} |
|
640
|
|
|
|
|
|
|
else { |
|
641
|
0
|
|
|
|
|
0
|
$self->debug("Cannot match $dbsource\n"); |
|
642
|
|
|
|
|
|
|
} |
|
643
|
5
|
50
|
|
|
|
91
|
if ( $dbsource =~ s/xrefs\s+ |
|
644
|
|
|
|
|
|
|
\(non\-sequence\s+databases\):\s+ |
|
645
|
|
|
|
|
|
|
((?:\S+,\s+)+\S+)//x |
|
646
|
|
|
|
|
|
|
) { |
|
647
|
5
|
|
|
|
|
69
|
for my $id ( split( /\,\s+/, $1 ) ) { |
|
648
|
86
|
|
|
|
|
73
|
my $db; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
# this is because GenBank dropped the spaces!!! |
|
651
|
|
|
|
|
|
|
# I'm sure we're not going to get this right |
|
652
|
|
|
|
|
|
|
##if ( $id =~ s/^://i ) { |
|
653
|
|
|
|
|
|
|
## $db = $1; |
|
654
|
|
|
|
|
|
|
##} |
|
655
|
86
|
|
|
|
|
170
|
$db = substr( $id, 0, index( $id, ':' ) ); |
|
656
|
86
|
100
|
|
|
|
179
|
if ( not exists $DBSOURCE{$db} ) { |
|
657
|
16
|
|
|
|
|
18
|
$db = ''; # do we want 'GenBank' here? |
|
658
|
|
|
|
|
|
|
} |
|
659
|
86
|
|
|
|
|
109
|
$id = substr( $id, index( $id, ':' ) + 1 ); |
|
660
|
86
|
|
|
|
|
266
|
$annotation->add_Annotation( |
|
661
|
|
|
|
|
|
|
'dblink', |
|
662
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
663
|
|
|
|
|
|
|
-primary_id => $id, |
|
664
|
|
|
|
|
|
|
-database => $db, |
|
665
|
|
|
|
|
|
|
-tagname => 'dblink' |
|
666
|
|
|
|
|
|
|
) |
|
667
|
|
|
|
|
|
|
); |
|
668
|
|
|
|
|
|
|
} |
|
669
|
|
|
|
|
|
|
} |
|
670
|
|
|
|
|
|
|
} |
|
671
|
|
|
|
|
|
|
else { |
|
672
|
15
|
100
|
|
|
|
121
|
if ( $dbsource =~ |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
/^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ |
|
674
|
|
|
|
|
|
|
) { |
|
675
|
4
|
|
|
|
|
15
|
my ( $db, $id, $version ) = ( $1, $2, $3 ); |
|
676
|
4
|
|
100
|
|
|
35
|
$annotation->add_Annotation( |
|
677
|
|
|
|
|
|
|
'dblink', |
|
678
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
679
|
|
|
|
|
|
|
-primary_id => $id, |
|
680
|
|
|
|
|
|
|
-version => $version, |
|
681
|
|
|
|
|
|
|
-database => $db || 'GenBank', |
|
682
|
|
|
|
|
|
|
-tagname => 'dblink' |
|
683
|
|
|
|
|
|
|
) |
|
684
|
|
|
|
|
|
|
); |
|
685
|
|
|
|
|
|
|
} |
|
686
|
|
|
|
|
|
|
elsif ( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)/ ) { |
|
687
|
1
|
|
|
|
|
6
|
my ( $db, $id ) = ( $1, $2 ); |
|
688
|
1
|
|
50
|
|
|
13
|
$annotation->add_Annotation( |
|
689
|
|
|
|
|
|
|
'dblink', |
|
690
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
691
|
|
|
|
|
|
|
-primary_id => $id, |
|
692
|
|
|
|
|
|
|
-database => $db || 'GenBank', |
|
693
|
|
|
|
|
|
|
-tagname => 'dblink' |
|
694
|
|
|
|
|
|
|
) |
|
695
|
|
|
|
|
|
|
); |
|
696
|
|
|
|
|
|
|
} |
|
697
|
|
|
|
|
|
|
elsif ( $dbsource =~ /(\S+)([\.:])\s*(\S+)/ ) { |
|
698
|
10
|
|
|
|
|
16
|
my ( $db, $version ); |
|
699
|
10
|
|
|
|
|
21
|
my @ids = (); |
|
700
|
10
|
100
|
|
|
|
38
|
if ( $2 eq ':' ) { |
|
701
|
9
|
|
|
|
|
19
|
$db = $1; |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
# Genbank 192 release notes say this: "The second |
|
704
|
|
|
|
|
|
|
# field can consist of multiple comma-separated |
|
705
|
|
|
|
|
|
|
# identifiers, if a sequence record has multiple |
|
706
|
|
|
|
|
|
|
# DBLINK cross-references of a given type." |
|
707
|
|
|
|
|
|
|
# For example: DBLINK Project:100,200,300" |
|
708
|
9
|
|
|
|
|
31
|
@ids = split( /,/, $3 ); |
|
709
|
|
|
|
|
|
|
} |
|
710
|
|
|
|
|
|
|
else { |
|
711
|
1
|
|
|
|
|
4
|
( $db, $version ) = ( 'GenBank', $3 ); |
|
712
|
1
|
|
|
|
|
3
|
$ids[0] = $1; |
|
713
|
|
|
|
|
|
|
} |
|
714
|
|
|
|
|
|
|
|
|
715
|
10
|
|
|
|
|
25
|
foreach my $id (@ids) { |
|
716
|
12
|
|
|
|
|
117
|
$annotation->add_Annotation( |
|
717
|
|
|
|
|
|
|
'dblink', |
|
718
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
719
|
|
|
|
|
|
|
-primary_id => $id, |
|
720
|
|
|
|
|
|
|
-version => $version, |
|
721
|
|
|
|
|
|
|
-database => $db, |
|
722
|
|
|
|
|
|
|
-tagname => 'dblink' |
|
723
|
|
|
|
|
|
|
) |
|
724
|
|
|
|
|
|
|
); |
|
725
|
|
|
|
|
|
|
} |
|
726
|
|
|
|
|
|
|
} |
|
727
|
|
|
|
|
|
|
else { |
|
728
|
0
|
|
|
|
|
0
|
$self->warn( |
|
729
|
|
|
|
|
|
|
"Unrecognized DBSOURCE data: $dbsource\n"); |
|
730
|
|
|
|
|
|
|
} |
|
731
|
|
|
|
|
|
|
} |
|
732
|
20
|
|
|
|
|
48
|
$buffer = $line; |
|
733
|
|
|
|
|
|
|
} |
|
734
|
|
|
|
|
|
|
else { |
|
735
|
0
|
|
|
|
|
0
|
while ( defined( $buffer = $self->_readline ) ) { |
|
736
|
0
|
0
|
|
|
|
0
|
last if substr( $buffer, 0, 1 ) ne ' '; |
|
737
|
|
|
|
|
|
|
} |
|
738
|
|
|
|
|
|
|
} |
|
739
|
20
|
|
|
|
|
58
|
next; |
|
740
|
|
|
|
|
|
|
} |
|
741
|
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
# Exit at start of Feature table, or start of sequence |
|
743
|
227
|
100
|
|
|
|
1003
|
if ($line =~ /^(FEATURES|ORIGIN)/) { |
|
744
|
111
|
|
|
|
|
137
|
my $trap; |
|
745
|
|
|
|
|
|
|
} |
|
746
|
227
|
100
|
|
|
|
950
|
last if ($line =~ /^(FEATURES|ORIGIN)/); |
|
747
|
|
|
|
|
|
|
|
|
748
|
|
|
|
|
|
|
# Get next line and loop again |
|
749
|
116
|
|
|
|
|
325
|
$buffer = $self->_readline; |
|
750
|
|
|
|
|
|
|
} |
|
751
|
111
|
50
|
|
|
|
252
|
return unless defined $buffer; |
|
752
|
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
# add them all at once for efficiency |
|
754
|
111
|
|
|
|
|
892
|
$builder->add_slot_value( |
|
755
|
|
|
|
|
|
|
-accession_number => shift(@acc), |
|
756
|
|
|
|
|
|
|
-secondary_accessions => \@acc, |
|
757
|
|
|
|
|
|
|
%params |
|
758
|
|
|
|
|
|
|
); |
|
759
|
111
|
100
|
|
|
|
462
|
$builder->add_slot_value( -annotation => $annotation ) if $annotation; |
|
760
|
111
|
|
|
|
|
273
|
%params = (); # reset before possible re-use to avoid setting twice |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
# start over if we don't want to continue with this entry |
|
763
|
111
|
50
|
|
|
|
357
|
if ( not $builder->want_object ) { |
|
764
|
0
|
|
|
|
|
0
|
$builder->make_object; |
|
765
|
0
|
|
|
|
|
0
|
next RECORDSTART; |
|
766
|
|
|
|
|
|
|
} |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
# some "minimal" formats may not necessarily have a feature table |
|
769
|
111
|
100
|
66
|
|
|
347
|
if ( $builder->want_slot('features') |
|
|
|
|
100
|
|
|
|
|
|
770
|
|
|
|
|
|
|
and defined $buffer |
|
771
|
|
|
|
|
|
|
and $buffer =~ /^FEATURES/o |
|
772
|
|
|
|
|
|
|
) { |
|
773
|
|
|
|
|
|
|
# need to read the first line of the feature table |
|
774
|
104
|
|
|
|
|
363
|
$buffer = $self->_readline; |
|
775
|
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
# DO NOT read lines in the while condition -- this is done |
|
777
|
|
|
|
|
|
|
# as a side effect in _read_FTHelper_GenBank! |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
# part of new circular spec: |
|
780
|
|
|
|
|
|
|
# commented out for now until kinks worked out |
|
781
|
|
|
|
|
|
|
#my $sourceEnd = 0; |
|
782
|
|
|
|
|
|
|
#$sourceEnd = $2 if ($buffer =~ /(\d+?)\.\.(\d+?)$/); |
|
783
|
|
|
|
|
|
|
|
|
784
|
104
|
|
|
|
|
284
|
while ( defined $buffer ) { |
|
785
|
|
|
|
|
|
|
# check immediately -- not at the end of the loop |
|
786
|
|
|
|
|
|
|
# note: GenPept entries obviously do not have a BASE line |
|
787
|
8384
|
100
|
|
|
|
57132
|
last if ( $buffer =~ /^BASE|ORIGIN|CONTIG|WGS/o ); |
|
788
|
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
# slurp in one feature at a time -- at return, the start of |
|
790
|
|
|
|
|
|
|
# the next feature will have been read already, so we need |
|
791
|
|
|
|
|
|
|
# to pass a reference, and the called method must set this |
|
792
|
|
|
|
|
|
|
# to the last line read before returning |
|
793
|
8280
|
|
|
|
|
17235
|
my $ftunit = $self->_read_FTHelper_GenBank( \$buffer ); |
|
794
|
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
# implement new circular spec: features that cross the origin are now |
|
796
|
|
|
|
|
|
|
# seamless instead of being 2 separate joined features |
|
797
|
|
|
|
|
|
|
# commented out until kinks get worked out |
|
798
|
|
|
|
|
|
|
#if ((! $args{'-nojoin'}) && $ftunit->{'loc'} =~ /^join\((\d+?)\.\.(\d+?),(\d+?)..(\d+?)\)$/ |
|
799
|
|
|
|
|
|
|
#&& $sourceEnd == $2 && $3 == 1) { |
|
800
|
|
|
|
|
|
|
#my $start = $1; |
|
801
|
|
|
|
|
|
|
#my $end = $2 + $4; |
|
802
|
|
|
|
|
|
|
#$ftunit->{'loc'} = "$start..$end"; |
|
803
|
|
|
|
|
|
|
#} |
|
804
|
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
# fix suggested by James Diggans |
|
806
|
|
|
|
|
|
|
|
|
807
|
8280
|
50
|
|
|
|
11652
|
if ( not defined $ftunit ) { |
|
808
|
|
|
|
|
|
|
# GRRRR. We have fallen over. Try to recover |
|
809
|
|
|
|
|
|
|
$self->warn( "Unexpected error in feature table for " |
|
810
|
0
|
|
|
|
|
0
|
. $params{'-display_id'} |
|
811
|
|
|
|
|
|
|
. " Skipping feature, attempting to recover" ); |
|
812
|
|
|
|
|
|
|
|
|
813
|
0
|
0
|
0
|
|
|
0
|
unless ( $buffer =~ /^\s{5,5}\S+/o |
|
814
|
|
|
|
|
|
|
or $buffer =~ /^\S+/o |
|
815
|
|
|
|
|
|
|
) { |
|
816
|
0
|
|
|
|
|
0
|
$buffer = $self->_readline; |
|
817
|
|
|
|
|
|
|
} |
|
818
|
0
|
|
|
|
|
0
|
next; # back to reading FTHelpers |
|
819
|
|
|
|
|
|
|
} |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
# process ftunit |
|
822
|
8280
|
|
|
|
|
19705
|
my $feat = |
|
823
|
|
|
|
|
|
|
$ftunit->_generic_seqfeature( $self->location_factory, |
|
824
|
|
|
|
|
|
|
$display_id ); |
|
825
|
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
# add taxon_id from source if available |
|
827
|
8280
|
100
|
100
|
|
|
21029
|
if ( $species |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
828
|
|
|
|
|
|
|
and $feat->primary_tag eq 'source' |
|
829
|
|
|
|
|
|
|
and $feat->has_tag('db_xref') |
|
830
|
|
|
|
|
|
|
and ( not $species->ncbi_taxid |
|
831
|
|
|
|
|
|
|
or ( $species->ncbi_taxid |
|
832
|
|
|
|
|
|
|
and $species->ncbi_taxid =~ /^list/ ) ) |
|
833
|
|
|
|
|
|
|
) { |
|
834
|
95
|
|
|
|
|
298
|
foreach my $tagval ( $feat->get_tag_values('db_xref') ) { |
|
835
|
96
|
100
|
|
|
|
366
|
if ( index( $tagval, "taxon:" ) == 0 ) { |
|
836
|
95
|
|
|
|
|
339
|
$species->ncbi_taxid( substr( $tagval, 6 ) ); |
|
837
|
95
|
|
|
|
|
227
|
last; |
|
838
|
|
|
|
|
|
|
} |
|
839
|
|
|
|
|
|
|
} |
|
840
|
|
|
|
|
|
|
} |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# add feature to list of features |
|
843
|
8280
|
|
|
|
|
26341
|
push( @features, $feat ); |
|
844
|
|
|
|
|
|
|
} |
|
845
|
104
|
|
|
|
|
626
|
$builder->add_slot_value( -features => \@features ); |
|
846
|
|
|
|
|
|
|
} |
|
847
|
|
|
|
|
|
|
|
|
848
|
111
|
50
|
|
|
|
402
|
if ( defined $buffer ) { |
|
849
|
|
|
|
|
|
|
# CONTIG lines: TODO, this needs to be cleaned up |
|
850
|
111
|
100
|
|
|
|
1334
|
if ($buffer =~/^CONTIG\s+(.*)/o) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
851
|
7
|
|
|
|
|
23
|
my $ctg = $1; |
|
852
|
7
|
|
|
|
|
31
|
while ( defined( $buffer = $self->_readline ) ) { |
|
853
|
29
|
100
|
|
|
|
167
|
last if $buffer =~ m{^ORIGIN|//}o; |
|
854
|
22
|
|
|
|
|
57
|
$buffer =~ s/\s+(.*)/$1/; |
|
855
|
22
|
|
|
|
|
44
|
$ctg .= $buffer; |
|
856
|
|
|
|
|
|
|
} |
|
857
|
7
|
50
|
|
|
|
27
|
if ($ctg) { |
|
858
|
7
|
|
|
|
|
74
|
$annotation->add_Annotation( |
|
859
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new( |
|
860
|
|
|
|
|
|
|
-tagname => 'contig', |
|
861
|
|
|
|
|
|
|
-value => $ctg |
|
862
|
|
|
|
|
|
|
) |
|
863
|
|
|
|
|
|
|
); |
|
864
|
|
|
|
|
|
|
} |
|
865
|
|
|
|
|
|
|
} |
|
866
|
|
|
|
|
|
|
elsif ($buffer =~ /^WGS|WGS_SCAFLD\s+/o) { # catch WGS/WGS_SCAFLD lines |
|
867
|
1
|
|
|
|
|
12
|
while ( $buffer =~ s/(^WGS|WGS_SCAFLD)\s+// ) { # gulp lines |
|
868
|
3
|
|
|
|
|
8
|
chomp $buffer; |
|
869
|
3
|
|
|
|
|
35
|
$annotation->add_Annotation( |
|
870
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new( |
|
871
|
|
|
|
|
|
|
-value => $buffer, |
|
872
|
|
|
|
|
|
|
-tagname => lc $1 |
|
873
|
|
|
|
|
|
|
) |
|
874
|
|
|
|
|
|
|
); |
|
875
|
3
|
|
|
|
|
14
|
$buffer = $self->_readline; |
|
876
|
|
|
|
|
|
|
} |
|
877
|
|
|
|
|
|
|
} |
|
878
|
|
|
|
|
|
|
elsif ( $buffer !~ m{^ORIGIN|//}o ) { # advance to the sequence, if any |
|
879
|
53
|
|
|
|
|
196
|
while ( defined( $buffer = $self->_readline ) ) { |
|
880
|
148
|
100
|
|
|
|
557
|
last if $buffer =~ m{^(ORIGIN|//)}; |
|
881
|
|
|
|
|
|
|
} |
|
882
|
|
|
|
|
|
|
} |
|
883
|
|
|
|
|
|
|
} |
|
884
|
111
|
50
|
|
|
|
445
|
if ( not $builder->want_object ) { |
|
885
|
0
|
|
|
|
|
0
|
$builder->make_object; # implicit end-of-object |
|
886
|
0
|
|
|
|
|
0
|
next RECORDSTART; |
|
887
|
|
|
|
|
|
|
} |
|
888
|
111
|
100
|
33
|
|
|
356
|
if ( $builder->want_slot('seq') ) { |
|
|
|
50
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
# the fact that we want a sequence does not necessarily mean that |
|
890
|
|
|
|
|
|
|
# there also is a sequence ... |
|
891
|
101
|
100
|
100
|
|
|
839
|
if ( defined $buffer and $buffer =~ s/^ORIGIN\s+// ) { |
|
892
|
93
|
100
|
66
|
|
|
516
|
if ( $annotation and length($buffer) > 0 ) { |
|
893
|
1
|
|
|
|
|
10
|
$annotation->add_Annotation( |
|
894
|
|
|
|
|
|
|
'origin', |
|
895
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new( |
|
896
|
|
|
|
|
|
|
-tagname => 'origin', |
|
897
|
|
|
|
|
|
|
-value => $buffer |
|
898
|
|
|
|
|
|
|
) |
|
899
|
|
|
|
|
|
|
); |
|
900
|
|
|
|
|
|
|
} |
|
901
|
93
|
|
|
|
|
153
|
my $seqc = ''; |
|
902
|
93
|
|
|
|
|
294
|
while ( defined( $buffer = $self->_readline ) ) { |
|
903
|
32735
|
100
|
|
|
|
42408
|
last if $buffer =~ m{^//}; |
|
904
|
32643
|
|
|
|
|
29546
|
$buffer = uc $buffer; |
|
905
|
32643
|
|
|
|
|
206020
|
$buffer =~ s/[^A-Za-z]//g; |
|
906
|
32643
|
|
|
|
|
55632
|
$seqc .= $buffer; |
|
907
|
|
|
|
|
|
|
} |
|
908
|
|
|
|
|
|
|
|
|
909
|
93
|
|
|
|
|
443
|
$builder->add_slot_value( -seq => $seqc ); |
|
910
|
|
|
|
|
|
|
} |
|
911
|
|
|
|
|
|
|
} |
|
912
|
|
|
|
|
|
|
elsif ( defined($buffer) and ( substr( $buffer, 0, 2 ) ne '//' ) ) { |
|
913
|
|
|
|
|
|
|
# advance to the end of the record |
|
914
|
10
|
|
|
|
|
20
|
while ( defined( $buffer = $self->_readline ) ) { |
|
915
|
172
|
100
|
|
|
|
338
|
last if substr( $buffer, 0, 2 ) eq '//'; |
|
916
|
|
|
|
|
|
|
} |
|
917
|
|
|
|
|
|
|
} |
|
918
|
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
# Unlikely, but maybe the sequence is so weird that we don't want it |
|
920
|
|
|
|
|
|
|
# anymore. We don't want to return undef if the stream's not exhausted |
|
921
|
|
|
|
|
|
|
# yet. |
|
922
|
111
|
|
|
|
|
392
|
$seq = $builder->make_object; |
|
923
|
111
|
50
|
|
|
|
311
|
next RECORDSTART unless $seq; |
|
924
|
111
|
|
|
|
|
2687
|
last RECORDSTART; |
|
925
|
|
|
|
|
|
|
} # end while RECORDSTART |
|
926
|
|
|
|
|
|
|
|
|
927
|
111
|
|
|
|
|
533
|
return $seq; |
|
928
|
|
|
|
|
|
|
} |
|
929
|
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
=head2 write_seq |
|
931
|
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
Title : write_seq |
|
933
|
|
|
|
|
|
|
Usage : $stream->write_seq($seq) |
|
934
|
|
|
|
|
|
|
Function: writes the $seq object (must be seq) to the stream |
|
935
|
|
|
|
|
|
|
Returns : 1 for success and 0 for error |
|
936
|
|
|
|
|
|
|
Args : array of 1 to n Bio::SeqI objects |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
=cut |
|
939
|
|
|
|
|
|
|
|
|
940
|
|
|
|
|
|
|
sub write_seq { |
|
941
|
25
|
|
|
25
|
1
|
83
|
my ($self,@seqs) = @_; |
|
942
|
|
|
|
|
|
|
|
|
943
|
25
|
|
|
|
|
69
|
foreach my $seq ( @seqs ) { |
|
944
|
25
|
50
|
|
|
|
105
|
$self->throw("Attempting to write with no seq!") unless defined $seq; |
|
945
|
|
|
|
|
|
|
|
|
946
|
25
|
50
|
33
|
|
|
205
|
if ( not ref $seq or not $seq->isa('Bio::SeqI') ) { |
|
947
|
0
|
|
|
|
|
0
|
$self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); |
|
948
|
|
|
|
|
|
|
} |
|
949
|
|
|
|
|
|
|
|
|
950
|
25
|
|
|
|
|
123
|
my $str = $seq->seq; |
|
951
|
25
|
|
|
|
|
124
|
my $len = $seq->length; |
|
952
|
25
|
|
|
|
|
99
|
my $alpha = $seq->alphabet; |
|
953
|
|
|
|
|
|
|
|
|
954
|
25
|
|
|
|
|
41
|
my ($div, $mol); |
|
955
|
25
|
100
|
66
|
|
|
235
|
if ( not $seq->can('division') |
|
956
|
|
|
|
|
|
|
or not defined($div = $seq->division) |
|
957
|
|
|
|
|
|
|
) { |
|
958
|
2
|
|
|
|
|
4
|
$div = 'UNK'; |
|
959
|
|
|
|
|
|
|
} |
|
960
|
25
|
100
|
66
|
|
|
190
|
if ( not $seq->can('molecule') |
|
961
|
|
|
|
|
|
|
or not defined ($mol = $seq->molecule) |
|
962
|
|
|
|
|
|
|
) { |
|
963
|
2
|
|
100
|
|
|
10
|
$mol = $alpha || 'DNA'; |
|
964
|
|
|
|
|
|
|
} |
|
965
|
|
|
|
|
|
|
|
|
966
|
25
|
100
|
|
|
|
92
|
my $circular = ($seq->is_circular) ? 'circular' : 'linear '; |
|
967
|
|
|
|
|
|
|
|
|
968
|
25
|
|
|
|
|
164
|
local($^W) = 0; # supressing warnings about uninitialized fields. |
|
969
|
|
|
|
|
|
|
|
|
970
|
25
|
|
|
|
|
39
|
my $temp_line; |
|
971
|
25
|
50
|
|
|
|
87
|
if ( $self->_id_generation_func ) { |
|
972
|
0
|
|
|
|
|
0
|
$temp_line = &{$self->_id_generation_func}($seq); |
|
|
0
|
|
|
|
|
0
|
|
|
973
|
|
|
|
|
|
|
} |
|
974
|
|
|
|
|
|
|
else { |
|
975
|
25
|
|
|
|
|
44
|
my $date = ''; |
|
976
|
25
|
100
|
|
|
|
132
|
if ( $seq->can('get_dates') ) { |
|
977
|
23
|
|
|
|
|
71
|
($date) = $seq->get_dates; |
|
978
|
|
|
|
|
|
|
} |
|
979
|
|
|
|
|
|
|
|
|
980
|
25
|
50
|
|
|
|
100
|
$self->warn("No whitespace allowed in GenBank display id [". $seq->display_id. "]") |
|
981
|
|
|
|
|
|
|
if $seq->display_id =~ /\s/; |
|
982
|
|
|
|
|
|
|
|
|
983
|
25
|
100
|
|
|
|
127
|
my @data = ( lc($alpha) eq 'protein' ) ? ('aa', '', '') : ('bp', '', $mol); |
|
984
|
25
|
|
|
|
|
99
|
$temp_line = sprintf ("%-12s%-15s%13s %s%4s%-8s%-8s %3s %-s\n", |
|
985
|
|
|
|
|
|
|
'LOCUS', $seq->id, $len, |
|
986
|
|
|
|
|
|
|
@data, $circular, $div, $date); |
|
987
|
|
|
|
|
|
|
} |
|
988
|
|
|
|
|
|
|
|
|
989
|
25
|
|
|
|
|
244
|
$self->_print($temp_line); |
|
990
|
25
|
|
|
|
|
112
|
$self->_write_line_GenBank_regex("DEFINITION ", " ", |
|
991
|
|
|
|
|
|
|
$seq->desc, "\\s\+\|\$",80); |
|
992
|
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
# if there, write the accession line |
|
994
|
|
|
|
|
|
|
|
|
995
|
25
|
50
|
|
|
|
90
|
if ( $self->_ac_generation_func ) { |
|
996
|
0
|
|
|
|
|
0
|
$temp_line = &{$self->_ac_generation_func}($seq); |
|
|
0
|
|
|
|
|
0
|
|
|
997
|
0
|
|
|
|
|
0
|
$self->_print("ACCESSION $temp_line\n"); |
|
998
|
|
|
|
|
|
|
} |
|
999
|
|
|
|
|
|
|
else { |
|
1000
|
25
|
|
|
|
|
61
|
my @acc = (); |
|
1001
|
25
|
|
|
|
|
111
|
push @acc, $seq->accession_number; |
|
1002
|
25
|
100
|
|
|
|
131
|
if ( $seq->isa('Bio::Seq::RichSeqI') ) { |
|
1003
|
23
|
|
|
|
|
158
|
push @acc, $seq->get_secondary_accessions; |
|
1004
|
|
|
|
|
|
|
} |
|
1005
|
25
|
|
|
|
|
136
|
$self->_print("ACCESSION ", join(" ", @acc), "\n"); |
|
1006
|
|
|
|
|
|
|
# otherwise - cannot print |
|
1007
|
|
|
|
|
|
|
} |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
# if PID defined, print it |
|
1010
|
25
|
50
|
66
|
|
|
170
|
if ($seq->isa('Bio::Seq::RichSeqI') and $seq->pid) { |
|
1011
|
0
|
|
|
|
|
0
|
$self->_print("PID ", $seq->pid, "\n"); |
|
1012
|
|
|
|
|
|
|
} |
|
1013
|
|
|
|
|
|
|
|
|
1014
|
|
|
|
|
|
|
# if there, write the version line |
|
1015
|
25
|
50
|
100
|
|
|
86
|
if ( defined $self->_sv_generation_func ) { |
|
|
|
100
|
|
|
|
|
|
|
1016
|
0
|
|
|
|
|
0
|
$temp_line = &{$self->_sv_generation_func}($seq); |
|
|
0
|
|
|
|
|
0
|
|
|
1017
|
0
|
0
|
|
|
|
0
|
if ( $temp_line ) { |
|
1018
|
0
|
|
|
|
|
0
|
$self->_print("VERSION $temp_line\n"); |
|
1019
|
|
|
|
|
|
|
} |
|
1020
|
|
|
|
|
|
|
} |
|
1021
|
|
|
|
|
|
|
elsif ($seq->isa('Bio::Seq::RichSeqI') and defined($seq->seq_version)) { |
|
1022
|
18
|
|
|
|
|
68
|
my $id = $seq->primary_id; # this may be a GI number |
|
1023
|
18
|
50
|
33
|
|
|
213
|
my $data = (defined $id and $id =~ /^\d+$/) ? " GI:$id" : ""; |
|
1024
|
18
|
|
|
|
|
58
|
$self->_print("VERSION ", |
|
1025
|
|
|
|
|
|
|
$seq->accession_number, ".", |
|
1026
|
|
|
|
|
|
|
$seq->seq_version, $data, "\n"); |
|
1027
|
|
|
|
|
|
|
} |
|
1028
|
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
# if there, write the PROJECT line |
|
1030
|
25
|
|
|
|
|
78
|
for my $proj ( $seq->annotation->get_Annotations('project') ) { |
|
1031
|
2
|
|
|
|
|
11
|
$self->_print("PROJECT ".$proj->value."\n"); |
|
1032
|
|
|
|
|
|
|
} |
|
1033
|
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
# if there, write the DBSOURCE line |
|
1035
|
25
|
|
|
|
|
65
|
foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) { |
|
1036
|
2
|
|
|
|
|
11
|
my ($db, $id) = ($ref->database, $ref->primary_id); |
|
1037
|
2
|
100
|
|
|
|
9
|
my $prefix = $db eq 'Project' ? 'DBLINK' : 'DBSOURCE'; |
|
1038
|
2
|
100
|
|
|
|
13
|
my $text = $db eq 'GenBank' ? '' |
|
|
|
50
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
: $db eq 'Project' ? "$db:$id" |
|
1040
|
|
|
|
|
|
|
: "$db accession $id"; |
|
1041
|
2
|
|
|
|
|
13
|
$self->_print(sprintf ("%-11s %s\n", $prefix, $text)); |
|
1042
|
|
|
|
|
|
|
} |
|
1043
|
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
# if there, write the keywords line |
|
1045
|
25
|
50
|
|
|
|
82
|
if ( defined $self->_kw_generation_func ) { |
|
|
|
100
|
|
|
|
|
|
|
1046
|
0
|
|
|
|
|
0
|
$temp_line = &{$self->_kw_generation_func}($seq); |
|
|
0
|
|
|
|
|
0
|
|
|
1047
|
0
|
|
|
|
|
0
|
$self->_print("KEYWORDS $temp_line\n"); |
|
1048
|
|
|
|
|
|
|
} |
|
1049
|
|
|
|
|
|
|
elsif ( $seq->can('keywords') ) { |
|
1050
|
23
|
|
|
|
|
66
|
my $kw = $seq->keywords; |
|
1051
|
23
|
50
|
|
|
|
96
|
$kw .= '.' if ( $kw !~ /\.$/ ); |
|
1052
|
23
|
|
|
|
|
81
|
$self->_print("KEYWORDS $kw\n"); |
|
1053
|
|
|
|
|
|
|
} |
|
1054
|
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
# SEGMENT if it exists |
|
1056
|
25
|
|
|
|
|
79
|
foreach my $ref ( $seq->annotation->get_Annotations('segment') ) { |
|
1057
|
0
|
|
|
|
|
0
|
$self->_print(sprintf ("%-11s %s\n",'SEGMENT', |
|
1058
|
|
|
|
|
|
|
$ref->value)); |
|
1059
|
|
|
|
|
|
|
} |
|
1060
|
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
# Organism lines |
|
1062
|
25
|
100
|
|
|
|
85
|
if (my $spec = $seq->species) { |
|
1063
|
22
|
50
|
|
|
|
260
|
my ($on, $sn, $cn) = ($spec->can('organelle') ? $spec->organelle : '', |
|
1064
|
|
|
|
|
|
|
$spec->scientific_name, |
|
1065
|
|
|
|
|
|
|
$spec->common_name); |
|
1066
|
22
|
|
|
|
|
39
|
my @classification; |
|
1067
|
22
|
50
|
|
|
|
115
|
if ($spec->isa('Bio::Species')) { |
|
1068
|
22
|
|
|
|
|
78
|
@classification = $spec->classification; |
|
1069
|
22
|
|
|
|
|
46
|
shift @classification; |
|
1070
|
|
|
|
|
|
|
} |
|
1071
|
|
|
|
|
|
|
else { |
|
1072
|
|
|
|
|
|
|
# Bio::Taxon should have a DB handle of some type attached, so |
|
1073
|
|
|
|
|
|
|
# derive the classification from that |
|
1074
|
0
|
|
|
|
|
0
|
my $node = $spec; |
|
1075
|
0
|
|
|
|
|
0
|
while ($node) { |
|
1076
|
0
|
|
0
|
|
|
0
|
$node = $node->ancestor || last; |
|
1077
|
0
|
|
|
|
|
0
|
unshift @classification, $node->node_name; |
|
1078
|
|
|
|
|
|
|
#$node eq $root && last; |
|
1079
|
|
|
|
|
|
|
} |
|
1080
|
0
|
|
|
|
|
0
|
@classification = reverse @classification; |
|
1081
|
|
|
|
|
|
|
} |
|
1082
|
22
|
50
|
|
|
|
83
|
my $abname = $spec->name('abbreviated') ? # from genbank file |
|
1083
|
|
|
|
|
|
|
$spec->name('abbreviated')->[0] : $sn; |
|
1084
|
22
|
100
|
|
|
|
74
|
my $sl = $on ? "$on " : ''; |
|
1085
|
22
|
100
|
|
|
|
74
|
$sl .= $cn ? "$abname ($cn)" : $abname; |
|
1086
|
|
|
|
|
|
|
|
|
1087
|
22
|
|
|
|
|
85
|
$self->_write_line_GenBank_regex("SOURCE ", ' 'x12, $sl, "\\s\+\|\$", 80); |
|
1088
|
22
|
|
|
|
|
73
|
$self->_print(" ORGANISM ", $spec->scientific_name, "\n"); |
|
1089
|
22
|
|
|
|
|
106
|
my $OC = join('; ', reverse @classification) . '.'; |
|
1090
|
22
|
|
|
|
|
94
|
$self->_write_line_GenBank_regex(' 'x12,' 'x12, $OC, "\\s\+\|\$", 80); |
|
1091
|
|
|
|
|
|
|
} |
|
1092
|
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
# Reference lines |
|
1094
|
25
|
|
|
|
|
61
|
my $count = 1; |
|
1095
|
25
|
|
|
|
|
79
|
foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { |
|
1096
|
47
|
|
|
|
|
80
|
$temp_line = "REFERENCE $count"; |
|
1097
|
47
|
50
|
|
|
|
149
|
if ($ref->start) { |
|
|
|
0
|
|
|
|
|
|
|
1098
|
47
|
100
|
|
|
|
140
|
$temp_line .= sprintf (" (%s %d to %d)", |
|
1099
|
|
|
|
|
|
|
($seq->alphabet() eq "protein" ? |
|
1100
|
|
|
|
|
|
|
"residues" : "bases"), |
|
1101
|
|
|
|
|
|
|
$ref->start, $ref->end); |
|
1102
|
|
|
|
|
|
|
} |
|
1103
|
|
|
|
|
|
|
elsif ($ref->gb_reference) { |
|
1104
|
0
|
|
|
|
|
0
|
$temp_line .= sprintf (" (%s)", $ref->gb_reference); |
|
1105
|
|
|
|
|
|
|
} |
|
1106
|
47
|
|
|
|
|
164
|
$self->_print("$temp_line\n"); |
|
1107
|
47
|
|
|
|
|
122
|
$self->_write_line_GenBank_regex(" AUTHORS ", ' 'x12, |
|
1108
|
|
|
|
|
|
|
$ref->authors, "\\s\+\|\$", 80); |
|
1109
|
47
|
100
|
|
|
|
153
|
$self->_write_line_GenBank_regex(" CONSRTM ", ' 'x12, |
|
1110
|
|
|
|
|
|
|
$ref->consortium, "\\s\+\|\$", 80) if $ref->consortium; |
|
1111
|
47
|
|
|
|
|
142
|
$self->_write_line_GenBank_regex(" TITLE ", ' 'x12, |
|
1112
|
|
|
|
|
|
|
$ref->title, "\\s\+\|\$", 80); |
|
1113
|
47
|
|
|
|
|
134
|
$self->_write_line_GenBank_regex(" JOURNAL ", ' 'x12, |
|
1114
|
|
|
|
|
|
|
$ref->location, "\\s\+\|\$", 80); |
|
1115
|
47
|
100
|
|
|
|
120
|
if ( $ref->medline) { |
|
1116
|
5
|
|
|
|
|
18
|
$self->_write_line_GenBank_regex(" MEDLINE ", ' 'x12, |
|
1117
|
|
|
|
|
|
|
$ref->medline, "\\s\+\|\$", 80); |
|
1118
|
|
|
|
|
|
|
# I am assuming that pubmed entries only exist when there |
|
1119
|
|
|
|
|
|
|
# are also MEDLINE entries due to the indentation |
|
1120
|
|
|
|
|
|
|
} |
|
1121
|
|
|
|
|
|
|
# This could be a wrong assumption |
|
1122
|
47
|
100
|
|
|
|
128
|
if ( $ref->pubmed ) { |
|
1123
|
13
|
|
|
|
|
84
|
$self->_write_line_GenBank_regex(" PUBMED ", ' 'x12, |
|
1124
|
|
|
|
|
|
|
$ref->pubmed, "\\s\+\|\$", 80); |
|
1125
|
|
|
|
|
|
|
} |
|
1126
|
|
|
|
|
|
|
# put remark at the end |
|
1127
|
47
|
50
|
|
|
|
158
|
if ($ref->comment) { |
|
1128
|
0
|
|
|
|
|
0
|
$self->_write_line_GenBank_regex(" REMARK ", ' 'x12, |
|
1129
|
|
|
|
|
|
|
$ref->comment, "\\s\+\|\$", 80); |
|
1130
|
|
|
|
|
|
|
} |
|
1131
|
47
|
|
|
|
|
84
|
$count++; |
|
1132
|
|
|
|
|
|
|
} |
|
1133
|
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
# Comment lines |
|
1135
|
25
|
|
|
|
|
155
|
foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { |
|
1136
|
13
|
|
|
|
|
39
|
$self->_write_line_GenBank_regex("COMMENT ", ' 'x12, |
|
1137
|
|
|
|
|
|
|
$comment->text, "\\s\+\|\$", 80); |
|
1138
|
|
|
|
|
|
|
} |
|
1139
|
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
# FEATURES section |
|
1141
|
25
|
|
|
|
|
74
|
$self->_print("FEATURES Location/Qualifiers\n"); |
|
1142
|
|
|
|
|
|
|
|
|
1143
|
25
|
50
|
|
|
|
70
|
if ( defined $self->_post_sort ) { |
|
1144
|
|
|
|
|
|
|
# we need to read things into an array. Process. Sort them. Print 'em |
|
1145
|
0
|
|
|
|
|
0
|
my $post_sort_func = $self->_post_sort; |
|
1146
|
0
|
|
|
|
|
0
|
my @fth; |
|
1147
|
|
|
|
|
|
|
|
|
1148
|
0
|
|
|
|
|
0
|
foreach my $sf ( $seq->top_SeqFeatures ) { |
|
1149
|
0
|
|
|
|
|
0
|
push @fth, Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq); |
|
1150
|
|
|
|
|
|
|
} |
|
1151
|
|
|
|
|
|
|
|
|
1152
|
0
|
|
|
|
|
0
|
@fth = sort { &$post_sort_func($a, $b) } @fth; |
|
|
0
|
|
|
|
|
0
|
|
|
1153
|
|
|
|
|
|
|
|
|
1154
|
0
|
|
|
|
|
0
|
foreach my $fth ( @fth ) { |
|
1155
|
0
|
|
|
|
|
0
|
$self->_print_GenBank_FTHelper($fth); |
|
1156
|
|
|
|
|
|
|
} |
|
1157
|
|
|
|
|
|
|
} |
|
1158
|
|
|
|
|
|
|
else { |
|
1159
|
|
|
|
|
|
|
# not post sorted. And so we can print as we get them. |
|
1160
|
|
|
|
|
|
|
# lower memory load... |
|
1161
|
25
|
|
|
|
|
110
|
foreach my $sf ( $seq->top_SeqFeatures ) { |
|
1162
|
430
|
|
|
|
|
1016
|
my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf, $seq); |
|
1163
|
430
|
|
|
|
|
444
|
foreach my $fth ( @fth ) { |
|
1164
|
430
|
50
|
|
|
|
1088
|
if ( ! $fth->isa('Bio::SeqIO::FTHelper') ) { |
|
1165
|
0
|
|
|
|
|
0
|
$sf->throw("Cannot process FTHelper... $fth"); |
|
1166
|
|
|
|
|
|
|
} |
|
1167
|
430
|
|
|
|
|
841
|
$self->_print_GenBank_FTHelper($fth); |
|
1168
|
|
|
|
|
|
|
} |
|
1169
|
|
|
|
|
|
|
} |
|
1170
|
|
|
|
|
|
|
} |
|
1171
|
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
# deal with WGS; WGS_SCAFLD present only if WGS is also present |
|
1173
|
25
|
50
|
|
|
|
180
|
if ($seq->annotation->get_Annotations('wgs')) { |
|
1174
|
0
|
|
|
|
|
0
|
foreach my $wgs (map {$seq->annotation->get_Annotations($_)} |
|
|
0
|
|
|
|
|
0
|
|
|
1175
|
|
|
|
|
|
|
qw(wgs wgs_scaffold) |
|
1176
|
|
|
|
|
|
|
) { |
|
1177
|
0
|
|
|
|
|
0
|
$self->_print(sprintf ("%-11s %s\n", |
|
1178
|
|
|
|
|
|
|
uc($wgs->tagname), |
|
1179
|
|
|
|
|
|
|
$wgs->value)); |
|
1180
|
|
|
|
|
|
|
} |
|
1181
|
0
|
|
|
|
|
0
|
$self->_show_dna(0); |
|
1182
|
|
|
|
|
|
|
} |
|
1183
|
25
|
100
|
|
|
|
80
|
if ($seq->annotation->get_Annotations('contig')) { |
|
1184
|
1
|
|
|
|
|
4
|
my $ct = 0; |
|
1185
|
1
|
|
|
|
|
2
|
my $cline; |
|
1186
|
1
|
|
|
|
|
4
|
foreach my $contig ($seq->annotation->get_Annotations('contig')) { |
|
1187
|
1
|
50
|
|
|
|
5
|
unless ($ct) { |
|
1188
|
1
|
|
|
|
|
6
|
$cline = uc($contig->tagname) . " " . $contig->value . "\n"; |
|
1189
|
|
|
|
|
|
|
} |
|
1190
|
|
|
|
|
|
|
else { |
|
1191
|
0
|
|
|
|
|
0
|
$cline = " " . $contig->value . "\n"; |
|
1192
|
|
|
|
|
|
|
} |
|
1193
|
1
|
|
|
|
|
5
|
$self->_print($cline); |
|
1194
|
1
|
|
|
|
|
2
|
$ct++; |
|
1195
|
|
|
|
|
|
|
} |
|
1196
|
|
|
|
|
|
|
} |
|
1197
|
25
|
100
|
|
|
|
93
|
if ( $seq->length == 0 ) { |
|
1198
|
1
|
|
|
|
|
2
|
$self->_show_dna(0); |
|
1199
|
|
|
|
|
|
|
} |
|
1200
|
|
|
|
|
|
|
|
|
1201
|
25
|
100
|
|
|
|
83
|
if ( $self->_show_dna == 0 ) { |
|
1202
|
1
|
|
|
|
|
2
|
$self->_print("\n//\n"); |
|
1203
|
1
|
|
|
|
|
19
|
return; |
|
1204
|
|
|
|
|
|
|
} |
|
1205
|
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
# finished printing features. |
|
1207
|
|
|
|
|
|
|
|
|
1208
|
24
|
|
|
|
|
382
|
$str =~ tr/A-Z/a-z/; |
|
1209
|
|
|
|
|
|
|
|
|
1210
|
24
|
|
|
|
|
84
|
my ($o) = $seq->annotation->get_Annotations('origin'); |
|
1211
|
24
|
50
|
|
|
|
160
|
$self->_print(sprintf("%-12s%s\n", |
|
1212
|
|
|
|
|
|
|
'ORIGIN', $o ? $o->value : '')); |
|
1213
|
|
|
|
|
|
|
# print out the sequence |
|
1214
|
24
|
|
|
|
|
45
|
my $nuc = 60; # Number of nucleotides per line |
|
1215
|
24
|
|
|
|
|
84
|
my $whole_pat = 'a10' x 6; # Pattern for unpacking a whole line |
|
1216
|
24
|
|
|
|
|
38
|
my $out_pat = 'A11' x 6; # Pattern for packing a line |
|
1217
|
24
|
|
|
|
|
37
|
my $length = length $str; |
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
# Calculate the number of nucleotides which fit on whole lines |
|
1220
|
24
|
|
|
|
|
102
|
my $whole = int($length / $nuc) * $nuc; |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
# Print the whole lines |
|
1223
|
24
|
|
|
|
|
32
|
my $i; |
|
1224
|
24
|
|
|
|
|
99
|
for ($i = 0; $i < $whole; $i += $nuc) { |
|
1225
|
1953
|
|
|
|
|
6419
|
my $blocks = pack $out_pat, |
|
1226
|
|
|
|
|
|
|
unpack $whole_pat, |
|
1227
|
|
|
|
|
|
|
substr($str, $i, $nuc); |
|
1228
|
1953
|
|
|
|
|
2642
|
chop $blocks; |
|
1229
|
1953
|
|
|
|
|
5236
|
$self->_print(sprintf("%9d $blocks\n", $i + $nuc - 59)); |
|
1230
|
|
|
|
|
|
|
} |
|
1231
|
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
# Print the last line |
|
1233
|
24
|
100
|
|
|
|
88
|
if (my $last = substr($str, $i)) { |
|
1234
|
19
|
|
|
|
|
33
|
my $last_len = length($last); |
|
1235
|
19
|
|
|
|
|
111
|
my $last_pat = 'a10' x int($last_len / 10) |
|
1236
|
|
|
|
|
|
|
. 'a' . $last_len % 10; |
|
1237
|
19
|
|
|
|
|
93
|
my $blocks = pack $out_pat, |
|
1238
|
|
|
|
|
|
|
unpack($last_pat, $last); |
|
1239
|
19
|
|
|
|
|
116
|
$blocks =~ s/ +$//; |
|
1240
|
19
|
|
|
|
|
176
|
$self->_print(sprintf("%9d $blocks\n", |
|
1241
|
|
|
|
|
|
|
$length - $last_len + 1)); |
|
1242
|
|
|
|
|
|
|
} |
|
1243
|
|
|
|
|
|
|
|
|
1244
|
24
|
|
|
|
|
65
|
$self->_print("//\n"); |
|
1245
|
|
|
|
|
|
|
|
|
1246
|
24
|
50
|
33
|
|
|
81
|
$self->flush if $self->_flush_on_write && defined $self->_fh; |
|
1247
|
24
|
|
|
|
|
190
|
return 1; |
|
1248
|
|
|
|
|
|
|
} |
|
1249
|
|
|
|
|
|
|
} |
|
1250
|
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
=head2 _print_GenBank_FTHelper |
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
Title : _print_GenBank_FTHelper |
|
1254
|
|
|
|
|
|
|
Usage : |
|
1255
|
|
|
|
|
|
|
Function: |
|
1256
|
|
|
|
|
|
|
Example : |
|
1257
|
|
|
|
|
|
|
Returns : |
|
1258
|
|
|
|
|
|
|
Args : |
|
1259
|
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
=cut |
|
1261
|
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
sub _print_GenBank_FTHelper { |
|
1263
|
430
|
|
|
430
|
|
383
|
my ( $self, $fth ) = @_; |
|
1264
|
|
|
|
|
|
|
|
|
1265
|
430
|
50
|
33
|
|
|
1582
|
if ( not ref $fth or not $fth->isa('Bio::SeqIO::FTHelper') ) { |
|
1266
|
0
|
|
|
|
|
0
|
$fth->warn( |
|
1267
|
|
|
|
|
|
|
"$fth is not a FTHelper class. Attempting to print but there could be issues" |
|
1268
|
|
|
|
|
|
|
); |
|
1269
|
|
|
|
|
|
|
} |
|
1270
|
|
|
|
|
|
|
|
|
1271
|
430
|
50
|
|
|
|
680
|
my $spacer = ( length $fth->key >= 15 ) ? ' ' : ''; |
|
1272
|
430
|
|
|
|
|
783
|
$self->_write_line_GenBank_regex( |
|
1273
|
|
|
|
|
|
|
sprintf( " %-16s%s", $fth->key, $spacer ), |
|
1274
|
|
|
|
|
|
|
" " x 21, $fth->loc, "\,\|\$", 80 ); |
|
1275
|
|
|
|
|
|
|
|
|
1276
|
430
|
|
|
|
|
561
|
foreach my $tag ( keys %{ $fth->field } ) { |
|
|
430
|
|
|
|
|
829
|
|
|
1277
|
|
|
|
|
|
|
# Account for hash structure in Annotation::DBLink, not the expected array |
|
1278
|
997
|
50
|
66
|
|
|
2208
|
if ( $tag eq 'db_xref' and grep /HASH/, @{ $fth->field->{$tag} }) { |
|
|
140
|
|
|
|
|
278
|
|
|
1279
|
0
|
|
|
|
|
0
|
for my $ref ( @{ $fth->field->{$tag} } ) { |
|
|
0
|
|
|
|
|
0
|
|
|
1280
|
0
|
|
|
|
|
0
|
my $db = $ref->{'database'}; |
|
1281
|
0
|
|
|
|
|
0
|
my $id = $ref->{'primary_id'}; |
|
1282
|
0
|
|
|
|
|
0
|
$self->_write_line_GenBank_regex |
|
1283
|
|
|
|
|
|
|
( " " x 21, " " x 21, |
|
1284
|
|
|
|
|
|
|
"/$tag=\"$db:$id\"", "\.\|\$", 80 ); |
|
1285
|
|
|
|
|
|
|
} |
|
1286
|
|
|
|
|
|
|
} |
|
1287
|
|
|
|
|
|
|
# The usual case, where all values are found in an array |
|
1288
|
|
|
|
|
|
|
else { |
|
1289
|
997
|
|
|
|
|
841
|
foreach my $value ( @{ $fth->field->{$tag} } ) { |
|
|
997
|
|
|
|
|
1548
|
|
|
1290
|
1085
|
|
|
|
|
1379
|
$value =~ s/\"/\"\"/g; |
|
1291
|
1085
|
50
|
100
|
|
|
3357
|
if ( $value eq "_no_value" ) { |
|
|
|
100
|
|
|
|
|
|
|
1292
|
0
|
|
|
|
|
0
|
$self->_write_line_GenBank_regex |
|
1293
|
|
|
|
|
|
|
( " " x 21, " " x 21, |
|
1294
|
|
|
|
|
|
|
"/$tag", "\.\|\$", 80 ); |
|
1295
|
|
|
|
|
|
|
} |
|
1296
|
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
# There are almost 3x more quoted qualifier values and they |
|
1298
|
|
|
|
|
|
|
# are more common too so we take quoted ones first. |
|
1299
|
|
|
|
|
|
|
# Long qualifiers, that will be line wrapped, are always quoted |
|
1300
|
|
|
|
|
|
|
elsif ( not $FTQUAL_NO_QUOTE{$tag} |
|
1301
|
|
|
|
|
|
|
or length("/$tag=$value") >= $FTQUAL_LINE_LENGTH |
|
1302
|
|
|
|
|
|
|
) { |
|
1303
|
976
|
100
|
|
|
|
2213
|
my ($pat) = ( $value =~ /\s/ ? '\s|$' : '.|$' ); |
|
1304
|
976
|
|
|
|
|
2831
|
$self->_write_line_GenBank_regex |
|
1305
|
|
|
|
|
|
|
( " " x 21, " " x 21, |
|
1306
|
|
|
|
|
|
|
"/$tag=\"$value\"", $pat, 80 ); |
|
1307
|
|
|
|
|
|
|
} |
|
1308
|
|
|
|
|
|
|
else { |
|
1309
|
109
|
|
|
|
|
357
|
$self->_write_line_GenBank_regex |
|
1310
|
|
|
|
|
|
|
( " " x 21, " " x 21, |
|
1311
|
|
|
|
|
|
|
"/$tag=$value", "\.\|\$", 80 ); |
|
1312
|
|
|
|
|
|
|
} |
|
1313
|
|
|
|
|
|
|
} |
|
1314
|
|
|
|
|
|
|
} |
|
1315
|
|
|
|
|
|
|
} |
|
1316
|
|
|
|
|
|
|
} |
|
1317
|
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
=head2 _read_GenBank_References |
|
1319
|
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
Title : _read_GenBank_References |
|
1321
|
|
|
|
|
|
|
Usage : |
|
1322
|
|
|
|
|
|
|
Function: Reads references from GenBank format. Internal function really |
|
1323
|
|
|
|
|
|
|
Returns : |
|
1324
|
|
|
|
|
|
|
Args : |
|
1325
|
|
|
|
|
|
|
|
|
1326
|
|
|
|
|
|
|
=cut |
|
1327
|
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
sub _read_GenBank_References { |
|
1329
|
100
|
|
|
100
|
|
159
|
my ($self, $buffer) = @_; |
|
1330
|
100
|
|
|
|
|
134
|
my (@refs); |
|
1331
|
|
|
|
|
|
|
my $ref; |
|
1332
|
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
# assumme things are starting with RN |
|
1334
|
100
|
50
|
|
|
|
504
|
if ( $$buffer !~ /^REFERENCE/ ) { |
|
1335
|
0
|
|
|
|
|
0
|
warn("Not parsing line '$$buffer' which maybe important"); |
|
1336
|
|
|
|
|
|
|
} |
|
1337
|
|
|
|
|
|
|
|
|
1338
|
100
|
|
|
|
|
159
|
my $line = $$buffer; |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
100
|
|
|
|
|
127
|
my (@title,@loc,@authors,@consort,@com,@medline,@pubmed); |
|
1341
|
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
REFLOOP: |
|
1343
|
100
|
|
66
|
|
|
416
|
while( defined($line) or defined($line = $self->_readline) ) { |
|
1344
|
1379
|
100
|
|
|
|
3196
|
if ($line =~ /^\s{2}AUTHORS\s+(.*)/o) { |
|
1345
|
402
|
|
|
|
|
1197
|
push @authors, $1; |
|
1346
|
402
|
|
|
|
|
780
|
while ( defined($line = $self->_readline) ) { |
|
1347
|
1040
|
100
|
|
|
|
2410
|
if ($line =~ /^\s{9,}(.*)/o) { |
|
1348
|
638
|
|
|
|
|
975
|
push @authors, $1; |
|
1349
|
638
|
|
|
|
|
1066
|
next; |
|
1350
|
|
|
|
|
|
|
} |
|
1351
|
402
|
|
|
|
|
398
|
last; |
|
1352
|
|
|
|
|
|
|
} |
|
1353
|
402
|
|
|
|
|
1622
|
$ref->authors(join(' ', @authors)); |
|
1354
|
|
|
|
|
|
|
} |
|
1355
|
|
|
|
|
|
|
|
|
1356
|
1379
|
100
|
|
|
|
2132
|
if ($line =~ /^\s{2}CONSRTM\s+(.*)/o) { |
|
1357
|
35
|
|
|
|
|
298
|
push @consort, $1; |
|
1358
|
35
|
|
|
|
|
86
|
while ( defined($line = $self->_readline) ) { |
|
1359
|
36
|
100
|
|
|
|
108
|
if ($line =~ /^\s{9,}(.*)/o) { |
|
1360
|
1
|
|
|
|
|
4
|
push @consort, $1; |
|
1361
|
1
|
|
|
|
|
3
|
next; |
|
1362
|
|
|
|
|
|
|
} |
|
1363
|
35
|
|
|
|
|
45
|
last; |
|
1364
|
|
|
|
|
|
|
} |
|
1365
|
35
|
|
|
|
|
157
|
$ref->consortium(join(' ', @consort)); |
|
1366
|
|
|
|
|
|
|
} |
|
1367
|
|
|
|
|
|
|
|
|
1368
|
1379
|
100
|
|
|
|
2622
|
if ($line =~ /^\s{2}TITLE\s+(.*)/o) { |
|
1369
|
408
|
|
|
|
|
729
|
push @title, $1; |
|
1370
|
408
|
|
|
|
|
781
|
while ( defined($line = $self->_readline) ) { |
|
1371
|
693
|
100
|
|
|
|
1756
|
if ($line =~ /^\s{9,}(.*)/o) { |
|
1372
|
285
|
|
|
|
|
477
|
push @title, $1; |
|
1373
|
285
|
|
|
|
|
550
|
next; |
|
1374
|
|
|
|
|
|
|
} |
|
1375
|
408
|
|
|
|
|
478
|
last; |
|
1376
|
|
|
|
|
|
|
} |
|
1377
|
408
|
|
|
|
|
1651
|
$ref->title(join(' ', @title)); |
|
1378
|
|
|
|
|
|
|
} |
|
1379
|
|
|
|
|
|
|
|
|
1380
|
1379
|
100
|
|
|
|
2685
|
if ($line =~ /^\s{2}JOURNAL\s+(.*)/o) { |
|
1381
|
416
|
|
|
|
|
751
|
push @loc, $1; |
|
1382
|
416
|
|
|
|
|
761
|
while ( defined($line = $self->_readline) ) { |
|
1383
|
|
|
|
|
|
|
# we only match when there are at least 4 spaces |
|
1384
|
|
|
|
|
|
|
# there is probably a better way to match this |
|
1385
|
|
|
|
|
|
|
# as it assumes that the describing tag is short enough |
|
1386
|
565
|
100
|
|
|
|
1349
|
if ($line =~ /^\s{9,}(.*)/o) { |
|
1387
|
149
|
|
|
|
|
526
|
push @loc, $1; |
|
1388
|
149
|
|
|
|
|
279
|
next; |
|
1389
|
|
|
|
|
|
|
} |
|
1390
|
416
|
|
|
|
|
392
|
last; |
|
1391
|
|
|
|
|
|
|
} |
|
1392
|
416
|
|
|
|
|
1500
|
$ref->location(join(' ', @loc)); |
|
1393
|
416
|
|
|
|
|
528
|
redo REFLOOP; |
|
1394
|
|
|
|
|
|
|
} |
|
1395
|
|
|
|
|
|
|
|
|
1396
|
963
|
100
|
|
|
|
1476
|
if ($line =~ /^\s{2}REMARK\s+(.*)/o) { |
|
1397
|
59
|
|
|
|
|
102
|
push @com, $1; |
|
1398
|
59
|
|
|
|
|
103
|
while ( defined($line = $self->_readline) ) { |
|
1399
|
83
|
100
|
|
|
|
190
|
if ($line =~ /^\s{9,}(.*)/o) { |
|
1400
|
24
|
|
|
|
|
46
|
push @com, $1; |
|
1401
|
24
|
|
|
|
|
44
|
next; |
|
1402
|
|
|
|
|
|
|
} |
|
1403
|
59
|
|
|
|
|
56
|
last; |
|
1404
|
|
|
|
|
|
|
} |
|
1405
|
59
|
|
|
|
|
188
|
$ref->comment(join(' ', @com)); |
|
1406
|
59
|
|
|
|
|
61
|
redo REFLOOP; |
|
1407
|
|
|
|
|
|
|
} |
|
1408
|
|
|
|
|
|
|
|
|
1409
|
904
|
100
|
|
|
|
1547
|
if ( $line =~ /^\s{2}MEDLINE\s+(.*)/ ) { |
|
1410
|
130
|
|
|
|
|
293
|
push @medline, $1; |
|
1411
|
130
|
|
|
|
|
277
|
while ( defined($line = $self->_readline) ) { |
|
1412
|
130
|
50
|
|
|
|
368
|
if ($line =~ /^\s{9,}(.*)/) { |
|
1413
|
0
|
|
|
|
|
0
|
push @medline, $1; |
|
1414
|
0
|
|
|
|
|
0
|
next; |
|
1415
|
|
|
|
|
|
|
} |
|
1416
|
130
|
|
|
|
|
147
|
last; |
|
1417
|
|
|
|
|
|
|
} |
|
1418
|
130
|
|
|
|
|
392
|
$ref->medline(join(' ', @medline)); |
|
1419
|
130
|
|
|
|
|
150
|
redo REFLOOP; |
|
1420
|
|
|
|
|
|
|
} |
|
1421
|
|
|
|
|
|
|
|
|
1422
|
774
|
100
|
|
|
|
1568
|
if ( $line =~ /^\s{3}PUBMED\s+(.*)/ ) { |
|
1423
|
237
|
|
|
|
|
531
|
push @pubmed, $1; |
|
1424
|
237
|
|
|
|
|
502
|
while ( defined($line = $self->_readline) ) { |
|
1425
|
237
|
50
|
|
|
|
582
|
if ($line =~ /^\s{9,}(.*)/) { |
|
1426
|
0
|
|
|
|
|
0
|
push @pubmed, $1; |
|
1427
|
0
|
|
|
|
|
0
|
next; |
|
1428
|
|
|
|
|
|
|
} |
|
1429
|
237
|
|
|
|
|
234
|
last; |
|
1430
|
|
|
|
|
|
|
} |
|
1431
|
237
|
|
|
|
|
644
|
$ref->pubmed(join(' ', @pubmed)); |
|
1432
|
237
|
|
|
|
|
245
|
redo REFLOOP; |
|
1433
|
|
|
|
|
|
|
} |
|
1434
|
|
|
|
|
|
|
|
|
1435
|
537
|
100
|
|
|
|
1196
|
if ( $line =~ /^REFERENCE/o ) { |
|
1436
|
|
|
|
|
|
|
# store current reference |
|
1437
|
417
|
100
|
|
|
|
1058
|
$self->_add_ref_to_array(\@refs,$ref) if defined $ref; |
|
1438
|
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
# reset |
|
1440
|
417
|
|
|
|
|
661
|
@authors = (); |
|
1441
|
417
|
|
|
|
|
457
|
@title = (); |
|
1442
|
417
|
|
|
|
|
487
|
@loc = (); |
|
1443
|
417
|
|
|
|
|
399
|
@com = (); |
|
1444
|
417
|
|
|
|
|
378
|
@pubmed = (); |
|
1445
|
417
|
|
|
|
|
347
|
@medline = (); |
|
1446
|
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
# create the new reference object |
|
1448
|
417
|
|
|
|
|
1892
|
$ref = Bio::Annotation::Reference->new(-tagname => 'reference'); |
|
1449
|
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
# check whether start and end base is given |
|
1451
|
417
|
100
|
|
|
|
1913
|
if ($line =~ /^REFERENCE\s+\d+\s+\([a-z]+ (\d+) to (\d+)\)/){ |
|
|
|
100
|
|
|
|
|
|
|
1452
|
286
|
|
|
|
|
720
|
$ref->start($1); |
|
1453
|
286
|
|
|
|
|
550
|
$ref->end($2); |
|
1454
|
|
|
|
|
|
|
} |
|
1455
|
|
|
|
|
|
|
elsif ($line =~ /^REFERENCE\s+\d+\s+\((.*)\)/) { |
|
1456
|
14
|
|
|
|
|
48
|
$ref->gb_reference($1); |
|
1457
|
|
|
|
|
|
|
} |
|
1458
|
|
|
|
|
|
|
} |
|
1459
|
|
|
|
|
|
|
|
|
1460
|
537
|
100
|
|
|
|
2761
|
last if ($line =~ /^(FEATURES)|(COMMENT)/o); |
|
1461
|
|
|
|
|
|
|
|
|
1462
|
437
|
|
|
|
|
1757
|
$line = undef; # Empty $line to trigger read of next line |
|
1463
|
|
|
|
|
|
|
} |
|
1464
|
|
|
|
|
|
|
|
|
1465
|
|
|
|
|
|
|
# store last reference |
|
1466
|
100
|
50
|
|
|
|
402
|
$self->_add_ref_to_array(\@refs, $ref) if defined $ref; |
|
1467
|
|
|
|
|
|
|
|
|
1468
|
100
|
|
|
|
|
141
|
$$buffer = $line; |
|
1469
|
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
#print "\nnumber of references found: ", $#refs+1,"\n"; |
|
1471
|
|
|
|
|
|
|
|
|
1472
|
100
|
|
|
|
|
444
|
return @refs; |
|
1473
|
|
|
|
|
|
|
} |
|
1474
|
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
=head2 _add_ref_to_array |
|
1476
|
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
Title: _add_ref_to_array |
|
1478
|
|
|
|
|
|
|
Usage: |
|
1479
|
|
|
|
|
|
|
Function: Adds a Reference object to an array of Reference objects, takes |
|
1480
|
|
|
|
|
|
|
care of possible cleanups to be done (currently, only author and title |
|
1481
|
|
|
|
|
|
|
will be chopped of trailing semicolons). |
|
1482
|
|
|
|
|
|
|
Args: A reference to an array of Reference objects and |
|
1483
|
|
|
|
|
|
|
the Reference object to be added |
|
1484
|
|
|
|
|
|
|
Returns: nothing |
|
1485
|
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
=cut |
|
1487
|
|
|
|
|
|
|
|
|
1488
|
|
|
|
|
|
|
sub _add_ref_to_array { |
|
1489
|
417
|
|
|
417
|
|
485
|
my ($self, $refs, $ref) = @_; |
|
1490
|
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
# first, polish author and title by removing possible trailing semicolons |
|
1492
|
417
|
|
|
|
|
702
|
my $au = $ref->authors; |
|
1493
|
417
|
|
|
|
|
728
|
my $title = $ref->title; |
|
1494
|
417
|
100
|
|
|
|
899
|
$au =~ s/;\s*$//g if $au; |
|
1495
|
417
|
100
|
|
|
|
752
|
$title =~ s/;\s*$//g if $title; |
|
1496
|
417
|
|
|
|
|
685
|
$ref->authors($au); |
|
1497
|
417
|
|
|
|
|
722
|
$ref->title($title); |
|
1498
|
|
|
|
|
|
|
# the rest should be clean already, so go ahead and add it |
|
1499
|
417
|
|
|
|
|
336
|
push @{$refs}, $ref; |
|
|
417
|
|
|
|
|
704
|
|
|
1500
|
|
|
|
|
|
|
} |
|
1501
|
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
=head2 _read_GenBank_Species |
|
1503
|
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
Title : _read_GenBank_Species |
|
1505
|
|
|
|
|
|
|
Usage : |
|
1506
|
|
|
|
|
|
|
Function: Reads the GenBank Organism species and classification |
|
1507
|
|
|
|
|
|
|
lines. Able to deal with unconvential Organism naming |
|
1508
|
|
|
|
|
|
|
formats, and varietas in plants |
|
1509
|
|
|
|
|
|
|
Example : ORGANISM unknown marine gamma proteobacterium NOR5 |
|
1510
|
|
|
|
|
|
|
$genus = undef |
|
1511
|
|
|
|
|
|
|
$species = unknown marine gamma proteobacterium NOR5 |
|
1512
|
|
|
|
|
|
|
|
|
1513
|
|
|
|
|
|
|
ORGANISM Drosophila sp. 'white tip scutellum' |
|
1514
|
|
|
|
|
|
|
$genus = Drosophila |
|
1515
|
|
|
|
|
|
|
$species = sp. 'white tip scutellum' |
|
1516
|
|
|
|
|
|
|
(yes, this really is a species and that is its name) |
|
1517
|
|
|
|
|
|
|
$subspecies = undef |
|
1518
|
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
ORGANISM Ajellomyces capsulatus var. farciminosus |
|
1520
|
|
|
|
|
|
|
$genus = Ajellomyces |
|
1521
|
|
|
|
|
|
|
$species = capsulatus |
|
1522
|
|
|
|
|
|
|
$subspecies = var. farciminosus |
|
1523
|
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
ORGANISM Hepatitis delta virus |
|
1525
|
|
|
|
|
|
|
$genus = undef (though this virus has a genus in its lineage, we |
|
1526
|
|
|
|
|
|
|
cannot know that without a database lookup) |
|
1527
|
|
|
|
|
|
|
$species = Hepatitis delta virus |
|
1528
|
|
|
|
|
|
|
|
|
1529
|
|
|
|
|
|
|
Returns : A Bio::Species object |
|
1530
|
|
|
|
|
|
|
Args : A reference to the current line buffer |
|
1531
|
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
=cut |
|
1533
|
|
|
|
|
|
|
|
|
1534
|
|
|
|
|
|
|
sub _read_GenBank_Species { |
|
1535
|
101
|
|
|
101
|
|
126
|
my ($self, $buffer) = @_; |
|
1536
|
|
|
|
|
|
|
|
|
1537
|
101
|
|
|
|
|
426
|
my @unkn_names = ('other', 'unknown organism', 'not specified', 'not shown', |
|
1538
|
|
|
|
|
|
|
'Unspecified', 'Unknown', 'None', 'unclassified', |
|
1539
|
|
|
|
|
|
|
'unidentified organism', 'not supplied'); |
|
1540
|
|
|
|
|
|
|
# dictionary of synonyms for taxid 32644 |
|
1541
|
101
|
|
|
|
|
209
|
my @unkn_genus = ('unknown', 'unclassified', 'uncultured', 'unidentified'); |
|
1542
|
|
|
|
|
|
|
# all above can be part of valid species name |
|
1543
|
|
|
|
|
|
|
|
|
1544
|
101
|
|
|
|
|
156
|
my $line = $$buffer; |
|
1545
|
|
|
|
|
|
|
|
|
1546
|
101
|
|
|
|
|
140
|
my( $sub_species, $species, $genus, $sci_name, $common, |
|
1547
|
|
|
|
|
|
|
$class_lines, $source_flag, $abbr_name, $organelle, $sl ); |
|
1548
|
101
|
|
|
|
|
199
|
my %source = map { $_ => 1 } qw(SOURCE ORGANISM CLASSIFICATION); |
|
|
303
|
|
|
|
|
636
|
|
|
1549
|
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
# upon first entering the loop, we must not read a new line -- the SOURCE |
|
1551
|
|
|
|
|
|
|
# line is already in the buffer (HL 05/10/2000) |
|
1552
|
101
|
|
|
|
|
145
|
my ($ann, $tag, $data); |
|
1553
|
101
|
|
66
|
|
|
362
|
while (defined($line) or defined($line = $self->_readline)) { |
|
1554
|
|
|
|
|
|
|
# de-HTMLify (links that may be encountered here don't contain |
|
1555
|
|
|
|
|
|
|
# escaped '>', so a simple-minded approach suffices) |
|
1556
|
517
|
|
|
|
|
635
|
$line =~ s{<[^>]+>}{}g; |
|
1557
|
517
|
100
|
|
|
|
1760
|
if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) { |
|
1558
|
303
|
|
50
|
|
|
1071
|
($tag, $data) = ($1, $2 || ''); |
|
1559
|
303
|
100
|
66
|
|
|
1240
|
last if ($tag and not exists $source{$tag}); |
|
1560
|
|
|
|
|
|
|
} |
|
1561
|
|
|
|
|
|
|
else { |
|
1562
|
214
|
50
|
|
|
|
385
|
return unless $tag; |
|
1563
|
214
|
|
|
|
|
620
|
($data = $line) =~ s{^\s+}{}; |
|
1564
|
214
|
|
|
|
|
268
|
chomp $data; |
|
1565
|
214
|
100
|
100
|
|
|
1368
|
$tag = 'CLASSIFICATION' if ( $tag ne 'CLASSIFICATION' |
|
|
|
|
100
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
and $tag eq 'ORGANISM' |
|
1567
|
|
|
|
|
|
|
# Don't match "str." or "var." (fix NC_021815), |
|
1568
|
|
|
|
|
|
|
# and don't match ".1" (fix NC_021902) |
|
1569
|
|
|
|
|
|
|
and $line =~ m{(?
|
|
1570
|
|
|
|
|
|
|
} |
|
1571
|
416
|
100
|
|
|
|
1175
|
(exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data); |
|
1572
|
416
|
|
|
|
|
1221
|
$line = undef; |
|
1573
|
|
|
|
|
|
|
} |
|
1574
|
|
|
|
|
|
|
|
|
1575
|
101
|
|
|
|
|
378
|
($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM}); |
|
1576
|
|
|
|
|
|
|
|
|
1577
|
101
|
|
|
|
|
200
|
$$buffer = $line; |
|
1578
|
|
|
|
|
|
|
|
|
1579
|
101
|
50
|
|
|
|
222
|
$sci_name or return; |
|
1580
|
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
# parse out organelle, common name, abbreviated name if present; |
|
1582
|
|
|
|
|
|
|
# this should catch everything, but falls back to |
|
1583
|
|
|
|
|
|
|
# entire SOURCE line just in case |
|
1584
|
101
|
50
|
|
|
|
1103
|
if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? |
|
1585
|
|
|
|
|
|
|
\s*(.*?) |
|
1586
|
|
|
|
|
|
|
\s*(?: \( (.*?) \) )?\.? |
|
1587
|
|
|
|
|
|
|
$ |
|
1588
|
|
|
|
|
|
|
}xms |
|
1589
|
|
|
|
|
|
|
) { |
|
1590
|
101
|
|
|
|
|
339
|
($organelle, $abbr_name, $common) = ($1, $2, $3); # optional |
|
1591
|
|
|
|
|
|
|
} |
|
1592
|
|
|
|
|
|
|
else { |
|
1593
|
0
|
|
|
|
|
0
|
$abbr_name = $sl; # nothing caught; this is a backup! |
|
1594
|
|
|
|
|
|
|
} |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
# Convert data in classification lines into classification array. |
|
1597
|
|
|
|
|
|
|
# only split on ';' or '.' so that classification that is 2 or more words will |
|
1598
|
|
|
|
|
|
|
# still get matched, use map() to remove trailing/leading/intervening spaces |
|
1599
|
101
|
|
|
|
|
819
|
my @class = map { $_ =~ s/^\s+//; |
|
|
955
|
|
|
|
|
1424
|
|
|
1600
|
955
|
|
|
|
|
949
|
$_ =~ s/\s+$//; |
|
1601
|
955
|
|
|
|
|
877
|
$_ =~ s/\s{2,}/ /g; |
|
1602
|
955
|
|
|
|
|
1166
|
$_; } |
|
1603
|
|
|
|
|
|
|
split /(?
|
|
1604
|
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
# do we have a genus? |
|
1606
|
101
|
50
|
|
|
|
508
|
my $possible_genus = quotemeta($class[-1]) |
|
1607
|
|
|
|
|
|
|
. ($class[-2] ? "|" . quotemeta($class[-2]) : ''); |
|
1608
|
101
|
100
|
|
|
|
3704
|
if ($sci_name =~ /^($possible_genus)/) { |
|
1609
|
88
|
|
|
|
|
401
|
$genus = $1; |
|
1610
|
88
|
|
|
|
|
1125
|
($species) = $sci_name =~ /^$genus\s+(.+)/; |
|
1611
|
|
|
|
|
|
|
} |
|
1612
|
|
|
|
|
|
|
else { |
|
1613
|
13
|
|
|
|
|
28
|
$species = $sci_name; |
|
1614
|
|
|
|
|
|
|
} |
|
1615
|
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
# is this organism of rank species or is it lower? |
|
1617
|
|
|
|
|
|
|
# (we don't catch everything lower than species, but it doesn't matter - |
|
1618
|
|
|
|
|
|
|
# this is just so we abide by previous behaviour whilst not calling a |
|
1619
|
|
|
|
|
|
|
# species a subspecies) |
|
1620
|
101
|
100
|
66
|
|
|
1430
|
if ($species and $species =~ /(.+)\s+((?:subsp\.|var\.).+)/) { |
|
1621
|
5
|
|
|
|
|
18
|
($species, $sub_species) = ($1, $2); |
|
1622
|
|
|
|
|
|
|
} |
|
1623
|
|
|
|
|
|
|
|
|
1624
|
|
|
|
|
|
|
# Don't make a species object if it's empty or "Unknown" or "None" |
|
1625
|
|
|
|
|
|
|
# return unless $genus and $genus !~ /^(Unknown|None)$/oi; |
|
1626
|
|
|
|
|
|
|
# Don't make a species object if it belongs to taxid 32644 |
|
1627
|
|
|
|
|
|
|
# my $unkn = grep { $_ =~ /^\Q$sl\E$/; } @unkn_names; |
|
1628
|
101
|
|
|
|
|
174
|
my $unkn = grep { $_ eq $sl } @unkn_names; |
|
|
1010
|
|
|
|
|
1131
|
|
|
1629
|
101
|
50
|
33
|
|
|
649
|
return unless (defined $species or defined $genus) and $unkn == 0; |
|
|
|
|
33
|
|
|
|
|
|
1630
|
|
|
|
|
|
|
|
|
1631
|
|
|
|
|
|
|
# Bio::Species array needs array in Species -> Kingdom direction |
|
1632
|
101
|
|
|
|
|
170
|
push @class, $sci_name; |
|
1633
|
101
|
|
|
|
|
213
|
@class = reverse @class; |
|
1634
|
|
|
|
|
|
|
|
|
1635
|
101
|
|
|
|
|
709
|
my $make = Bio::Species->new; |
|
1636
|
101
|
|
|
|
|
324
|
$make->scientific_name($sci_name); |
|
1637
|
101
|
50
|
|
|
|
516
|
$make->classification(@class) if @class > 0; |
|
1638
|
101
|
100
|
|
|
|
372
|
$make->common_name( $common ) if $common; |
|
1639
|
101
|
50
|
|
|
|
465
|
$make->name('abbreviated', $abbr_name) if $abbr_name; |
|
1640
|
101
|
100
|
|
|
|
257
|
$make->organelle($organelle) if $organelle; |
|
1641
|
|
|
|
|
|
|
#$make->sub_species( $sub_species ) if $sub_species; |
|
1642
|
101
|
|
|
|
|
1111
|
return $make; |
|
1643
|
|
|
|
|
|
|
} |
|
1644
|
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
=head2 _read_FTHelper_GenBank |
|
1646
|
|
|
|
|
|
|
|
|
1647
|
|
|
|
|
|
|
Title : _read_FTHelper_GenBank |
|
1648
|
|
|
|
|
|
|
Usage : _read_FTHelper_GenBank($buffer) |
|
1649
|
|
|
|
|
|
|
Function: reads the next FT key line |
|
1650
|
|
|
|
|
|
|
Example : |
|
1651
|
|
|
|
|
|
|
Returns : Bio::SeqIO::FTHelper object |
|
1652
|
|
|
|
|
|
|
Args : filehandle and reference to a scalar |
|
1653
|
|
|
|
|
|
|
|
|
1654
|
|
|
|
|
|
|
=cut |
|
1655
|
|
|
|
|
|
|
|
|
1656
|
|
|
|
|
|
|
sub _read_FTHelper_GenBank { |
|
1657
|
8280
|
|
|
8280
|
|
7559
|
my ($self, $buffer) = @_; |
|
1658
|
|
|
|
|
|
|
|
|
1659
|
8280
|
|
|
|
|
6221
|
my ($key, # The key of the feature |
|
1660
|
|
|
|
|
|
|
$loc # The location line from the feature |
|
1661
|
|
|
|
|
|
|
); |
|
1662
|
8280
|
|
|
|
|
8458
|
my @qual = (); # An array of lines making up the qualifiers |
|
1663
|
|
|
|
|
|
|
|
|
1664
|
8280
|
50
|
|
|
|
50487
|
if ($$buffer =~ /^\s{5}(\S+)\s+(.+?)\s*$/o) { |
|
1665
|
8280
|
|
|
|
|
15136
|
$key = $1; |
|
1666
|
8280
|
|
|
|
|
10121
|
$loc = $2; |
|
1667
|
|
|
|
|
|
|
# Read all the lines up to the next feature |
|
1668
|
8280
|
|
|
|
|
17686
|
while ( defined(my $line = $self->_readline) ) { |
|
1669
|
88592
|
100
|
|
|
|
386490
|
if ($line =~ /^(\s+)(.+?)\s*$/o) { |
|
1670
|
|
|
|
|
|
|
# Lines inside features are preceded by 21 spaces |
|
1671
|
|
|
|
|
|
|
# A new feature is preceded by 5 spaces |
|
1672
|
88489
|
100
|
|
|
|
123207
|
if (length($1) > 6) { |
|
1673
|
|
|
|
|
|
|
# Add to qualifiers if we're in the qualifiers, or if it's |
|
1674
|
|
|
|
|
|
|
# the first qualifier |
|
1675
|
80312
|
100
|
100
|
|
|
163820
|
if (@qual or (index($2,'/') == 0)) { |
|
1676
|
68121
|
|
|
|
|
157126
|
push @qual, $2; |
|
1677
|
|
|
|
|
|
|
} |
|
1678
|
|
|
|
|
|
|
# We're still in the location line, so append to location |
|
1679
|
|
|
|
|
|
|
else { |
|
1680
|
12191
|
|
|
|
|
29294
|
$loc .= $2; |
|
1681
|
|
|
|
|
|
|
} |
|
1682
|
|
|
|
|
|
|
} |
|
1683
|
|
|
|
|
|
|
else { |
|
1684
|
|
|
|
|
|
|
# We've reached the start of the next feature |
|
1685
|
|
|
|
|
|
|
# Put the first line of the next feature into the buffer |
|
1686
|
8177
|
|
|
|
|
7737
|
$$buffer = $line; |
|
1687
|
8177
|
|
|
|
|
11783
|
last; |
|
1688
|
|
|
|
|
|
|
} |
|
1689
|
|
|
|
|
|
|
} |
|
1690
|
|
|
|
|
|
|
else { |
|
1691
|
|
|
|
|
|
|
# We're at the end of the feature table |
|
1692
|
|
|
|
|
|
|
# Put the first line of the next feature into the buffer |
|
1693
|
103
|
|
|
|
|
188
|
$$buffer = $line; |
|
1694
|
103
|
|
|
|
|
178
|
last; |
|
1695
|
|
|
|
|
|
|
} |
|
1696
|
|
|
|
|
|
|
} |
|
1697
|
|
|
|
|
|
|
} |
|
1698
|
|
|
|
|
|
|
else { |
|
1699
|
|
|
|
|
|
|
# No feature key |
|
1700
|
0
|
|
|
|
|
0
|
$self->debug("no feature key!\n"); |
|
1701
|
|
|
|
|
|
|
# change suggested by JDiggans to avoid infinite loop- |
|
1702
|
|
|
|
|
|
|
# see bugreport 1062. |
|
1703
|
|
|
|
|
|
|
# reset buffer to prevent infinite loop |
|
1704
|
0
|
|
|
|
|
0
|
$$buffer = $self->_readline; |
|
1705
|
0
|
|
|
|
|
0
|
return; |
|
1706
|
|
|
|
|
|
|
} |
|
1707
|
|
|
|
|
|
|
|
|
1708
|
|
|
|
|
|
|
# Make the new FTHelper object |
|
1709
|
8280
|
|
|
|
|
18970
|
my $out = Bio::SeqIO::FTHelper->new; |
|
1710
|
8280
|
|
|
|
|
16946
|
$out->verbose($self->verbose); |
|
1711
|
8280
|
|
|
|
|
13153
|
$out->key($key); |
|
1712
|
8280
|
|
|
|
|
12068
|
$out->loc($loc); |
|
1713
|
|
|
|
|
|
|
|
|
1714
|
|
|
|
|
|
|
# Now parse and add any qualifiers. (@qual is kept |
|
1715
|
|
|
|
|
|
|
# intact to provide informative error messages.) |
|
1716
|
|
|
|
|
|
|
QUAL: |
|
1717
|
8280
|
|
|
|
|
16304
|
for (my $i = 0; $i < @qual; $i++) { |
|
1718
|
55807
|
|
|
|
|
45528
|
my $data = $qual[$i]; |
|
1719
|
55807
|
50
|
|
|
|
196061
|
my ( $qualifier, $value ) = ($data =~ m{^/([^=]+)(?:=\s*(.+))?}) |
|
1720
|
|
|
|
|
|
|
or $self->warn( "cannot see new qualifier in feature $key: " |
|
1721
|
|
|
|
|
|
|
. $data); |
|
1722
|
55807
|
50
|
|
|
|
69452
|
$qualifier = '' if not defined $qualifier; |
|
1723
|
|
|
|
|
|
|
|
|
1724
|
55807
|
100
|
|
|
|
56119
|
if (defined $value) { |
|
1725
|
|
|
|
|
|
|
# Do we have a quoted value? |
|
1726
|
54903
|
100
|
|
|
|
69734
|
if (substr($value, 0, 1) eq '"') { |
|
|
|
100
|
|
|
|
|
|
|
1727
|
|
|
|
|
|
|
# Keep adding to value until we find the trailing quote |
|
1728
|
|
|
|
|
|
|
# and the quotes are balanced |
|
1729
|
52083
|
|
66
|
|
|
191605
|
while ($value !~ /\"$/ or $value =~ tr/"/"/ % 2) { |
|
1730
|
12312
|
50
|
|
|
|
17233
|
if ($i >= $#qual) { |
|
1731
|
0
|
|
|
|
|
0
|
$self->warn( "Unbalanced quote in:\n" |
|
1732
|
|
|
|
|
|
|
. join("\n", @qual) |
|
1733
|
|
|
|
|
|
|
. "No further qualifiers will " |
|
1734
|
|
|
|
|
|
|
. "be added for this feature"); |
|
1735
|
0
|
|
|
|
|
0
|
last QUAL; |
|
1736
|
|
|
|
|
|
|
} |
|
1737
|
|
|
|
|
|
|
# modifying a for-loop variable inside of the loop |
|
1738
|
|
|
|
|
|
|
# is not the best programming style ... |
|
1739
|
12312
|
|
|
|
|
9333
|
$i++; |
|
1740
|
12312
|
|
|
|
|
9545
|
my $next = $qual[$i]; |
|
1741
|
|
|
|
|
|
|
|
|
1742
|
|
|
|
|
|
|
# add to value with a space unless the value appears |
|
1743
|
|
|
|
|
|
|
# to be a sequence (translation for example) |
|
1744
|
|
|
|
|
|
|
# if (($value.$next) =~ /[^A-Za-z\"\-]/o) { |
|
1745
|
|
|
|
|
|
|
# changed to explicitly look for translation tag - cjf 06/8/29 |
|
1746
|
12312
|
100
|
|
|
|
17638
|
if ($qualifier !~ /^translation$/i ) { |
|
1747
|
9870
|
|
|
|
|
10364
|
$value .= " "; |
|
1748
|
|
|
|
|
|
|
} |
|
1749
|
12312
|
|
|
|
|
48784
|
$value .= $next; |
|
1750
|
|
|
|
|
|
|
} |
|
1751
|
|
|
|
|
|
|
# Trim leading and trailing quotes |
|
1752
|
52083
|
|
|
|
|
233636
|
$value =~ s/^"|"$//g; |
|
1753
|
|
|
|
|
|
|
# Undouble internal quotes |
|
1754
|
52083
|
|
|
|
|
51590
|
$value =~ s/""/\"/g; |
|
1755
|
|
|
|
|
|
|
} |
|
1756
|
|
|
|
|
|
|
elsif ( $value =~ /^\(/ ) { # values quoted by ()s |
|
1757
|
|
|
|
|
|
|
# Keep adding to value until we find the trailing bracket |
|
1758
|
|
|
|
|
|
|
# and the ()s are balanced |
|
1759
|
27
|
|
|
|
|
42
|
my $left = ($value =~ tr/\(/\(/); # count left parens |
|
1760
|
27
|
|
|
|
|
34
|
my $right = ($value =~ tr/\)/\)/); # count right parens |
|
1761
|
|
|
|
|
|
|
|
|
1762
|
27
|
|
|
|
|
56
|
while( $left != $right ) { # was "$value !~ /\)$/ or $left != $right" |
|
1763
|
2
|
50
|
|
|
|
7
|
if ( $i >= $#qual) { |
|
1764
|
0
|
|
|
|
|
0
|
$self->warn( "Unbalanced parens in:\n" |
|
1765
|
|
|
|
|
|
|
. join("\n", @qual) |
|
1766
|
|
|
|
|
|
|
. "\nNo further qualifiers will " |
|
1767
|
|
|
|
|
|
|
. "be added for this feature"); |
|
1768
|
0
|
|
|
|
|
0
|
last QUAL; |
|
1769
|
|
|
|
|
|
|
} |
|
1770
|
2
|
|
|
|
|
4
|
$i++; |
|
1771
|
2
|
|
|
|
|
3
|
my $next = $qual[$i]; |
|
1772
|
2
|
|
|
|
|
4
|
$value .= $next; |
|
1773
|
2
|
|
|
|
|
4
|
$left += ($next =~ tr/\(/\(/); |
|
1774
|
2
|
|
|
|
|
6
|
$right += ($next =~ tr/\)/\)/); |
|
1775
|
|
|
|
|
|
|
} |
|
1776
|
|
|
|
|
|
|
} |
|
1777
|
|
|
|
|
|
|
} |
|
1778
|
|
|
|
|
|
|
else { |
|
1779
|
904
|
|
|
|
|
981
|
$value = '_no_value'; |
|
1780
|
|
|
|
|
|
|
} |
|
1781
|
|
|
|
|
|
|
# Store the qualifier |
|
1782
|
55807
|
|
100
|
|
|
76350
|
$out->field->{$qualifier} ||= []; |
|
1783
|
55807
|
|
|
|
|
39024
|
push @{$out->field->{$qualifier}}, $value; |
|
|
55807
|
|
|
|
|
62263
|
|
|
1784
|
|
|
|
|
|
|
} |
|
1785
|
8280
|
|
|
|
|
16612
|
return $out; |
|
1786
|
|
|
|
|
|
|
} |
|
1787
|
|
|
|
|
|
|
|
|
1788
|
|
|
|
|
|
|
=head2 _write_line_GenBank |
|
1789
|
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
Title : _write_line_GenBank |
|
1791
|
|
|
|
|
|
|
Usage : |
|
1792
|
|
|
|
|
|
|
Function: internal function |
|
1793
|
|
|
|
|
|
|
Example : |
|
1794
|
|
|
|
|
|
|
Returns : |
|
1795
|
|
|
|
|
|
|
Args : |
|
1796
|
|
|
|
|
|
|
|
|
1797
|
|
|
|
|
|
|
=cut |
|
1798
|
|
|
|
|
|
|
|
|
1799
|
|
|
|
|
|
|
sub _write_line_GenBank { |
|
1800
|
0
|
|
|
0
|
|
0
|
my ($self, $pre1, $pre2, $line, $length) = @_; |
|
1801
|
|
|
|
|
|
|
|
|
1802
|
0
|
0
|
|
|
|
0
|
$length or $self->throw("Miscalled write_line_GenBank without length. Programming error!"); |
|
1803
|
0
|
|
|
|
|
0
|
my $subl = $length - length $pre2; |
|
1804
|
0
|
|
|
|
|
0
|
my $linel = length $line; |
|
1805
|
0
|
|
|
|
|
0
|
my $i; |
|
1806
|
|
|
|
|
|
|
|
|
1807
|
0
|
|
|
|
|
0
|
my $subr = substr($line,0,$length - length $pre1); |
|
1808
|
|
|
|
|
|
|
|
|
1809
|
0
|
|
|
|
|
0
|
$self->_print("$pre1$subr\n"); |
|
1810
|
0
|
|
|
|
|
0
|
for($i = ($length - length $pre1); $i < $linel; $i += $subl) { |
|
1811
|
0
|
|
|
|
|
0
|
$subr = substr($line, $i, $subl); |
|
1812
|
0
|
|
|
|
|
0
|
$self->_print("$pre2$subr\n"); |
|
1813
|
|
|
|
|
|
|
} |
|
1814
|
|
|
|
|
|
|
} |
|
1815
|
|
|
|
|
|
|
|
|
1816
|
|
|
|
|
|
|
=head2 _write_line_GenBank_regex |
|
1817
|
|
|
|
|
|
|
|
|
1818
|
|
|
|
|
|
|
Title : _write_line_GenBank_regex |
|
1819
|
|
|
|
|
|
|
Usage : |
|
1820
|
|
|
|
|
|
|
Function: internal function for writing lines of specified |
|
1821
|
|
|
|
|
|
|
length, with different first and the next line |
|
1822
|
|
|
|
|
|
|
left hand headers and split at specific points in the |
|
1823
|
|
|
|
|
|
|
text |
|
1824
|
|
|
|
|
|
|
Example : |
|
1825
|
|
|
|
|
|
|
Returns : nothing |
|
1826
|
|
|
|
|
|
|
Args : file handle, |
|
1827
|
|
|
|
|
|
|
first header, |
|
1828
|
|
|
|
|
|
|
second header, |
|
1829
|
|
|
|
|
|
|
text-line, |
|
1830
|
|
|
|
|
|
|
regex for line breaks, |
|
1831
|
|
|
|
|
|
|
total line length |
|
1832
|
|
|
|
|
|
|
|
|
1833
|
|
|
|
|
|
|
=cut |
|
1834
|
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
sub _write_line_GenBank_regex { |
|
1836
|
1765
|
|
|
1765
|
|
2516
|
my ($self, $pre1, $pre2, $line, $regex, $length) = @_; |
|
1837
|
|
|
|
|
|
|
|
|
1838
|
|
|
|
|
|
|
#print STDOUT "Going to print with $line!\n"; |
|
1839
|
|
|
|
|
|
|
|
|
1840
|
1765
|
50
|
|
|
|
2721
|
$length or $self->throw("Miscalled write_line_GenBank without length. Programming error!"); |
|
1841
|
|
|
|
|
|
|
|
|
1842
|
1765
|
|
|
|
|
2306
|
my $subl = $length - (length $pre1) - 2; |
|
1843
|
1765
|
|
|
|
|
1843
|
my @lines = (); |
|
1844
|
|
|
|
|
|
|
|
|
1845
|
|
|
|
|
|
|
CHUNK: |
|
1846
|
1765
|
|
|
|
|
2549
|
while ($line) { |
|
1847
|
2129
|
|
|
|
|
3036
|
foreach my $pat ($regex, '[,;\.\/-]\s|'.$regex, '[,;\.\/-]|'.$regex) { |
|
1848
|
2129
|
50
|
|
|
|
26154
|
if ($line =~ m/^(.{0,$subl})($pat)(.*)/ ) { |
|
1849
|
2129
|
|
|
|
|
4839
|
my $l = $1 . $2; |
|
1850
|
2129
|
|
|
|
|
2839
|
$line = substr($line, length $l); |
|
1851
|
|
|
|
|
|
|
# be strict about not padding spaces according to |
|
1852
|
|
|
|
|
|
|
# genbank format |
|
1853
|
2129
|
|
|
|
|
3932
|
$l =~ s/\s+$//; |
|
1854
|
2129
|
50
|
|
|
|
3569
|
next CHUNK if ($l eq ''); |
|
1855
|
2129
|
|
|
|
|
2654
|
push @lines, $l; |
|
1856
|
2129
|
|
|
|
|
5497
|
next CHUNK; |
|
1857
|
|
|
|
|
|
|
} |
|
1858
|
|
|
|
|
|
|
} |
|
1859
|
|
|
|
|
|
|
# if we get here none of the patterns matched $subl or less chars |
|
1860
|
0
|
|
|
|
|
0
|
$self->warn( "trouble dissecting \"$line\"\n into chunks " |
|
1861
|
|
|
|
|
|
|
. "of $subl chars or less - this tag won't print right"); |
|
1862
|
|
|
|
|
|
|
# insert a space char to prevent infinite loops |
|
1863
|
0
|
|
|
|
|
0
|
$line = substr($line, 0, $subl) . " " . substr($line, $subl); |
|
1864
|
|
|
|
|
|
|
} |
|
1865
|
1765
|
|
|
|
|
1737
|
my $s = shift @lines; |
|
1866
|
1765
|
100
|
|
|
|
6388
|
$self->_print("$pre1$s\n") if $s; |
|
1867
|
1765
|
|
|
|
|
5411
|
foreach my $s ( @lines ) { |
|
1868
|
369
|
|
|
|
|
706
|
$self->_print("$pre2$s\n"); |
|
1869
|
|
|
|
|
|
|
} |
|
1870
|
|
|
|
|
|
|
} |
|
1871
|
|
|
|
|
|
|
|
|
1872
|
|
|
|
|
|
|
=head2 _post_sort |
|
1873
|
|
|
|
|
|
|
|
|
1874
|
|
|
|
|
|
|
Title : _post_sort |
|
1875
|
|
|
|
|
|
|
Usage : $obj->_post_sort($newval) |
|
1876
|
|
|
|
|
|
|
Function: |
|
1877
|
|
|
|
|
|
|
Returns : value of _post_sort |
|
1878
|
|
|
|
|
|
|
Args : newvalue (optional) |
|
1879
|
|
|
|
|
|
|
|
|
1880
|
|
|
|
|
|
|
=cut |
|
1881
|
|
|
|
|
|
|
|
|
1882
|
|
|
|
|
|
|
sub _post_sort { |
|
1883
|
25
|
|
|
25
|
|
35
|
my ($obj,$value) = @_; |
|
1884
|
25
|
50
|
|
|
|
70
|
if ( defined $value) { |
|
1885
|
0
|
|
|
|
|
0
|
$obj->{'_post_sort'} = $value; |
|
1886
|
|
|
|
|
|
|
} |
|
1887
|
25
|
|
|
|
|
83
|
return $obj->{'_post_sort'}; |
|
1888
|
|
|
|
|
|
|
} |
|
1889
|
|
|
|
|
|
|
|
|
1890
|
|
|
|
|
|
|
=head2 _show_dna |
|
1891
|
|
|
|
|
|
|
|
|
1892
|
|
|
|
|
|
|
Title : _show_dna |
|
1893
|
|
|
|
|
|
|
Usage : $obj->_show_dna($newval) |
|
1894
|
|
|
|
|
|
|
Function: |
|
1895
|
|
|
|
|
|
|
Returns : value of _show_dna |
|
1896
|
|
|
|
|
|
|
Args : newvalue (optional) |
|
1897
|
|
|
|
|
|
|
|
|
1898
|
|
|
|
|
|
|
=cut |
|
1899
|
|
|
|
|
|
|
|
|
1900
|
|
|
|
|
|
|
sub _show_dna { |
|
1901
|
141
|
|
|
141
|
|
206
|
my ($obj,$value) = @_; |
|
1902
|
141
|
100
|
|
|
|
356
|
if ( defined $value) { |
|
1903
|
116
|
|
|
|
|
244
|
$obj->{'_show_dna'} = $value; |
|
1904
|
|
|
|
|
|
|
} |
|
1905
|
141
|
|
|
|
|
279
|
return $obj->{'_show_dna'}; |
|
1906
|
|
|
|
|
|
|
} |
|
1907
|
|
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
=head2 _id_generation_func |
|
1909
|
|
|
|
|
|
|
|
|
1910
|
|
|
|
|
|
|
Title : _id_generation_func |
|
1911
|
|
|
|
|
|
|
Usage : $obj->_id_generation_func($newval) |
|
1912
|
|
|
|
|
|
|
Function: |
|
1913
|
|
|
|
|
|
|
Returns : value of _id_generation_func |
|
1914
|
|
|
|
|
|
|
Args : newvalue (optional) |
|
1915
|
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
=cut |
|
1917
|
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
sub _id_generation_func { |
|
1919
|
25
|
|
|
25
|
|
43
|
my ($obj,$value) = @_; |
|
1920
|
25
|
50
|
|
|
|
77
|
if ( defined $value ) { |
|
1921
|
0
|
|
|
|
|
0
|
$obj->{'_id_generation_func'} = $value; |
|
1922
|
|
|
|
|
|
|
} |
|
1923
|
25
|
|
|
|
|
79
|
return $obj->{'_id_generation_func'}; |
|
1924
|
|
|
|
|
|
|
} |
|
1925
|
|
|
|
|
|
|
|
|
1926
|
|
|
|
|
|
|
=head2 _ac_generation_func |
|
1927
|
|
|
|
|
|
|
|
|
1928
|
|
|
|
|
|
|
Title : _ac_generation_func |
|
1929
|
|
|
|
|
|
|
Usage : $obj->_ac_generation_func($newval) |
|
1930
|
|
|
|
|
|
|
Function: |
|
1931
|
|
|
|
|
|
|
Returns : value of _ac_generation_func |
|
1932
|
|
|
|
|
|
|
Args : newvalue (optional) |
|
1933
|
|
|
|
|
|
|
|
|
1934
|
|
|
|
|
|
|
=cut |
|
1935
|
|
|
|
|
|
|
|
|
1936
|
|
|
|
|
|
|
sub _ac_generation_func { |
|
1937
|
25
|
|
|
25
|
|
45
|
my ($obj,$value) = @_; |
|
1938
|
25
|
50
|
|
|
|
207
|
if ( defined $value ) { |
|
1939
|
0
|
|
|
|
|
0
|
$obj->{'_ac_generation_func'} = $value; |
|
1940
|
|
|
|
|
|
|
} |
|
1941
|
25
|
|
|
|
|
83
|
return $obj->{'_ac_generation_func'}; |
|
1942
|
|
|
|
|
|
|
} |
|
1943
|
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
=head2 _sv_generation_func |
|
1945
|
|
|
|
|
|
|
|
|
1946
|
|
|
|
|
|
|
Title : _sv_generation_func |
|
1947
|
|
|
|
|
|
|
Usage : $obj->_sv_generation_func($newval) |
|
1948
|
|
|
|
|
|
|
Function: |
|
1949
|
|
|
|
|
|
|
Returns : value of _sv_generation_func |
|
1950
|
|
|
|
|
|
|
Args : newvalue (optional) |
|
1951
|
|
|
|
|
|
|
|
|
1952
|
|
|
|
|
|
|
=cut |
|
1953
|
|
|
|
|
|
|
|
|
1954
|
|
|
|
|
|
|
sub _sv_generation_func { |
|
1955
|
25
|
|
|
25
|
|
40
|
my ($obj,$value) = @_; |
|
1956
|
25
|
50
|
|
|
|
85
|
if ( defined $value ) { |
|
1957
|
0
|
|
|
|
|
0
|
$obj->{'_sv_generation_func'} = $value; |
|
1958
|
|
|
|
|
|
|
} |
|
1959
|
25
|
|
|
|
|
288
|
return $obj->{'_sv_generation_func'}; |
|
1960
|
|
|
|
|
|
|
} |
|
1961
|
|
|
|
|
|
|
|
|
1962
|
|
|
|
|
|
|
=head2 _kw_generation_func |
|
1963
|
|
|
|
|
|
|
|
|
1964
|
|
|
|
|
|
|
Title : _kw_generation_func |
|
1965
|
|
|
|
|
|
|
Usage : $obj->_kw_generation_func($newval) |
|
1966
|
|
|
|
|
|
|
Function: |
|
1967
|
|
|
|
|
|
|
Returns : value of _kw_generation_func |
|
1968
|
|
|
|
|
|
|
Args : newvalue (optional) |
|
1969
|
|
|
|
|
|
|
|
|
1970
|
|
|
|
|
|
|
=cut |
|
1971
|
|
|
|
|
|
|
|
|
1972
|
|
|
|
|
|
|
sub _kw_generation_func { |
|
1973
|
25
|
|
|
25
|
|
36
|
my ($obj,$value) = @_; |
|
1974
|
25
|
50
|
|
|
|
72
|
if ( defined $value ) { |
|
1975
|
0
|
|
|
|
|
0
|
$obj->{'_kw_generation_func'} = $value; |
|
1976
|
|
|
|
|
|
|
} |
|
1977
|
25
|
|
|
|
|
169
|
return $obj->{'_kw_generation_func'}; |
|
1978
|
|
|
|
|
|
|
} |
|
1979
|
|
|
|
|
|
|
|
|
1980
|
|
|
|
|
|
|
1; |