line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO::Handler::GenericRichSeqHandler |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Chris Fields |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Chris Fields |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::SeqIO::Handler::GenericRichSeqHandler - Bio::HandlerI-based |
17
|
|
|
|
|
|
|
data handler for GenBank/EMBL/UniProt (and other) sequence data |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# MyHandler is a GenericRichSeqHandler object. |
22
|
|
|
|
|
|
|
# inside a parser (driver) constructor.... |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
$self->seq_handler($handler || MyHandler->new(-format => 'genbank')); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# in next_seq() in driver... |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
$hobj = $self->seqhandler(); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# roll data up into hashref chunks, pass off into Handler for processing... |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
$hobj->data_handler($data); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# or retrieve Handler methods and pass data directly to Handler methods... |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
my $hmeth = $hobj->handler_methods; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
if ($hmeth->{ $data->{NAME} }) { |
39
|
|
|
|
|
|
|
my $mth = $hmeth->{ $data->{NAME} }; |
40
|
|
|
|
|
|
|
$hobj->$mth($data); |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head1 DESCRIPTION |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
This is an experimental implementation of a sequence-based HandlerBaseI parser |
46
|
|
|
|
|
|
|
and may change over time. It is possible (nay, likely) that the way handler |
47
|
|
|
|
|
|
|
methods are set up will change over development to allow more flexibility. |
48
|
|
|
|
|
|
|
Release pumpkins, please do not add this to a release until the API has settled. |
49
|
|
|
|
|
|
|
It is also likely that write_seq() will not work properly for some data. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Standard Developer caveats: |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Do not use for production purposes. |
54
|
|
|
|
|
|
|
Not responsible for destroying (your data|computer|world). |
55
|
|
|
|
|
|
|
Do not stare directly at GenericRichSeqHandler. |
56
|
|
|
|
|
|
|
If GenericRichSeqHandler glows, back slowly away and call for help. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
Consider yourself warned! |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
This class acts as a demonstration on how to handle similar data chunks derived |
61
|
|
|
|
|
|
|
from Bio::SeqIO::gbdriver, Bio::SeqIO::embldriver, and Bio::SeqIO::swissdriver |
62
|
|
|
|
|
|
|
using similar (or the same) handler methods. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
The modules currently pass all previous tests in t/genbank.t, t/embl.t, and |
65
|
|
|
|
|
|
|
t/swiss.t yet all use the same handler methods (the collected tests for handlers |
66
|
|
|
|
|
|
|
can be found in t/Handler.t). Some tweaking of the methods themselves is |
67
|
|
|
|
|
|
|
probably in order over the long run to ensure that data is consistently handled |
68
|
|
|
|
|
|
|
for each parser. Round-trip tests are probably in order here... |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Though a Bio::Seq::SeqBuilder is employed for building sequence objects no |
71
|
|
|
|
|
|
|
bypassing of data based on builder slots has been implemented (yet); this is |
72
|
|
|
|
|
|
|
planned in the near future. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
As a reminder: this is the current Annotation data chunk (via Data::Dumper): |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
$VAR1 = { |
77
|
|
|
|
|
|
|
'NAME' => 'REFERENCE', |
78
|
|
|
|
|
|
|
'DATA' => '1 (bases 1 to 10001)' |
79
|
|
|
|
|
|
|
'AUTHORS' => 'International Human Genome Sequencing Consortium.' |
80
|
|
|
|
|
|
|
'TITLE' => 'The DNA sequence of Homo sapiens' |
81
|
|
|
|
|
|
|
'JOURNAL' => 'Unpublished (2003)' |
82
|
|
|
|
|
|
|
}; |
83
|
|
|
|
|
|
|
... |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
This is the current SeqFeature data chunk (again via Data::Dumper): |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
$VAR1 = { |
88
|
|
|
|
|
|
|
'mol_type' => 'genomic DNA', |
89
|
|
|
|
|
|
|
'LOCATION' => '<1..>10001', |
90
|
|
|
|
|
|
|
'NAME' => 'FEATURES', |
91
|
|
|
|
|
|
|
'FEATURE_KEY' => 'source', |
92
|
|
|
|
|
|
|
'note' => 'Accession AL451081 sequenced by The Sanger Centre', |
93
|
|
|
|
|
|
|
'db_xref' => 'taxon:9606', |
94
|
|
|
|
|
|
|
'clone' => 'RP11-302I18', |
95
|
|
|
|
|
|
|
'organism' => 'Homo sapiens' |
96
|
|
|
|
|
|
|
}; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=head1 FEEDBACK |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head2 Mailing Lists |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
103
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
104
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
107
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head2 Support |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
I |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
116
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
117
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
118
|
|
|
|
|
|
|
with code and data examples if at all possible. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head2 Reporting Bugs |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
123
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
124
|
|
|
|
|
|
|
web: |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head1 AUTHOR - Chris Fields |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Email cjfields at bioperl dot org |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=head1 APPENDIX |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. Internal |
135
|
|
|
|
|
|
|
methods are usually preceded with a _ |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=cut |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# Let the code begin... |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
package Bio::SeqIO::Handler::GenericRichSeqHandler; |
142
|
1
|
|
|
1
|
|
3
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
23
|
|
143
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
18
|
|
144
|
|
|
|
|
|
|
|
145
|
1
|
|
|
1
|
|
653
|
use Bio::SeqIO::FTHelper; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
21
|
|
146
|
1
|
|
|
1
|
|
5
|
use Bio::Annotation::Collection; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
16
|
|
147
|
1
|
|
|
1
|
|
716
|
use Bio::Annotation::DBLink; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
19
|
|
148
|
1
|
|
|
1
|
|
656
|
use Bio::Annotation::Comment; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
30
|
|
149
|
1
|
|
|
1
|
|
724
|
use Bio::Annotation::Reference; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
23
|
|
150
|
1
|
|
|
1
|
|
5
|
use Bio::Annotation::Collection; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
15
|
|
151
|
1
|
|
|
1
|
|
3
|
use Bio::Annotation::SimpleValue; |
|
1
|
|
|
|
|
0
|
|
|
1
|
|
|
|
|
14
|
|
152
|
1
|
|
|
1
|
|
717
|
use Bio::Annotation::TagTree; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
27
|
|
153
|
1
|
|
|
1
|
|
4
|
use Bio::SeqFeature::Generic; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
14
|
|
154
|
1
|
|
|
1
|
|
732
|
use Bio::Species; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
155
|
1
|
|
|
1
|
|
4
|
use Bio::Taxon; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
12
|
|
156
|
1
|
|
|
1
|
|
3
|
use Bio::DB::Taxonomy; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
13
|
|
157
|
1
|
|
|
1
|
|
3
|
use Bio::Factory::FTLocationFactory; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
17
|
|
158
|
1
|
|
|
1
|
|
3
|
use Data::Dumper; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
43
|
|
159
|
|
|
|
|
|
|
|
160
|
1
|
|
|
1
|
|
3
|
use base qw(Bio::Root::Root Bio::HandlerBaseI); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
731
|
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
my %HANDLERS = ( |
163
|
|
|
|
|
|
|
'genbank' => { |
164
|
|
|
|
|
|
|
'LOCUS' => \&_genbank_locus, |
165
|
|
|
|
|
|
|
'DEFINITION' => \&_generic_description, |
166
|
|
|
|
|
|
|
'ACCESSION' => \&_generic_accession, |
167
|
|
|
|
|
|
|
'VERSION' => \&_generic_version, |
168
|
|
|
|
|
|
|
'KEYWORDS' => \&_generic_keywords, |
169
|
|
|
|
|
|
|
'DBSOURCE' => \&_genbank_dbsource, |
170
|
|
|
|
|
|
|
'DBLINK' => \&_genbank_dbsource, |
171
|
|
|
|
|
|
|
'SOURCE' => \&_generic_species, |
172
|
|
|
|
|
|
|
'REFERENCE' => \&_generic_reference, |
173
|
|
|
|
|
|
|
'COMMENT' => \&_generic_comment, |
174
|
|
|
|
|
|
|
'FEATURES' => \&_generic_seqfeatures, |
175
|
|
|
|
|
|
|
'BASE' => \&noop, # this is generated from scratch |
176
|
|
|
|
|
|
|
'ORIGIN' => \&_generic_seq, |
177
|
|
|
|
|
|
|
# handles anything else (WGS, WGS_SCAFLD, CONTIG, PROJECT) |
178
|
|
|
|
|
|
|
'_DEFAULT_' => \&_generic_simplevalue, |
179
|
|
|
|
|
|
|
}, |
180
|
|
|
|
|
|
|
'embl' => { |
181
|
|
|
|
|
|
|
'ID' => \&_embl_id, |
182
|
|
|
|
|
|
|
'DT' => \&_embl_date, |
183
|
|
|
|
|
|
|
'DR' => \&_generic_dbsource, |
184
|
|
|
|
|
|
|
'SV' => \&_generic_version, |
185
|
|
|
|
|
|
|
'RN' => \&_generic_reference, |
186
|
|
|
|
|
|
|
'KW' => \&_generic_keywords, |
187
|
|
|
|
|
|
|
'DE' => \&_generic_description, |
188
|
|
|
|
|
|
|
'AC' => \&_generic_accession, |
189
|
|
|
|
|
|
|
#'AH' => \&noop, # TPA data not dealt with yet... |
190
|
|
|
|
|
|
|
#'AS' => \&noop, |
191
|
|
|
|
|
|
|
'SQ' => \&_generic_seq, |
192
|
|
|
|
|
|
|
'OS' => \&_generic_species, |
193
|
|
|
|
|
|
|
'CC' => \&_generic_comment, |
194
|
|
|
|
|
|
|
'FT' => \&_generic_seqfeatures, |
195
|
|
|
|
|
|
|
# handles anything else (WGS, TPA, ANN...) |
196
|
|
|
|
|
|
|
'_DEFAULT_' => \&_generic_simplevalue, |
197
|
|
|
|
|
|
|
}, |
198
|
|
|
|
|
|
|
'swiss' => { |
199
|
|
|
|
|
|
|
'ID' => \&_swiss_id, |
200
|
|
|
|
|
|
|
'DT' => \&_swiss_date, |
201
|
|
|
|
|
|
|
'GN' => \&_swiss_genename, |
202
|
|
|
|
|
|
|
'DR' => \&_generic_dbsource, |
203
|
|
|
|
|
|
|
'RN' => \&_generic_reference, |
204
|
|
|
|
|
|
|
'KW' => \&_generic_keywords, |
205
|
|
|
|
|
|
|
'DE' => \&_generic_description, |
206
|
|
|
|
|
|
|
'AC' => \&_generic_accession, |
207
|
|
|
|
|
|
|
'SQ' => \&_generic_seq, |
208
|
|
|
|
|
|
|
'OS' => \&_generic_species, |
209
|
|
|
|
|
|
|
'CC' => \&_generic_comment, |
210
|
|
|
|
|
|
|
'FT' => \&_generic_seqfeatures, |
211
|
|
|
|
|
|
|
# handles anything else, though I don't know what... |
212
|
|
|
|
|
|
|
'_DEFAULT_' => \&_generic_simplevalue, |
213
|
|
|
|
|
|
|
}, |
214
|
|
|
|
|
|
|
); |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# can we do this generically? Seems like a lot of trouble... |
217
|
|
|
|
|
|
|
my %DBSOURCE = map {$_ => 1} qw( |
218
|
|
|
|
|
|
|
EchoBASE IntAct SWISS-2DPAGE ECO2DBASE ECOGENE TIGRFAMs |
219
|
|
|
|
|
|
|
TIGR GO InterPro Pfam PROSITE SGD GermOnline |
220
|
|
|
|
|
|
|
HSSP PhosSite Ensembl RGD AGD ArrayExpress KEGG |
221
|
|
|
|
|
|
|
H-InvDB HGNC LinkHub PANTHER PRINTS SMART SMR |
222
|
|
|
|
|
|
|
MGI MIM RZPD-ProtExp ProDom MEROPS TRANSFAC Reactome |
223
|
|
|
|
|
|
|
UniGene GlycoSuiteDB PIRSF HSC-2DPAGE PHCI-2DPAGE |
224
|
|
|
|
|
|
|
PMMA-2DPAGE Siena-2DPAGE Rat-heart-2DPAGE Aarhus/Ghent-2DPAGE |
225
|
|
|
|
|
|
|
Biocyc MetaCyc Biocyc:Metacyc GenomeReviews FlyBase |
226
|
|
|
|
|
|
|
TMHOBP COMPLUYEAST-2DPAGE OGP DictyBase HAMAP |
227
|
|
|
|
|
|
|
PhotoList Gramene WormBase WormPep Genew ZFIN |
228
|
|
|
|
|
|
|
PeroxiBase MaizeDB TAIR DrugBank REBASE HPA |
229
|
|
|
|
|
|
|
swissprot GenBank GenPept REFSEQ embl PDB UniProtKB); |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
my %NOPROCESS = map {$_ => 1} qw(DBSOURCE ORGANISM FEATURES); |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
our %VALID_ALPHABET = ( |
234
|
|
|
|
|
|
|
'bp' => 'dna', |
235
|
|
|
|
|
|
|
'aa' => 'protein', |
236
|
|
|
|
|
|
|
'rc' => '' # rc = release candidate; file has no sequences |
237
|
|
|
|
|
|
|
); |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=head2 new |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Title : new |
242
|
|
|
|
|
|
|
Usage : |
243
|
|
|
|
|
|
|
Function: |
244
|
|
|
|
|
|
|
Returns : |
245
|
|
|
|
|
|
|
Args : -format Sequence format to be mapped for handler methods |
246
|
|
|
|
|
|
|
-builder Bio::Seq::SeqBuilder object (normally defined in |
247
|
|
|
|
|
|
|
SequenceStreamI object implementation constructor) |
248
|
|
|
|
|
|
|
Throws : On undefined '-format' sequence format parameter |
249
|
|
|
|
|
|
|
Note : Still under heavy development |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=cut |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub new { |
254
|
42
|
|
|
42
|
1
|
89
|
my ($class, @args) = @_; |
255
|
42
|
|
|
|
|
108
|
my $self = $class->SUPER::new(@args); |
256
|
42
|
|
|
|
|
107
|
$self = {@args}; |
257
|
42
|
|
|
|
|
110
|
bless $self,$class; |
258
|
42
|
|
|
|
|
160
|
my ($format, $builder) = $self->_rearrange([qw(FORMAT BUILDER)], @args); |
259
|
42
|
50
|
|
|
|
112
|
$self->throw("Must define sequence record format") if !$format; |
260
|
42
|
|
|
|
|
109
|
$self->format($format); |
261
|
42
|
|
|
|
|
111
|
$self->handler_methods(); |
262
|
42
|
50
|
|
|
|
129
|
$builder && $self->seqbuilder($builder); |
263
|
42
|
|
|
|
|
113
|
$self->location_factory(); |
264
|
42
|
|
|
|
|
155
|
return $self; |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=head1 L implementing methods |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head2 handler_methods |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Title : handler_methods |
272
|
|
|
|
|
|
|
Usage : $handler->handler_methods('GenBank') |
273
|
|
|
|
|
|
|
%handlers = $handler->handler_methods(); |
274
|
|
|
|
|
|
|
Function: Retrieve the handler methods used for the current format() in |
275
|
|
|
|
|
|
|
the handler. This assumes the handler methods are already |
276
|
|
|
|
|
|
|
described in the HandlerI-implementing class. |
277
|
|
|
|
|
|
|
Returns : a hash reference with the data type handled and the code ref |
278
|
|
|
|
|
|
|
associated with it. |
279
|
|
|
|
|
|
|
Args : [optional] String representing the sequence format. If set here |
280
|
|
|
|
|
|
|
this will also set sequence_format() |
281
|
|
|
|
|
|
|
Throws : On unimplemented sequence format in %HANDLERS |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub handler_methods { |
286
|
75
|
|
|
75
|
1
|
81
|
my $self = shift; |
287
|
75
|
100
|
|
|
|
164
|
if (!($self->{'handlers'})) { |
288
|
|
|
|
|
|
|
$self->throw("No handlers defined for seqformat ",$self->format) |
289
|
42
|
50
|
|
|
|
87
|
unless exists $HANDLERS{$self->format}; |
290
|
42
|
|
|
|
|
84
|
$self->{'handlers'} = $HANDLERS{$self->format}; |
291
|
|
|
|
|
|
|
} |
292
|
75
|
|
|
|
|
91
|
return ($self->{'handlers'}); |
293
|
|
|
|
|
|
|
} |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=head2 data_handler |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Title : data_handler |
298
|
|
|
|
|
|
|
Usage : $handler->data_handler($data) |
299
|
|
|
|
|
|
|
Function: Centralized method which accepts all data chunks, then distributes |
300
|
|
|
|
|
|
|
to the appropriate methods for processing based on the chunk name |
301
|
|
|
|
|
|
|
from within the HandlerBaseI object. |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
One can also use |
304
|
|
|
|
|
|
|
Returns : None |
305
|
|
|
|
|
|
|
Args : an hash ref containing a data chunk. |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=cut |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
sub data_handler { |
310
|
1646
|
|
|
1646
|
1
|
1582
|
my ($self, $data) = @_; |
311
|
1646
|
|
33
|
|
|
2594
|
my $nm = $data->{NAME} || $self->throw("No name tag defined!"); |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
# this should handle data on the fly w/o caching; any caching should be |
314
|
|
|
|
|
|
|
# done in the driver! |
315
|
|
|
|
|
|
|
my $method = (exists $self->{'handlers'}->{$nm}) ? ($self->{'handlers'}->{$nm}) : |
316
|
1646
|
50
|
|
|
|
3250
|
(exists $self->{'handlers'}->{'_DEFAULT_'}) ? ($self->{'handlers'}->{'_DEFAULT_'}) : |
|
|
100
|
|
|
|
|
|
317
|
|
|
|
|
|
|
undef; |
318
|
1646
|
50
|
|
|
|
2361
|
if (!$method) { |
319
|
0
|
|
|
|
|
0
|
$self->debug("No handler defined for $nm\n"); |
320
|
0
|
|
|
|
|
0
|
return; |
321
|
|
|
|
|
|
|
}; |
322
|
1646
|
|
|
|
|
2501
|
$self->$method($data); |
323
|
|
|
|
|
|
|
} |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head2 reset_parameters |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
Title : reset_parameters |
328
|
|
|
|
|
|
|
Usage : $handler->reset_parameters() |
329
|
|
|
|
|
|
|
Function: Resets the internal cache of data (normally object parameters for |
330
|
|
|
|
|
|
|
a builder or factory) |
331
|
|
|
|
|
|
|
Returns : None |
332
|
|
|
|
|
|
|
Args : None |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=cut |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub reset_parameters { |
337
|
58
|
|
|
58
|
1
|
82
|
my $self = shift; |
338
|
58
|
|
|
|
|
89
|
$self->{'_params'} = undef; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=head2 format |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
Title : format |
344
|
|
|
|
|
|
|
Usage : $handler->format('GenBank') |
345
|
|
|
|
|
|
|
Function: Get/Set the format for the report/record being parsed. This can be |
346
|
|
|
|
|
|
|
used to set handlers in classes which are capable of processing |
347
|
|
|
|
|
|
|
similar data chunks from multiple driver modules. |
348
|
|
|
|
|
|
|
Returns : String with the sequence format |
349
|
|
|
|
|
|
|
Args : [optional] String with the sequence format |
350
|
|
|
|
|
|
|
Note : The format may be used to set the handlers (as in the |
351
|
|
|
|
|
|
|
current GenericRichSeqHandler implementation) |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=cut |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
sub format { |
356
|
1301
|
|
|
1301
|
1
|
1070
|
my $self = shift; |
357
|
1301
|
100
|
|
|
|
2059
|
return $self->{'_seqformat'} = lc shift if @_; |
358
|
1259
|
|
|
|
|
1858
|
return $self->{'_seqformat'}; |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head2 get_params |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Title : get_params |
364
|
|
|
|
|
|
|
Usage : $handler->get_params('-species') |
365
|
|
|
|
|
|
|
Function: Convenience method used to retrieve the specified |
366
|
|
|
|
|
|
|
parameters from the internal parameter cache |
367
|
|
|
|
|
|
|
Returns : Hash ref containing parameters requested and data as |
368
|
|
|
|
|
|
|
key-value pairs. Note that some parameter values may be |
369
|
|
|
|
|
|
|
objects, arrays, etc. |
370
|
|
|
|
|
|
|
Args : List (array) representing the parameters requested |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
=cut |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
sub get_params { |
375
|
818
|
|
|
818
|
1
|
891
|
my ($self, @ids) = @_; |
376
|
818
|
|
|
|
|
583
|
my %data; |
377
|
818
|
|
|
|
|
901
|
for my $id (@ids) { |
378
|
818
|
50
|
|
|
|
1472
|
if (!index($id, '-')==0) { |
379
|
818
|
|
|
|
|
1413
|
$id = '-'.$id ; |
380
|
|
|
|
|
|
|
} |
381
|
818
|
100
|
|
|
|
2820
|
$data{$id} = $self->{'_params'}->{$id} if (exists $self->{'_params'}->{$id}); |
382
|
|
|
|
|
|
|
} |
383
|
818
|
|
|
|
|
1426
|
return \%data; |
384
|
|
|
|
|
|
|
} |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head2 set_params |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Title : set_params |
389
|
|
|
|
|
|
|
Usage : $handler->set_param({'-species') |
390
|
|
|
|
|
|
|
Function: Convenience method used to set specific parameters |
391
|
|
|
|
|
|
|
Returns : None |
392
|
|
|
|
|
|
|
Args : Hash ref containing the data to be passed as key-value pairs |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=cut |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
sub set_params { |
397
|
0
|
|
|
0
|
1
|
0
|
shift->throw('Not implemented yet!'); |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=head1 Methods unique to this implementation |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=head2 seqbuilder |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
Title : seqbuilder |
405
|
|
|
|
|
|
|
Usage : |
406
|
|
|
|
|
|
|
Function: |
407
|
|
|
|
|
|
|
Returns : |
408
|
|
|
|
|
|
|
Args : |
409
|
|
|
|
|
|
|
Throws : |
410
|
|
|
|
|
|
|
Note : |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=cut |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
sub seqbuilder { |
415
|
104
|
|
|
104
|
1
|
118
|
my $self = shift; |
416
|
104
|
100
|
|
|
|
224
|
return $self->{'_seqbuilder'} = shift if @_; |
417
|
62
|
|
|
|
|
99
|
return $self->{'_seqbuilder'}; |
418
|
|
|
|
|
|
|
} |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=head2 build_sequence |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
Title : build_sequence |
423
|
|
|
|
|
|
|
Usage : |
424
|
|
|
|
|
|
|
Function: |
425
|
|
|
|
|
|
|
Returns : |
426
|
|
|
|
|
|
|
Args : |
427
|
|
|
|
|
|
|
Throws : |
428
|
|
|
|
|
|
|
Note : |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=cut |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
sub build_sequence { |
433
|
62
|
|
|
62
|
1
|
87
|
my $self = shift; |
434
|
62
|
|
|
|
|
126
|
my $builder = $self->seqbuilder(); |
435
|
62
|
|
|
|
|
58
|
my $seq; |
436
|
62
|
100
|
|
|
|
119
|
if (defined($self->{'_params'})) { |
437
|
58
|
|
|
|
|
69
|
$builder->add_slot_value(%{ $self->{'_params'} }); |
|
58
|
|
|
|
|
448
|
|
438
|
58
|
|
|
|
|
172
|
$seq = $builder->make_object(); |
439
|
58
|
|
|
|
|
131
|
$self->reset_parameters; |
440
|
|
|
|
|
|
|
} |
441
|
62
|
100
|
|
|
|
639
|
return $seq if $seq; |
442
|
4
|
|
|
|
|
14
|
return 0; |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=head2 location_factory |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
Title : location_factory |
448
|
|
|
|
|
|
|
Usage : |
449
|
|
|
|
|
|
|
Function: |
450
|
|
|
|
|
|
|
Returns : |
451
|
|
|
|
|
|
|
Args : |
452
|
|
|
|
|
|
|
Throws : |
453
|
|
|
|
|
|
|
Note : |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=cut |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
sub location_factory { |
458
|
42
|
|
|
42
|
1
|
49
|
my ($self, $factory) = @_; |
459
|
42
|
50
|
|
|
|
145
|
if ($factory) { |
|
|
50
|
|
|
|
|
|
460
|
0
|
0
|
0
|
|
|
0
|
$self->throw("Must have a Bio::Factory::LocationFactoryI when ". |
461
|
|
|
|
|
|
|
"explicitly setting factory()") unless |
462
|
|
|
|
|
|
|
(ref($factory) && $factory->isa('Bio::Factory::LocationFactoryI')); |
463
|
0
|
|
|
|
|
0
|
$self->{'_locfactory'} = $factory; |
464
|
|
|
|
|
|
|
} elsif (!defined($self->{'_locfactory'})) { |
465
|
42
|
|
|
|
|
180
|
$self->{'_locfactory'} = Bio::Factory::FTLocationFactory->new() |
466
|
|
|
|
|
|
|
} |
467
|
42
|
|
|
|
|
54
|
return $self->{'_locfactory'}; |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
=head2 annotation_collection |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
Title : annotation_collection |
473
|
|
|
|
|
|
|
Usage : |
474
|
|
|
|
|
|
|
Function: |
475
|
|
|
|
|
|
|
Returns : |
476
|
|
|
|
|
|
|
Args : |
477
|
|
|
|
|
|
|
Throws : |
478
|
|
|
|
|
|
|
Note : |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=cut |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
sub annotation_collection { |
483
|
746
|
|
|
746
|
1
|
644
|
my ($self, $coll) = @_; |
484
|
746
|
50
|
|
|
|
1611
|
if ($coll) { |
|
|
100
|
|
|
|
|
|
485
|
0
|
0
|
0
|
|
|
0
|
$self->throw("Must have Bio::AnnotationCollectionI ". |
486
|
|
|
|
|
|
|
"when explicitly setting collection()") |
487
|
|
|
|
|
|
|
unless (ref($coll) && $coll->isa('Bio::AnnotationCollectionI')); |
488
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-annotation'} = $coll; |
489
|
|
|
|
|
|
|
} elsif (!exists($self->{'_params'}->{'-annotation'})) { |
490
|
57
|
|
|
|
|
276
|
$self->{'_params'}->{'-annotation'} = Bio::Annotation::Collection->new() |
491
|
|
|
|
|
|
|
} |
492
|
746
|
|
|
|
|
2205
|
return $self->{'_params'}->{'-annotation'}; |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
####################### SEQUENCE HANDLERS ####################### |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# any sequence data |
498
|
|
|
|
|
|
|
sub _generic_seq { |
499
|
53
|
|
|
53
|
|
72
|
my ($self, $data) = @_; |
500
|
53
|
|
|
|
|
313
|
$self->{'_params'}->{'-seq'} = $data->{DATA}; |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
####################### RAW DATA HANDLERS ####################### |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# GenBank LOCUS line |
506
|
|
|
|
|
|
|
sub _genbank_locus { |
507
|
31
|
|
|
31
|
|
33
|
my ($self, $data) = @_; |
508
|
31
|
|
|
|
|
189
|
my (@tokens) = split m{\s+}, $data->{DATA}; |
509
|
31
|
|
|
|
|
45
|
my $display_id = shift @tokens; |
510
|
31
|
|
|
|
|
86
|
$self->{'_params'}->{'-display_id'} = $display_id; |
511
|
31
|
|
|
|
|
37
|
my $seqlength = shift @tokens; |
512
|
31
|
50
|
|
|
|
62
|
if (exists $VALID_ALPHABET{$seqlength}) { |
513
|
|
|
|
|
|
|
# moved one token too far. No locus name? |
514
|
|
|
|
|
|
|
$self->warn("Bad LOCUS name? Changing [".$self->{'_params'}->{'-display_id'}. |
515
|
0
|
|
|
|
|
0
|
"] to 'unknown' and length to ".$self->{'_params'}->{'-display_id'}); |
516
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-length'} = $self->{'_params'}->{'-display_id'}; |
517
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-display_id'} = 'unknown'; |
518
|
|
|
|
|
|
|
# add token back... |
519
|
0
|
|
|
|
|
0
|
unshift @tokens, $seqlength; |
520
|
|
|
|
|
|
|
} else { |
521
|
31
|
|
|
|
|
52
|
$self->{'_params'}->{'-length'} = $seqlength; |
522
|
|
|
|
|
|
|
} |
523
|
31
|
|
|
|
|
39
|
my $alphabet = lc(shift @tokens); |
524
|
|
|
|
|
|
|
$self->{'_params'}->{'-alphabet'} = |
525
|
31
|
50
|
|
|
|
78
|
(exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} : |
526
|
|
|
|
|
|
|
$self->warn("Unknown alphabet: $alphabet"); |
527
|
31
|
50
|
66
|
|
|
103
|
if (($self->{'_params'}->{'-alphabet'} eq 'dna') || (@tokens > 2)) { |
528
|
31
|
|
|
|
|
58
|
$self->{'_params'}->{'-molecule'} = shift(@tokens); |
529
|
31
|
|
|
|
|
43
|
my $circ = shift(@tokens); |
530
|
31
|
100
|
|
|
|
54
|
if ($circ eq 'circular') { |
531
|
2
|
|
|
|
|
6
|
$self->{'_params'}->{'-is_circular'} = 1; |
532
|
2
|
|
|
|
|
6
|
$self->{'_params'}->{'-division'} = shift(@tokens); |
533
|
|
|
|
|
|
|
} else { |
534
|
|
|
|
|
|
|
# 'linear' or 'circular' may actually be omitted altogether |
535
|
29
|
100
|
|
|
|
65
|
$self->{'_params'}->{'-division'} = |
536
|
|
|
|
|
|
|
(CORE::length($circ) == 3 ) ? $circ : shift(@tokens); |
537
|
|
|
|
|
|
|
} |
538
|
|
|
|
|
|
|
} else { |
539
|
0
|
0
|
|
|
|
0
|
$self->{'_params'}->{'-molecule'} = 'PRT' if($self->{'_params'}->{'-alphabet'} eq 'aa'); |
540
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-division'} = shift(@tokens); |
541
|
|
|
|
|
|
|
} |
542
|
31
|
|
|
|
|
64
|
my $date = join(' ', @tokens); |
543
|
|
|
|
|
|
|
# maybe use Date::Time for dates? |
544
|
31
|
100
|
66
|
|
|
246
|
if($date && $date =~ s{\s*((\d{1,2})-(\w{3})-(\d{2,4})).*}{$1}) { |
545
|
|
|
|
|
|
|
|
546
|
30
|
50
|
|
|
|
55
|
if( length($date) < 11 ) { |
547
|
|
|
|
|
|
|
# improperly formatted date |
548
|
|
|
|
|
|
|
# But we'll be nice and fix it for them |
549
|
0
|
|
|
|
|
0
|
my ($d,$m,$y) = ($2,$3,$4); |
550
|
0
|
0
|
|
|
|
0
|
if( length($d) == 1 ) { |
551
|
0
|
|
|
|
|
0
|
$d = "0$d"; |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
# guess the century here |
554
|
0
|
0
|
|
|
|
0
|
if( length($y) == 2 ) { |
555
|
0
|
0
|
|
|
|
0
|
if( $y > 60 ) { # arbitrarily guess that '60' means 1960 |
556
|
0
|
|
|
|
|
0
|
$y = "19$y"; |
557
|
|
|
|
|
|
|
} else { |
558
|
0
|
|
|
|
|
0
|
$y = "20$y"; |
559
|
|
|
|
|
|
|
} |
560
|
0
|
|
|
|
|
0
|
$self->warn("Date was malformed, guessing the century for $date to be $y\n"); |
561
|
|
|
|
|
|
|
} |
562
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-dates'} = [join('-',$d,$m,$y)]; |
563
|
|
|
|
|
|
|
} else { |
564
|
30
|
|
|
|
|
94
|
$self->{'_params'}->{'-dates'} = [$date]; |
565
|
|
|
|
|
|
|
} |
566
|
|
|
|
|
|
|
} |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
# EMBL ID line |
570
|
|
|
|
|
|
|
sub _embl_id { |
571
|
10
|
|
|
10
|
|
11
|
my ($self, $data) = @_; |
572
|
10
|
|
|
|
|
13
|
my $alphabet; |
573
|
10
|
|
|
|
|
14
|
my ($name, $sv, $topology, $mol, $div); |
574
|
10
|
|
|
|
|
17
|
my $line = $data->{DATA}; |
575
|
|
|
|
|
|
|
#$self->debug("$line\n"); |
576
|
10
|
|
|
|
|
18
|
my ($idtype) = $line =~ tr/;/;/; |
577
|
10
|
100
|
|
|
|
27
|
if ( $idtype == 6) { # New style headers contain exactly six semicolons. |
|
|
100
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# New style header (EMBL Release >= 87, after June 2006) |
579
|
1
|
|
|
|
|
3
|
my $topology; |
580
|
|
|
|
|
|
|
my $sv; |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# ID DQ299383; SV 1; linear; mRNA; STD; MAM; 431 BP. |
583
|
|
|
|
|
|
|
# This regexp comes from the new2old.pl conversion script, from EBI |
584
|
1
|
50
|
|
|
|
7
|
if ($line =~ m/^(\w+);\s+SV (\d+); (\w+); ([^;]+); (\w{3}); (\w{3}); (\d+) \w{2}\./) { |
585
|
1
|
|
|
|
|
6
|
($name, $sv, $topology, $mol, $div) = ($1, $2, $3, $4, $6); |
586
|
|
|
|
|
|
|
} else { |
587
|
0
|
|
|
|
|
0
|
$self->throw("Unrecognized EMBL ID line:[$line]"); |
588
|
|
|
|
|
|
|
} |
589
|
1
|
50
|
|
|
|
3
|
if (defined($sv)) { |
590
|
1
|
|
|
|
|
3
|
$self->{'_params'}->{'-seq_version'} = $sv; |
591
|
1
|
|
|
|
|
3
|
$self->{'_params'}->{'-version'} = $sv; |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
|
594
|
1
|
50
|
|
|
|
2
|
if ($topology eq "circular") { |
595
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-is_circular'} = 1; |
596
|
|
|
|
|
|
|
} |
597
|
|
|
|
|
|
|
|
598
|
1
|
50
|
|
|
|
3
|
if (defined $mol ) { |
599
|
1
|
50
|
|
|
|
4
|
if ($mol =~ /DNA/) { |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
600
|
0
|
|
|
|
|
0
|
$alphabet='dna'; |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
elsif ($mol =~ /RNA/) { |
603
|
1
|
|
|
|
|
2
|
$alphabet='rna'; |
604
|
|
|
|
|
|
|
} |
605
|
|
|
|
|
|
|
elsif ($mol =~ /AA/) { |
606
|
0
|
|
|
|
|
0
|
$alphabet='protein'; |
607
|
|
|
|
|
|
|
} |
608
|
|
|
|
|
|
|
} |
609
|
|
|
|
|
|
|
} elsif ($idtype) { # has internal ';' |
610
|
|
|
|
|
|
|
# Old style header (EMBL Release < 87, before June 2006) |
611
|
8
|
50
|
|
|
|
55
|
if ($line =~ m{^(\S+)[^;]*;\s+(\S+)[^;]*;\s+(\S+)[^;]*;}) { |
612
|
8
|
|
|
|
|
30
|
($name, $mol, $div) = ($1, $2, $3); |
613
|
|
|
|
|
|
|
#$self->debug("[$name][$mol][$div]"); |
614
|
|
|
|
|
|
|
} |
615
|
|
|
|
|
|
|
|
616
|
8
|
50
|
|
|
|
15
|
if($mol) { |
617
|
8
|
50
|
|
|
|
20
|
if ( $mol =~ m{circular} ) { |
618
|
0
|
|
|
|
|
0
|
$self->{'_params'}->{'-is_circular'} = 1; |
619
|
0
|
|
|
|
|
0
|
$mol =~ s{circular }{}; |
620
|
|
|
|
|
|
|
} |
621
|
8
|
50
|
|
|
|
19
|
if (defined $mol ) { |
622
|
8
|
100
|
|
|
|
26
|
if ($mol =~ /DNA/) { |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
623
|
7
|
|
|
|
|
13
|
$alphabet='dna'; |
624
|
|
|
|
|
|
|
} |
625
|
|
|
|
|
|
|
elsif ($mol =~ /RNA/) { |
626
|
1
|
|
|
|
|
2
|
$alphabet='rna'; |
627
|
|
|
|
|
|
|
} |
628
|
|
|
|
|
|
|
elsif ($mol =~ /AA/) { |
629
|
0
|
|
|
|
|
0
|
$alphabet='protein'; |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
} |
632
|
|
|
|
|
|
|
} |
633
|
|
|
|
|
|
|
} else { |
634
|
1
|
|
|
|
|
2
|
$name = $data->{DATA}; |
635
|
|
|
|
|
|
|
} |
636
|
10
|
50
|
33
|
|
|
52
|
unless( defined $name && length($name) ) { |
637
|
0
|
|
|
|
|
0
|
$name = "unknown_id"; |
638
|
|
|
|
|
|
|
} |
639
|
10
|
|
|
|
|
36
|
$self->{'_params'}->{'-display_id'} = $name; |
640
|
10
|
|
|
|
|
16
|
$self->{'_params'}->{'-alphabet'} = $alphabet; |
641
|
10
|
100
|
|
|
|
26
|
$self->{'_params'}->{'-division'} = $div if $div; |
642
|
10
|
100
|
|
|
|
40
|
$self->{'_params'}->{'-molecule'} = $mol if $mol; |
643
|
|
|
|
|
|
|
} |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
# UniProt/SwissProt ID line |
646
|
|
|
|
|
|
|
sub _swiss_id { |
647
|
17
|
|
|
17
|
|
19
|
my ($self, $data) = @_; |
648
|
17
|
|
|
|
|
18
|
my ($name, $seq_div); |
649
|
17
|
50
|
|
|
|
112
|
if($data->{DATA} =~ m{^ |
650
|
|
|
|
|
|
|
(\S+) \s+ # $1 entryname |
651
|
|
|
|
|
|
|
([^\s;]+); \s+ # $2 DataClass |
652
|
|
|
|
|
|
|
(?:PRT;)? \s+ # Molecule Type (optional) |
653
|
|
|
|
|
|
|
[0-9]+[ ]AA \. # Sequencelength (capture?) |
654
|
|
|
|
|
|
|
$ |
655
|
|
|
|
|
|
|
}ox ) { |
656
|
17
|
|
|
|
|
47
|
($name, $seq_div) = ($1, $2); |
657
|
17
|
50
|
100
|
|
|
127
|
$self->{'_params'}->{'-namespace'} = |
|
|
100
|
66
|
|
|
|
|
658
|
|
|
|
|
|
|
($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' : |
659
|
|
|
|
|
|
|
($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' : |
660
|
|
|
|
|
|
|
$seq_div; |
661
|
|
|
|
|
|
|
# we shouldn't be setting the division, but for now... |
662
|
17
|
|
|
|
|
52
|
my ($junk, $division) = split q(_), $name; |
663
|
17
|
|
|
|
|
44
|
$self->{'_params'}->{'-division'} = $division; |
664
|
17
|
|
|
|
|
28
|
$self->{'_params'}->{'-alphabet'} = 'protein'; |
665
|
|
|
|
|
|
|
# this is important to have the id for display in e.g. FTHelper, otherwise |
666
|
|
|
|
|
|
|
# you won't know which entry caused an error |
667
|
17
|
|
|
|
|
43
|
$self->{'_params'}->{'-display_id'} = $name; |
668
|
|
|
|
|
|
|
} else { |
669
|
0
|
|
|
|
|
0
|
$self->throw("Unrecognized UniProt/SwissProt ID line:[".$data->{DATA}."]"); |
670
|
|
|
|
|
|
|
} |
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# UniProt/SwissProt GN line |
674
|
|
|
|
|
|
|
sub _swiss_genename { |
675
|
17
|
|
|
17
|
|
22
|
my ($self, $data) = @_; |
676
|
|
|
|
|
|
|
#$self->debug(Dumper($data)); |
677
|
17
|
|
|
|
|
38
|
my $genename = $data->{DATA}; |
678
|
17
|
|
|
|
|
17
|
my $gn; |
679
|
17
|
50
|
|
|
|
35
|
if ($genename) { |
680
|
17
|
|
|
|
|
19
|
my @stags; |
681
|
17
|
100
|
|
|
|
56
|
if ($genename =~ /\w=\w/) { |
682
|
|
|
|
|
|
|
# new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP) |
683
|
10
|
|
|
|
|
35
|
for my $n (split(m{\s+and\s+},$genename)) { |
684
|
12
|
|
|
|
|
10
|
my @genenames; |
685
|
12
|
|
|
|
|
50
|
for my $section (split(m{\s*;\s*},$n)) { |
686
|
15
|
|
|
|
|
43
|
my ($tag, $rest) = split("=",$section); |
687
|
15
|
|
50
|
|
|
35
|
$rest ||= ''; |
688
|
15
|
|
|
|
|
49
|
for my $val (split(m{\s*,\s*},$rest)) { |
689
|
19
|
|
|
|
|
47
|
push @genenames, [$tag => $val]; |
690
|
|
|
|
|
|
|
} |
691
|
|
|
|
|
|
|
} |
692
|
12
|
|
|
|
|
37
|
push @stags, ['gene_name' => \@genenames]; |
693
|
|
|
|
|
|
|
} |
694
|
|
|
|
|
|
|
} else { |
695
|
|
|
|
|
|
|
# old format |
696
|
7
|
|
|
|
|
22
|
for my $section (split(/ AND /, $genename)) { |
697
|
9
|
|
|
|
|
9
|
my @genenames; |
698
|
9
|
|
|
|
|
30
|
$section =~ s/[\(\)\.]//g; |
699
|
9
|
|
|
|
|
25
|
my @names = split(m{\s+OR\s+}, $section); |
700
|
9
|
|
|
|
|
37
|
push @genenames, ['Name' => shift @names]; |
701
|
9
|
|
|
|
|
16
|
push @genenames, map {['Synonyms' => $_]} @names; |
|
10
|
|
|
|
|
14
|
|
702
|
9
|
|
|
|
|
45
|
push @stags, ['gene_name' => \@genenames] |
703
|
|
|
|
|
|
|
} |
704
|
|
|
|
|
|
|
} #use Data::Dumper; print Dumper $gn, $genename;# exit; |
705
|
17
|
|
|
|
|
129
|
my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name', |
706
|
|
|
|
|
|
|
-value => ['gene_names' => \@stags]); |
707
|
17
|
|
|
|
|
41
|
$self->annotation_collection->add_Annotation('gene_name', $gn); |
708
|
|
|
|
|
|
|
} |
709
|
|
|
|
|
|
|
} |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
# GenBank VERSION line |
712
|
|
|
|
|
|
|
# old EMBL SV line (now obsolete) |
713
|
|
|
|
|
|
|
# UniProt/SwissProt? |
714
|
|
|
|
|
|
|
sub _generic_version { |
715
|
28
|
|
|
28
|
|
33
|
my ($self, $data) = @_; |
716
|
28
|
|
|
|
|
83
|
my ($acc,$gi) = split(' ',$data->{DATA}); |
717
|
28
|
100
|
|
|
|
106
|
if($acc =~ m{^\w+\.(\d+)}xmso) { |
718
|
27
|
|
|
|
|
64
|
$self->{'_params'}->{'-version'} = $1; |
719
|
27
|
|
|
|
|
55
|
$self->{'_params'}->{'-seq_version'} = $1; |
720
|
|
|
|
|
|
|
} |
721
|
28
|
100
|
66
|
|
|
124
|
if($gi && (index($gi,"GI:") == 0)) { |
722
|
24
|
|
|
|
|
80
|
$self->{'_params'}->{'-primary_id'} = substr($gi,3); |
723
|
|
|
|
|
|
|
} |
724
|
|
|
|
|
|
|
} |
725
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
# EMBL DT lines |
727
|
|
|
|
|
|
|
sub _embl_date { |
728
|
5
|
|
|
5
|
|
8
|
my ($self, $data) = @_; |
729
|
5
|
|
|
|
|
35
|
while ($data->{DATA} =~ m{(\S+)\s\((.*?)\)}g) { |
730
|
10
|
|
|
|
|
24
|
my ($date, $version) = ($1, $2); |
731
|
10
|
|
|
|
|
11
|
$date =~ tr{,}{}d; # remove comma if new version |
732
|
10
|
50
|
|
|
|
54
|
if ($version =~ m{\(Rel\.\s(\d+),\sCreated\)}xmso ) { |
|
|
50
|
|
|
|
|
|
733
|
0
|
|
|
|
|
0
|
my $release = Bio::Annotation::SimpleValue->new( |
734
|
|
|
|
|
|
|
-tagname => 'creation_release', |
735
|
|
|
|
|
|
|
-value => $1 |
736
|
|
|
|
|
|
|
); |
737
|
0
|
|
|
|
|
0
|
$self->annotation_collection->add_Annotation($release); |
738
|
|
|
|
|
|
|
} elsif ($version =~ m{\(Rel\.\s(\d+),\sLast\supdated,\sVersion\s(\d+)\)}xmso ) { |
739
|
0
|
|
|
|
|
0
|
my $release = Bio::Annotation::SimpleValue->new( |
740
|
|
|
|
|
|
|
-tagname => 'update_release', |
741
|
|
|
|
|
|
|
-value => $1 |
742
|
|
|
|
|
|
|
); |
743
|
0
|
|
|
|
|
0
|
$self->annotation_collection->add_Annotation($release); |
744
|
0
|
|
|
|
|
0
|
my $update = Bio::Annotation::SimpleValue->new( |
745
|
|
|
|
|
|
|
-tagname => 'update_version', |
746
|
|
|
|
|
|
|
-value => $2 |
747
|
|
|
|
|
|
|
); |
748
|
0
|
|
|
|
|
0
|
$self->annotation_collection->add_Annotation($update); |
749
|
|
|
|
|
|
|
} |
750
|
10
|
|
|
|
|
10
|
push @{ $self->{'_params'}->{'-dates'} }, $date; |
|
10
|
|
|
|
|
51
|
|
751
|
|
|
|
|
|
|
} |
752
|
|
|
|
|
|
|
} |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# UniProt/SwissProt DT lines |
755
|
|
|
|
|
|
|
sub _swiss_date { |
756
|
17
|
|
|
17
|
|
27
|
my ($self, $data) = @_; |
757
|
|
|
|
|
|
|
# swissprot |
758
|
17
|
|
|
|
|
49
|
my @dls = split m{\n}, $data->{DATA}; |
759
|
17
|
|
|
|
|
44
|
for my $dl (@dls) { |
760
|
51
|
|
|
|
|
104
|
my ($date, $version) = split(' ', $dl, 2); |
761
|
51
|
|
|
|
|
75
|
$date =~ tr{,}{}d; # remove comma if new version |
762
|
51
|
100
|
100
|
|
|
369
|
if ($version =~ m{\(Rel\. (\d+), Last sequence update\)} || # old |
|
|
100
|
100
|
|
|
|
|
763
|
|
|
|
|
|
|
$version =~ m{sequence version (\d+)\.}) { #new |
764
|
17
|
|
|
|
|
96
|
my $update = Bio::Annotation::SimpleValue->new( |
765
|
|
|
|
|
|
|
-tagname => 'seq_update', |
766
|
|
|
|
|
|
|
-value => $1 |
767
|
|
|
|
|
|
|
); |
768
|
17
|
|
|
|
|
44
|
$self->annotation_collection->add_Annotation($update); |
769
|
|
|
|
|
|
|
} elsif ($version =~ m{\(Rel\. (\d+), Last annotation update\)} || #old |
770
|
|
|
|
|
|
|
$version =~ m{entry version (\d+)\.}) { #new |
771
|
17
|
|
|
|
|
58
|
$self->{'_params'}->{'-version'} = $1; |
772
|
17
|
|
|
|
|
28
|
$self->{'_params'}->{'-seq_version'} = $1; |
773
|
|
|
|
|
|
|
} |
774
|
51
|
|
|
|
|
46
|
push @{ $self->{'_params'}->{'-dates'} }, $date; |
|
51
|
|
|
|
|
146
|
|
775
|
|
|
|
|
|
|
} |
776
|
|
|
|
|
|
|
} |
777
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# GenBank KEYWORDS line |
779
|
|
|
|
|
|
|
# EMBL KW line |
780
|
|
|
|
|
|
|
# UniProt/SwissProt KW line |
781
|
|
|
|
|
|
|
sub _generic_keywords { |
782
|
55
|
|
|
55
|
|
73
|
my ($self, $data) = @_; |
783
|
55
|
|
|
|
|
184
|
$data->{DATA} =~ s{\.$}{}; |
784
|
55
|
|
|
|
|
366
|
my @kw = split m{\s*\;\s*}xo ,$data->{DATA}; |
785
|
55
|
|
|
|
|
131
|
$self->{'_params'}->{'-keywords'} = \@kw; |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# GenBank DEFINITION line |
789
|
|
|
|
|
|
|
# EMBL DE line |
790
|
|
|
|
|
|
|
# UniProt/SwissProt DE line |
791
|
|
|
|
|
|
|
sub _generic_description { |
792
|
57
|
|
|
57
|
|
63
|
my ($self, $data) = @_; |
793
|
57
|
|
|
|
|
140
|
$self->{'_params'}->{'-desc'} = $data->{DATA}; |
794
|
|
|
|
|
|
|
} |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
# GenBank ACCESSION line |
797
|
|
|
|
|
|
|
# EMBL AC line |
798
|
|
|
|
|
|
|
# UniProt/SwissProt AC line |
799
|
|
|
|
|
|
|
sub _generic_accession { |
800
|
56
|
|
|
56
|
|
70
|
my ($self, $data) = @_; |
801
|
56
|
|
|
|
|
231
|
my @accs = split m{[\s;]+}, $data->{DATA}; |
802
|
56
|
|
|
|
|
129
|
$self->{'_params'}->{'-accession_number'} = shift @accs; |
803
|
56
|
100
|
|
|
|
170
|
$self->{'_params'}->{'-secondary_accessions'} = \@accs if @accs; |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
####################### SPECIES HANDLERS ####################### |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
# uses Bio::Species |
809
|
|
|
|
|
|
|
# GenBank SOURCE, ORGANISM lines |
810
|
|
|
|
|
|
|
# EMBL O* lines |
811
|
|
|
|
|
|
|
# UniProt/SwissProt O* lines |
812
|
|
|
|
|
|
|
sub _generic_species { |
813
|
56
|
|
|
56
|
|
63
|
my ($self, $data) = @_; |
814
|
|
|
|
|
|
|
|
815
|
56
|
|
|
|
|
102
|
my $seqformat = $self->format; |
816
|
|
|
|
|
|
|
# if data is coming in from GenBank parser... |
817
|
56
|
100
|
66
|
|
|
364
|
if ($seqformat eq 'genbank' && |
818
|
|
|
|
|
|
|
$data->{ORGANISM} =~ m{(.+?)\s(\S+;[^\n\.]+)}ox) { |
819
|
30
|
|
|
|
|
110
|
($data->{ORGANISM}, $data->{CLASSIFICATION}) = ($1, $2); |
820
|
|
|
|
|
|
|
} |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
# SwissProt stuff... |
823
|
|
|
|
|
|
|
# hybrid names in swissprot files are no longer valid per intergration into |
824
|
|
|
|
|
|
|
# UniProt. Files containing these have been split into separate entries, so |
825
|
|
|
|
|
|
|
# it is probably a good idea to update if one has these lingering around... |
826
|
|
|
|
|
|
|
|
827
|
56
|
|
|
|
|
67
|
my $taxid; |
828
|
56
|
100
|
|
|
|
128
|
if ($seqformat eq 'swiss') { |
829
|
17
|
50
|
|
|
|
79
|
if ($data->{DATA} =~ m{^([^,]+)}ox) { |
830
|
17
|
|
|
|
|
54
|
$data->{DATA} = $1; |
831
|
|
|
|
|
|
|
} |
832
|
17
|
100
|
66
|
|
|
107
|
if ($data->{CROSSREF} && $data->{CROSSREF} =~ m{NCBI_TaxID=(\d+)}) { |
833
|
16
|
|
|
|
|
31
|
$taxid = $1; |
834
|
|
|
|
|
|
|
} |
835
|
|
|
|
|
|
|
} |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
my ($sl, $class, $sci_name) = ($data->{DATA}, |
838
|
|
|
|
|
|
|
$data->{CLASSIFICATION}, |
839
|
56
|
|
100
|
|
|
200
|
$data->{ORGANISM} || ''); |
840
|
56
|
|
|
|
|
66
|
my ($organelle,$abbr_name, $common); |
841
|
56
|
|
|
|
|
550
|
my @class = reverse split m{\s*;\s*}, $class; |
842
|
|
|
|
|
|
|
# have to treat swiss different from everything else... |
843
|
56
|
50
|
|
|
|
527
|
if ($sl =~ m{^(mitochondrion|chloroplast|plastid)? # GenBank format |
844
|
|
|
|
|
|
|
\s*(.*?) |
845
|
|
|
|
|
|
|
\s*(?: \( (.*?) \) )?\.?$ |
846
|
|
|
|
|
|
|
}xmso ){ |
847
|
56
|
|
|
|
|
157
|
($organelle, $abbr_name, $common) = ($1, $2, $3); # optional |
848
|
|
|
|
|
|
|
} else { |
849
|
0
|
|
|
|
|
0
|
$abbr_name = $sl; # nothing caught; this is a backup! |
850
|
|
|
|
|
|
|
} |
851
|
|
|
|
|
|
|
# there is no 'abbreviated name' for EMBL |
852
|
56
|
100
|
|
|
|
141
|
$sci_name = $abbr_name if $seqformat ne 'genbank'; |
853
|
56
|
|
100
|
|
|
189
|
$organelle ||= ''; |
854
|
56
|
|
100
|
|
|
119
|
$common ||= ''; |
855
|
56
|
50
|
|
|
|
93
|
$sci_name || return; |
856
|
56
|
|
|
|
|
81
|
unshift @class, $sci_name; |
857
|
|
|
|
|
|
|
# no genus/species parsing here; moving to Bio::Taxon-based taxonomy |
858
|
56
|
|
|
|
|
267
|
my $make = Bio::Species->new(); |
859
|
56
|
|
|
|
|
149
|
$make->scientific_name($sci_name); |
860
|
56
|
50
|
|
|
|
219
|
$make->classification(@class) if @class > 0; |
861
|
56
|
100
|
|
|
|
139
|
$common && $make->common_name( $common ); |
862
|
56
|
50
|
|
|
|
185
|
$abbr_name && $make->name('abbreviated', $abbr_name); |
863
|
56
|
100
|
|
|
|
108
|
$organelle && $make->organelle($organelle); |
864
|
56
|
100
|
|
|
|
109
|
$taxid && $make->ncbi_taxid($taxid); |
865
|
56
|
|
|
|
|
281
|
$self->{'_params'}->{'-species'} = $make; |
866
|
|
|
|
|
|
|
} |
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
####################### ANNOTATION HANDLERS ####################### |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
# GenBank DBSOURCE line |
871
|
|
|
|
|
|
|
sub _genbank_dbsource { |
872
|
1
|
|
|
1
|
|
2
|
my ($self, $data) = @_; |
873
|
1
|
|
|
|
|
2
|
my $dbsource = $data->{DATA}; |
874
|
1
|
|
|
|
|
2
|
my $annotation = $self->annotation_collection; |
875
|
|
|
|
|
|
|
# deal with swissprot dbsources |
876
|
|
|
|
|
|
|
# we could possibly parcel these out to subhandlers... |
877
|
1
|
50
|
|
|
|
4
|
if( $dbsource =~ s/(UniProt(?:KB)|swissprot):\s+locus\s+(\S+)\,.+\n// ) { |
878
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
879
|
|
|
|
|
|
|
('dblink', |
880
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
881
|
|
|
|
|
|
|
(-primary_id => $2, |
882
|
|
|
|
|
|
|
-database => $1, |
883
|
|
|
|
|
|
|
-tagname => 'dblink')); |
884
|
0
|
0
|
|
|
|
0
|
if( $dbsource =~ s/\s*created:\s+([^\.]+)\.\n// ) { |
885
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
886
|
|
|
|
|
|
|
('swissprot_dates', |
887
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new |
888
|
|
|
|
|
|
|
(-tagname => 'date_created', |
889
|
|
|
|
|
|
|
-value => $1)); |
890
|
|
|
|
|
|
|
} |
891
|
0
|
|
|
|
|
0
|
while( $dbsource =~ s/\s*(sequence|annotation)\s+updated:\s+([^\.]+)\.\n//g ) { |
892
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
893
|
|
|
|
|
|
|
('swissprot_dates', |
894
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new |
895
|
|
|
|
|
|
|
(-tagname => 'date_updated', |
896
|
|
|
|
|
|
|
-value => $1)); |
897
|
|
|
|
|
|
|
} |
898
|
0
|
|
|
|
|
0
|
$dbsource =~ s/\n/ /g; |
899
|
0
|
0
|
|
|
|
0
|
if( $dbsource =~ s/\s*xrefs:\s+((?:\S+,\s+)+\S+)\s+xrefs/xrefs/ ) { |
|
|
0
|
|
|
|
|
|
900
|
|
|
|
|
|
|
# will use $i to determine even or odd |
901
|
|
|
|
|
|
|
# for swissprot the accessions are paired |
902
|
0
|
|
|
|
|
0
|
my $i = 0; |
903
|
0
|
|
|
|
|
0
|
for my $dbsrc ( split(/,\s+/,$1) ) { |
904
|
0
|
0
|
0
|
|
|
0
|
if( $dbsrc =~ /(\S+)\.(\d+)/ || $dbsrc =~ /(\S+)/ ) { |
905
|
0
|
|
|
|
|
0
|
my ($id,$version) = ($1,$2); |
906
|
0
|
0
|
|
|
|
0
|
$version ='' unless defined $version; |
907
|
0
|
|
|
|
|
0
|
my $db; |
908
|
0
|
0
|
|
|
|
0
|
if( $id =~ /^\d\S{3}/) { |
909
|
0
|
|
|
|
|
0
|
$db = 'PDB'; |
910
|
|
|
|
|
|
|
} else { |
911
|
0
|
0
|
|
|
|
0
|
$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; |
912
|
|
|
|
|
|
|
} |
913
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
914
|
|
|
|
|
|
|
('dblink', |
915
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
916
|
|
|
|
|
|
|
(-primary_id => $id, |
917
|
|
|
|
|
|
|
-version => $version, |
918
|
|
|
|
|
|
|
-database => $db, |
919
|
|
|
|
|
|
|
-tagname => 'dblink')); |
920
|
|
|
|
|
|
|
} |
921
|
|
|
|
|
|
|
} |
922
|
|
|
|
|
|
|
} elsif( $dbsource =~ s/\s*xrefs:\s+(.+)\s+xrefs/xrefs/i ) { |
923
|
|
|
|
|
|
|
# download screwed up and ncbi didn't put acc in for gi numbers |
924
|
0
|
|
|
|
|
0
|
my $i = 0; |
925
|
0
|
|
|
|
|
0
|
for my $id ( split(/\,\s+/,$1) ) { |
926
|
0
|
|
|
|
|
0
|
my ($acc,$db); |
927
|
0
|
0
|
|
|
|
0
|
if( $id =~ /gi:\s+(\d+)/ ) { |
|
|
0
|
|
|
|
|
|
928
|
0
|
|
|
|
|
0
|
$acc= $1; |
929
|
0
|
0
|
|
|
|
0
|
$db = ($i++ % 2 ) ? 'GenPept' : 'GenBank'; |
930
|
|
|
|
|
|
|
} elsif( $id =~ /pdb\s+accession\s+(\S+)/ ) { |
931
|
0
|
|
|
|
|
0
|
$acc= $1; |
932
|
0
|
|
|
|
|
0
|
$db = 'PDB'; |
933
|
|
|
|
|
|
|
} else { |
934
|
0
|
|
|
|
|
0
|
$acc= $id; |
935
|
0
|
|
|
|
|
0
|
$db = ''; |
936
|
|
|
|
|
|
|
} |
937
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
938
|
|
|
|
|
|
|
('dblink', |
939
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
940
|
|
|
|
|
|
|
(-primary_id => $acc, |
941
|
|
|
|
|
|
|
-database => $db, |
942
|
|
|
|
|
|
|
-tagname => 'dblink')); |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
} else { |
945
|
0
|
|
|
|
|
0
|
$self->warn("Cannot match $dbsource\n"); |
946
|
|
|
|
|
|
|
} |
947
|
0
|
0
|
|
|
|
0
|
if( $dbsource =~ s/xrefs\s+\(non\-sequence\s+databases\):\s+ |
948
|
|
|
|
|
|
|
((?:\S+,\s+)+\S+)//x ) { |
949
|
0
|
|
|
|
|
0
|
for my $id ( split(/\,\s+/,$1) ) { |
950
|
0
|
|
|
|
|
0
|
my $db; |
951
|
|
|
|
|
|
|
# this is because GenBank dropped the spaces!!! |
952
|
|
|
|
|
|
|
# I'm sure we're not going to get this right |
953
|
|
|
|
|
|
|
##if( $id =~ s/^://i ) { |
954
|
|
|
|
|
|
|
## $db = $1; |
955
|
|
|
|
|
|
|
##} |
956
|
0
|
|
|
|
|
0
|
$db = substr($id,0,index($id,':')); |
957
|
0
|
0
|
|
|
|
0
|
if (! exists $DBSOURCE{ $db }) { |
958
|
0
|
|
|
|
|
0
|
$db = ''; # do we want 'GenBank' here? |
959
|
|
|
|
|
|
|
} |
960
|
0
|
|
|
|
|
0
|
$id = substr($id,index($id,':')+1); |
961
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation |
962
|
|
|
|
|
|
|
('dblink',Bio::Annotation::DBLink->new |
963
|
|
|
|
|
|
|
(-primary_id => $id, |
964
|
|
|
|
|
|
|
-database => $db, |
965
|
|
|
|
|
|
|
-tagname => 'dblink')); |
966
|
|
|
|
|
|
|
} |
967
|
|
|
|
|
|
|
} |
968
|
|
|
|
|
|
|
} else { |
969
|
1
|
50
|
|
|
|
6
|
if( $dbsource =~ /^(\S*?):?\s*accession\s+(\S+)\.(\d+)/ ) { |
|
|
0
|
|
|
|
|
|
970
|
1
|
|
|
|
|
2
|
my ($db,$id,$version) = ($1,$2,$3); |
971
|
1
|
|
50
|
|
|
11
|
$annotation->add_Annotation |
972
|
|
|
|
|
|
|
('dblink', |
973
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new |
974
|
|
|
|
|
|
|
(-primary_id => $id, |
975
|
|
|
|
|
|
|
-version => $version, |
976
|
|
|
|
|
|
|
-database => $db || 'GenBank', |
977
|
|
|
|
|
|
|
-tagname => 'dblink')); |
978
|
|
|
|
|
|
|
} elsif ( $dbsource =~ /(\S+)([\.:])(\d+)/ ) { |
979
|
0
|
|
|
|
|
0
|
my ($id, $db, $version); |
980
|
0
|
0
|
|
|
|
0
|
if ($2 eq ':') { |
981
|
0
|
|
|
|
|
0
|
($db, $id) = ($1, $3); |
982
|
|
|
|
|
|
|
} else { |
983
|
0
|
|
|
|
|
0
|
($db, $id, $version) = ('GenBank', $1, $3); |
984
|
|
|
|
|
|
|
} |
985
|
0
|
|
|
|
|
0
|
$annotation->add_Annotation('dblink', |
986
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
987
|
|
|
|
|
|
|
-primary_id => $id, |
988
|
|
|
|
|
|
|
-version => $version, |
989
|
|
|
|
|
|
|
-database => $db, |
990
|
|
|
|
|
|
|
-tagname => 'dblink') |
991
|
|
|
|
|
|
|
); |
992
|
|
|
|
|
|
|
} else { |
993
|
0
|
|
|
|
|
0
|
$self->warn("Unrecognized DBSOURCE data: $dbsource\n"); |
994
|
|
|
|
|
|
|
} |
995
|
|
|
|
|
|
|
} |
996
|
|
|
|
|
|
|
} |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
# EMBL DR lines |
999
|
|
|
|
|
|
|
# UniProt/SwissProt DR lines |
1000
|
|
|
|
|
|
|
sub _generic_dbsource { |
1001
|
20
|
|
|
20
|
|
25
|
my ($self, $data) = @_; |
1002
|
|
|
|
|
|
|
#$self->debug(Dumper($data)); |
1003
|
20
|
|
|
|
|
124
|
while ($data->{DATA} =~ m{([^\n]+)}og) { |
1004
|
351
|
|
|
|
|
599
|
my $dblink = $1; |
1005
|
351
|
|
|
|
|
800
|
$dblink =~ s{\.$}{}; |
1006
|
351
|
|
|
|
|
308
|
my $link; |
1007
|
351
|
|
|
|
|
649
|
my @linkdata = split '; ',$dblink; |
1008
|
351
|
50
|
|
|
|
1221
|
if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) { |
1009
|
|
|
|
|
|
|
#if ( $dblink =~ m{([^\s;]+);\s*([^\s;]+);?\s*([^\s;]+)?}) { |
1010
|
351
|
|
|
|
|
631
|
my ($databse, $prim_id, $sec_id) = ($1,$2,$3); |
1011
|
351
|
|
|
|
|
990
|
$link = Bio::Annotation::DBLink->new(-database => $databse, |
1012
|
|
|
|
|
|
|
-primary_id => $prim_id, |
1013
|
|
|
|
|
|
|
-optional_id => $sec_id); |
1014
|
|
|
|
|
|
|
} else { |
1015
|
0
|
|
|
|
|
0
|
$self->warn("No match for $dblink"); |
1016
|
|
|
|
|
|
|
} |
1017
|
351
|
|
|
|
|
591
|
$self->annotation_collection->add_Annotation('dblink', $link); |
1018
|
|
|
|
|
|
|
} |
1019
|
|
|
|
|
|
|
} |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
# GenBank REFERENCE and related lines |
1023
|
|
|
|
|
|
|
# EMBL R* lines |
1024
|
|
|
|
|
|
|
# UniProt/SwissProt R* lines |
1025
|
|
|
|
|
|
|
sub _generic_reference { |
1026
|
301
|
|
|
301
|
|
252
|
my ($self, $data) = @_; |
1027
|
301
|
|
|
|
|
400
|
my $seqformat = $self->format; |
1028
|
301
|
|
|
|
|
262
|
my ($start, $end); |
1029
|
|
|
|
|
|
|
# get these in EMBL/Swiss |
1030
|
301
|
100
|
|
|
|
448
|
if ($data->{CROSSREF}) { |
1031
|
128
|
|
|
|
|
615
|
while ($data->{CROSSREF} =~ m{(pubmed|doi|medline)(?:=|;\s+)(\S+)}oig) { |
1032
|
199
|
|
|
|
|
407
|
my ($db, $ref) = (uc $1, $2); |
1033
|
199
|
|
|
|
|
407
|
$ref =~ s{[;.]+$}{}; |
1034
|
199
|
|
|
|
|
548
|
$data->{$db} = $ref; |
1035
|
|
|
|
|
|
|
} |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
# run some cleanup for swissprot |
1038
|
301
|
100
|
|
|
|
457
|
if ($seqformat eq 'swiss') { |
1039
|
109
|
|
|
|
|
144
|
for my $val (values %{ $data }) { |
|
109
|
|
|
|
|
261
|
|
1040
|
959
|
|
|
|
|
983
|
$val =~ s{;$}{}; |
1041
|
959
|
|
|
|
|
956
|
$val =~ s{(\w-)\s}{$1}; |
1042
|
|
|
|
|
|
|
} |
1043
|
|
|
|
|
|
|
} |
1044
|
301
|
100
|
|
|
|
430
|
if ( $data->{POSITION} ) { |
1045
|
127
|
100
|
|
|
|
421
|
if ($seqformat eq 'embl') { |
|
|
100
|
|
|
|
|
|
1046
|
18
|
|
|
|
|
53
|
($start, $end) = split '-', $data->{POSITION},2; |
1047
|
|
|
|
|
|
|
} elsif ($data->{POSITION} =~ m{.+? OF (\d+)-(\d+).*}) { #swiss |
1048
|
23
|
|
|
|
|
42
|
($start, $end) = ($1, $2); |
1049
|
|
|
|
|
|
|
} |
1050
|
|
|
|
|
|
|
} |
1051
|
301
|
100
|
|
|
|
649
|
if ($data->{DATA} =~ m{^\d+\s+\([a-z]+\s+(\d+)\s+to\s+(\d+)\)}xmso) { |
1052
|
59
|
|
|
|
|
98
|
($start, $end) = ($1, $2); |
1053
|
|
|
|
|
|
|
} |
1054
|
|
|
|
|
|
|
my $ref = Bio::Annotation::Reference->new( |
1055
|
|
|
|
|
|
|
-comment => $data->{REMARK}, |
1056
|
|
|
|
|
|
|
-location => $data->{JOURNAL}, |
1057
|
|
|
|
|
|
|
-pubmed => $data->{PUBMED}, |
1058
|
|
|
|
|
|
|
-consortium => $data->{CONSRTM}, |
1059
|
|
|
|
|
|
|
-title => $data->{TITLE}, |
1060
|
|
|
|
|
|
|
-authors => $data->{AUTHORS}, |
1061
|
|
|
|
|
|
|
-medline => $data->{MEDLINE}, |
1062
|
|
|
|
|
|
|
-doi => $data->{DOI}, |
1063
|
|
|
|
|
|
|
-rp => $data->{POSITION}, # JIC... |
1064
|
301
|
|
|
|
|
2455
|
-start => $start, |
1065
|
|
|
|
|
|
|
-end => $end, |
1066
|
|
|
|
|
|
|
); |
1067
|
301
|
100
|
|
|
|
1093
|
if ($data->{DATA} =~ m{^\d+\s+\((.*)\)}xmso) { |
1068
|
59
|
|
|
|
|
147
|
$ref->gb_reference($1); |
1069
|
|
|
|
|
|
|
} |
1070
|
301
|
|
|
|
|
503
|
$self->annotation_collection->add_Annotation('reference', $ref); |
1071
|
|
|
|
|
|
|
} |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
# GenBank COMMENT lines |
1074
|
|
|
|
|
|
|
# EMBL CC lines |
1075
|
|
|
|
|
|
|
# UniProt/SwissProt CC lines |
1076
|
|
|
|
|
|
|
sub _generic_comment { |
1077
|
43
|
|
|
43
|
|
53
|
my ($self, $data) = @_; |
1078
|
|
|
|
|
|
|
$self->annotation_collection->add_Annotation('comment', |
1079
|
43
|
|
|
|
|
84
|
Bio::Annotation::Comment->new( -text => $data->{DATA} )); |
1080
|
|
|
|
|
|
|
} |
1081
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
####################### SEQFEATURE HANDLER ####################### |
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
# GenBank Feature Table |
1085
|
|
|
|
|
|
|
sub _generic_seqfeatures { |
1086
|
848
|
|
|
848
|
|
690
|
my ($self, $data) = @_; |
1087
|
848
|
100
|
|
|
|
1377
|
return if $data->{FEATURE_KEY} eq 'FEATURES'; |
1088
|
818
|
|
|
|
|
720
|
my $primary_tag = $data->{FEATURE_KEY}; |
1089
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
# grab the NCBI taxon ID from the source SF |
1091
|
818
|
100
|
66
|
|
|
1310
|
if ($primary_tag eq 'source' && exists $data->{'db_xref'}) { |
1092
|
36
|
100
|
66
|
|
|
220
|
if ( $self->{'_params'}->{'-species'} && |
1093
|
|
|
|
|
|
|
$data->{'db_xref'} =~ m{taxon:(\d+)}xmso ) { |
1094
|
35
|
|
|
|
|
112
|
$self->{'_params'}->{'-species'}->ncbi_taxid($1); |
1095
|
|
|
|
|
|
|
} |
1096
|
|
|
|
|
|
|
} |
1097
|
818
|
|
|
|
|
1161
|
my $source = $self->format; |
1098
|
|
|
|
|
|
|
|
1099
|
818
|
|
|
|
|
654
|
my $seqid = ${ $self->get_params('accession_number') }{'accession_number'}; |
|
818
|
|
|
|
|
1026
|
|
1100
|
|
|
|
|
|
|
|
1101
|
818
|
|
|
|
|
878
|
my $loc; |
1102
|
818
|
|
|
|
|
838
|
eval { |
1103
|
818
|
|
|
|
|
2061
|
$loc = $self->{'_locfactory'}->from_string($data->{'LOCATION'}); |
1104
|
|
|
|
|
|
|
}; |
1105
|
818
|
50
|
|
|
|
1399
|
if(! $loc) { |
1106
|
|
|
|
|
|
|
$self->warn("exception while parsing location line [" . |
1107
|
|
|
|
|
|
|
$data->{'LOCATION'} . |
1108
|
|
|
|
|
|
|
"] in reading $source, ignoring feature " . |
1109
|
0
|
|
|
|
|
0
|
$data->{'primary_tag'}. |
1110
|
|
|
|
|
|
|
" (seqid=" . $seqid . "): " . $@); |
1111
|
0
|
|
|
|
|
0
|
return; |
1112
|
|
|
|
|
|
|
} |
1113
|
818
|
50
|
33
|
|
|
1282
|
if($seqid && (! $loc->is_remote())) { |
1114
|
0
|
|
|
|
|
0
|
$loc->seq_id($seqid); # propagates if it is a split location |
1115
|
|
|
|
|
|
|
} |
1116
|
818
|
|
|
|
|
1919
|
my $sf = Bio::SeqFeature::Generic->direct_new(); |
1117
|
818
|
|
|
|
|
1208
|
$sf->location($loc); |
1118
|
818
|
|
|
|
|
1231
|
$sf->primary_tag($primary_tag); |
1119
|
818
|
|
|
|
|
1222
|
$sf->seq_id($seqid); |
1120
|
818
|
|
|
|
|
1106
|
$sf->source_tag($source); |
1121
|
818
|
|
|
|
|
977
|
delete $data->{'FEATURE_KEY'}; |
1122
|
818
|
|
|
|
|
808
|
delete $data->{'LOCATION'}; |
1123
|
818
|
|
|
|
|
835
|
delete $data->{'NAME'}; |
1124
|
818
|
|
|
|
|
597
|
delete $data->{'DATA'}; |
1125
|
818
|
|
|
|
|
1788
|
$sf->set_attributes(-tag => $data); |
1126
|
818
|
|
|
|
|
902
|
push @{ $self->{'_params'}->{'-features'} }, $sf; |
|
818
|
|
|
|
|
2462
|
|
1127
|
|
|
|
|
|
|
} |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
####################### ODDS AND ENDS ####################### |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
# Those things that don't fit anywhere else. If a specific name |
1132
|
|
|
|
|
|
|
# maps to the below table, that class and method are used, otherwise |
1133
|
|
|
|
|
|
|
# it goes into a SimpleValue (I think this is a good argument for why |
1134
|
|
|
|
|
|
|
# we need a generic mechanism for storing annotation) |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
sub _generic_simplevalue { |
1137
|
16
|
|
|
16
|
|
16
|
my ($self, $data) = @_; |
1138
|
|
|
|
|
|
|
$self->annotation_collection->add_Annotation( |
1139
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}), |
1140
|
|
|
|
|
|
|
-value => $data->{DATA}) |
1141
|
16
|
|
|
|
|
41
|
); |
1142
|
|
|
|
|
|
|
} |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
15
|
0
|
|
sub noop {} |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
sub _debug { |
1147
|
0
|
|
|
0
|
|
|
my ($self, $data) = @_; |
1148
|
0
|
|
|
|
|
|
$self->debug(Dumper($data)); |
1149
|
|
|
|
|
|
|
} |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
1; |