| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO::locuslink |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Cared for by Keith Ching |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Copyright Keith Ching |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# |
|
13
|
|
|
|
|
|
|
# (c) Keith Ching, kching at gnf.org, 2002. |
|
14
|
|
|
|
|
|
|
# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. |
|
15
|
|
|
|
|
|
|
# |
|
16
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself. |
|
17
|
|
|
|
|
|
|
# Refer to the Perl Artistic License (see the license accompanying this |
|
18
|
|
|
|
|
|
|
# software package, or see http://www.perl.com/language/misc/Artistic.html) |
|
19
|
|
|
|
|
|
|
# for the terms under which you may use, modify, and redistribute this module. |
|
20
|
|
|
|
|
|
|
# |
|
21
|
|
|
|
|
|
|
# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED |
|
22
|
|
|
|
|
|
|
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF |
|
23
|
|
|
|
|
|
|
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 NAME |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Bio::SeqIO::locuslink - LocusLink input/output stream |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# don't instantiate directly - instead do |
|
35
|
|
|
|
|
|
|
my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN); |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This module parses LocusLink into Bio::SeqI objects with rich |
|
40
|
|
|
|
|
|
|
annotation, but no sequence. |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
The input file has to be in the LL_tmpl format - the tabular format |
|
43
|
|
|
|
|
|
|
will not work. |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
The way the current implementation populates the object is rather a |
|
46
|
|
|
|
|
|
|
draft work than a finished work of art. Note that at this stage the |
|
47
|
|
|
|
|
|
|
LocusLink entries cannot be round-tripped, because the parser loses |
|
48
|
|
|
|
|
|
|
certain information. For instance, most of the alternative transcript |
|
49
|
|
|
|
|
|
|
descriptions are not retained. The parser also misses any element |
|
50
|
|
|
|
|
|
|
that deals with visual representation (e.g., 'button') except for the |
|
51
|
|
|
|
|
|
|
URLs. Almost all of the pieces of the annotation are kept in a |
|
52
|
|
|
|
|
|
|
Bio::Annotation::Collection object, see L |
|
53
|
|
|
|
|
|
|
for more information. |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head1 FEEDBACK |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head2 Mailing Lists |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
|
60
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to |
|
61
|
|
|
|
|
|
|
the Bioperl mailing list. Your participation is much appreciated. |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
|
64
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 Support |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
I |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
|
73
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
|
74
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
|
75
|
|
|
|
|
|
|
with code and data examples if at all possible. |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 Reporting Bugs |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
|
80
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via |
|
81
|
|
|
|
|
|
|
the web: |
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=head1 AUTHOR - Keith Ching |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
Email kching at gnf.org |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Hilmar Lapp, hlapp at gmx.net |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head1 APPENDIX |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
|
96
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=cut |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
package Bio::SeqIO::locuslink; |
|
101
|
|
|
|
|
|
|
|
|
102
|
1
|
|
|
1
|
|
435
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
22
|
|
|
103
|
|
|
|
|
|
|
|
|
104
|
1
|
|
|
1
|
|
232
|
use Bio::Seq::SeqFactory; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
21
|
|
|
105
|
1
|
|
|
1
|
|
254
|
use Bio::Species; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
20
|
|
|
106
|
1
|
|
|
1
|
|
260
|
use Bio::Annotation::DBLink; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
19
|
|
|
107
|
|
|
|
|
|
|
#use Bio::Annotation::Reference; |
|
108
|
1
|
|
|
1
|
|
232
|
use Bio::Annotation::Comment; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
20
|
|
|
109
|
1
|
|
|
1
|
|
239
|
use Bio::Annotation::SimpleValue; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
20
|
|
|
110
|
1
|
|
|
1
|
|
266
|
use Bio::Annotation::OntologyTerm; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
19
|
|
|
111
|
1
|
|
|
1
|
|
289
|
use Bio::Annotation::Collection; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
20
|
|
|
112
|
|
|
|
|
|
|
|
|
113
|
1
|
|
|
1
|
|
4
|
use base qw(Bio::SeqIO); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
327
|
|
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
# list of all the field names in locuslink |
|
116
|
|
|
|
|
|
|
my @locuslink_keys = qw( |
|
117
|
|
|
|
|
|
|
ACCNUM |
|
118
|
|
|
|
|
|
|
ALIAS_PROT |
|
119
|
|
|
|
|
|
|
ALIAS_SYMBOL |
|
120
|
|
|
|
|
|
|
ASSEMBLY |
|
121
|
|
|
|
|
|
|
BUTTON |
|
122
|
|
|
|
|
|
|
CDD |
|
123
|
|
|
|
|
|
|
CHR |
|
124
|
|
|
|
|
|
|
COMP |
|
125
|
|
|
|
|
|
|
CONTIG |
|
126
|
|
|
|
|
|
|
CURRENT_LOCUSID |
|
127
|
|
|
|
|
|
|
DB_DESCR |
|
128
|
|
|
|
|
|
|
DB_LINK |
|
129
|
|
|
|
|
|
|
ECNUM |
|
130
|
|
|
|
|
|
|
EVID |
|
131
|
|
|
|
|
|
|
EXTANNOT |
|
132
|
|
|
|
|
|
|
GO |
|
133
|
|
|
|
|
|
|
GRIF |
|
134
|
|
|
|
|
|
|
LINK |
|
135
|
|
|
|
|
|
|
LOCUSID |
|
136
|
|
|
|
|
|
|
LOCUS_CONFIRMED |
|
137
|
|
|
|
|
|
|
LOCUS_TYPE |
|
138
|
|
|
|
|
|
|
MAP |
|
139
|
|
|
|
|
|
|
MAPLINK |
|
140
|
|
|
|
|
|
|
NC |
|
141
|
|
|
|
|
|
|
NG |
|
142
|
|
|
|
|
|
|
NM |
|
143
|
|
|
|
|
|
|
NP |
|
144
|
|
|
|
|
|
|
NR |
|
145
|
|
|
|
|
|
|
OFFICIAL_GENE_NAME |
|
146
|
|
|
|
|
|
|
OFFICIAL_SYMBOL |
|
147
|
|
|
|
|
|
|
OMIM |
|
148
|
|
|
|
|
|
|
ORGANISM |
|
149
|
|
|
|
|
|
|
PHENOTYPE |
|
150
|
|
|
|
|
|
|
PHENOTYPE_ID |
|
151
|
|
|
|
|
|
|
PMID |
|
152
|
|
|
|
|
|
|
PREFERRED_GENE_NAME |
|
153
|
|
|
|
|
|
|
PREFERRED_PRODUCT |
|
154
|
|
|
|
|
|
|
PREFERRED_SYMBOL |
|
155
|
|
|
|
|
|
|
PRODUCT |
|
156
|
|
|
|
|
|
|
PROT |
|
157
|
|
|
|
|
|
|
RELL |
|
158
|
|
|
|
|
|
|
STATUS |
|
159
|
|
|
|
|
|
|
STS |
|
160
|
|
|
|
|
|
|
SUMFUNC |
|
161
|
|
|
|
|
|
|
SUMMARY |
|
162
|
|
|
|
|
|
|
TRANSVAR |
|
163
|
|
|
|
|
|
|
TYPE |
|
164
|
|
|
|
|
|
|
UNIGENE |
|
165
|
|
|
|
|
|
|
XG |
|
166
|
|
|
|
|
|
|
XM |
|
167
|
|
|
|
|
|
|
XP |
|
168
|
|
|
|
|
|
|
XR |
|
169
|
|
|
|
|
|
|
); |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# list of fields to make simple annotations from |
|
172
|
|
|
|
|
|
|
# fields not listed here or as a key in feature hash are ignored (lost). |
|
173
|
|
|
|
|
|
|
my %anntype_map = ( |
|
174
|
|
|
|
|
|
|
SimpleValue => [qw( |
|
175
|
|
|
|
|
|
|
ALIAS_PROT |
|
176
|
|
|
|
|
|
|
ALIAS_SYMBOL |
|
177
|
|
|
|
|
|
|
CDD |
|
178
|
|
|
|
|
|
|
CHR |
|
179
|
|
|
|
|
|
|
CURRENT_LOCUSID |
|
180
|
|
|
|
|
|
|
ECNUM |
|
181
|
|
|
|
|
|
|
EXTANNOT |
|
182
|
|
|
|
|
|
|
MAP |
|
183
|
|
|
|
|
|
|
NC |
|
184
|
|
|
|
|
|
|
NR |
|
185
|
|
|
|
|
|
|
OFFICIAL_GENE_NAME |
|
186
|
|
|
|
|
|
|
OFFICIAL_SYMBOL |
|
187
|
|
|
|
|
|
|
PHENOTYPE |
|
188
|
|
|
|
|
|
|
PREFERRED_GENE_NAME |
|
189
|
|
|
|
|
|
|
PREFERRED_PRODUCT |
|
190
|
|
|
|
|
|
|
PREFERRED_SYMBOL |
|
191
|
|
|
|
|
|
|
PRODUCT |
|
192
|
|
|
|
|
|
|
RELL |
|
193
|
|
|
|
|
|
|
SUMFUNC |
|
194
|
|
|
|
|
|
|
) |
|
195
|
|
|
|
|
|
|
], |
|
196
|
|
|
|
|
|
|
Comment => [qw( |
|
197
|
|
|
|
|
|
|
SUMMARY |
|
198
|
|
|
|
|
|
|
) |
|
199
|
|
|
|
|
|
|
], |
|
200
|
|
|
|
|
|
|
); |
|
201
|
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# certain fields are not named the same as the symgene database list |
|
204
|
|
|
|
|
|
|
my %dbname_map = ( |
|
205
|
|
|
|
|
|
|
pfam => 'Pfam', |
|
206
|
|
|
|
|
|
|
smart => 'SMART', |
|
207
|
|
|
|
|
|
|
NM => 'RefSeq', |
|
208
|
|
|
|
|
|
|
NP => 'RefSeq', |
|
209
|
|
|
|
|
|
|
XP => 'RefSeq', |
|
210
|
|
|
|
|
|
|
XM => 'RefSeq', |
|
211
|
|
|
|
|
|
|
NG => 'RefSeq', |
|
212
|
|
|
|
|
|
|
XG => 'RefSeq', |
|
213
|
|
|
|
|
|
|
XR => 'RefSeq', |
|
214
|
|
|
|
|
|
|
PROT => 'GenBank', |
|
215
|
|
|
|
|
|
|
ACCNUM => 'GenBank', |
|
216
|
|
|
|
|
|
|
CONTIG => 'GenBank', |
|
217
|
|
|
|
|
|
|
# certain fields are not named the same as the symgene |
|
218
|
|
|
|
|
|
|
# database list: rename the fields the symgene database name |
|
219
|
|
|
|
|
|
|
# key = field name in locuslink |
|
220
|
|
|
|
|
|
|
# value = database name in sym |
|
221
|
|
|
|
|
|
|
#GO => 'GO', |
|
222
|
|
|
|
|
|
|
OMIM => 'MIM', |
|
223
|
|
|
|
|
|
|
GRIF => 'GRIF', |
|
224
|
|
|
|
|
|
|
STS => 'STS', |
|
225
|
|
|
|
|
|
|
UNIGENE => 'UniGene', |
|
226
|
|
|
|
|
|
|
); |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# certain CDD entries use the wrong prefix for the accession number |
|
229
|
|
|
|
|
|
|
# cddprefix will replace the key w/ the value for these entries |
|
230
|
|
|
|
|
|
|
my %cddprefix = ( |
|
231
|
|
|
|
|
|
|
pfam => 'PF', |
|
232
|
|
|
|
|
|
|
smart => 'SM', |
|
233
|
|
|
|
|
|
|
); |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
# alternate mappings if one field does not exist |
|
236
|
|
|
|
|
|
|
my %alternate_map = ( |
|
237
|
|
|
|
|
|
|
OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME', |
|
238
|
|
|
|
|
|
|
OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL', |
|
239
|
|
|
|
|
|
|
); |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
# for these field names, we only care about the first value X in value X|Y|Z |
|
242
|
|
|
|
|
|
|
my @ll_firstelements = qw( |
|
243
|
|
|
|
|
|
|
NM |
|
244
|
|
|
|
|
|
|
NP |
|
245
|
|
|
|
|
|
|
NG |
|
246
|
|
|
|
|
|
|
XG |
|
247
|
|
|
|
|
|
|
XM |
|
248
|
|
|
|
|
|
|
XP |
|
249
|
|
|
|
|
|
|
XR |
|
250
|
|
|
|
|
|
|
PROT |
|
251
|
|
|
|
|
|
|
STS |
|
252
|
|
|
|
|
|
|
ACCNUM |
|
253
|
|
|
|
|
|
|
CONTIG |
|
254
|
|
|
|
|
|
|
GRIF |
|
255
|
|
|
|
|
|
|
); |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# these fields need to be flattened into a single string, using the given |
|
258
|
|
|
|
|
|
|
# join string |
|
259
|
|
|
|
|
|
|
my %flatten_tags = ( |
|
260
|
|
|
|
|
|
|
ASSEMBLY => ',', |
|
261
|
|
|
|
|
|
|
ORGANISM => '', # this should occur only once |
|
262
|
|
|
|
|
|
|
OFFICIAL_SYMBOL => '', # this should occur only once |
|
263
|
|
|
|
|
|
|
OFFICIAL_GENE_NAME => '', # this should occur only once |
|
264
|
|
|
|
|
|
|
LOCUSID => '', # this should occur only once |
|
265
|
|
|
|
|
|
|
PMID => ',', |
|
266
|
|
|
|
|
|
|
PREFERRED_SYMBOL => ', ', |
|
267
|
|
|
|
|
|
|
PREFERRED_GENE_NAME => ', ' |
|
268
|
|
|
|
|
|
|
); |
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# set the default search pattern for all the field names |
|
271
|
|
|
|
|
|
|
my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys; |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
sub _initialize { |
|
274
|
1
|
|
|
1
|
|
2
|
my($self,@args) = @_; |
|
275
|
|
|
|
|
|
|
|
|
276
|
1
|
|
|
|
|
4
|
$self->SUPER::_initialize(@args); |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
# overwrite the search pattern w/ the first value pattern |
|
279
|
1
|
|
|
|
|
1
|
foreach my $key(@ll_firstelements){ |
|
280
|
12
|
|
|
|
|
18
|
$feature_pat_map{$key}="^$key: ([^|]+)"; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# special search pattern for cdd entries |
|
284
|
1
|
|
|
|
|
3
|
foreach my $key(keys %cddprefix) { |
|
285
|
2
|
|
|
|
|
4
|
$feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)'; |
|
286
|
|
|
|
|
|
|
} |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# special patterns for specific fields |
|
289
|
1
|
|
|
|
|
2
|
$feature_pat_map{MAP} = '^MAP: (.+?)\|'; |
|
290
|
1
|
|
|
|
|
2
|
$feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|'; |
|
291
|
1
|
|
|
|
|
1
|
$feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|'; |
|
292
|
1
|
|
|
|
|
1
|
$feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|'; |
|
293
|
1
|
|
|
|
|
2
|
$feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|'; |
|
294
|
1
|
|
|
|
|
2
|
$feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+'; |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# set the sequence factory of none has been set already |
|
297
|
1
|
50
|
|
|
|
3
|
if(! $self->sequence_factory()) { |
|
298
|
1
|
|
|
|
|
6
|
$self->sequence_factory(Bio::Seq::SeqFactory->new( |
|
299
|
|
|
|
|
|
|
-type => 'Bio::Seq::RichSeq')); |
|
300
|
|
|
|
|
|
|
} |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
######################### |
|
305
|
|
|
|
|
|
|
# |
|
306
|
|
|
|
|
|
|
sub search_pattern{ |
|
307
|
|
|
|
|
|
|
# |
|
308
|
|
|
|
|
|
|
######################### |
|
309
|
114
|
|
|
114
|
0
|
91
|
my ($self, |
|
310
|
|
|
|
|
|
|
$entry, #text to search |
|
311
|
|
|
|
|
|
|
$searchconfirm, #to make sure you got the right thing |
|
312
|
|
|
|
|
|
|
$searchpattern, |
|
313
|
|
|
|
|
|
|
$searchtype) = @_; |
|
314
|
114
|
|
|
|
|
1548
|
my @query = $entry=~/$searchpattern/gm; |
|
315
|
114
|
50
|
|
|
|
206
|
if ($searchconfirm ne "FALSE"){ |
|
316
|
0
|
0
|
|
|
|
0
|
$self->warn("No $searchtype found\n$entry\n") unless @query; |
|
317
|
0
|
|
|
|
|
0
|
foreach (@query){ |
|
318
|
0
|
0
|
|
|
|
0
|
if (!($_=~/$searchconfirm/)){ |
|
319
|
0
|
|
|
|
|
0
|
$self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n"); |
|
320
|
|
|
|
|
|
|
} |
|
321
|
|
|
|
|
|
|
}#endforeach |
|
322
|
|
|
|
|
|
|
}#endsearchconfirm |
|
323
|
114
|
|
|
|
|
186
|
return(@query); |
|
324
|
|
|
|
|
|
|
}#endsub |
|
325
|
|
|
|
|
|
|
############ |
|
326
|
|
|
|
|
|
|
# |
|
327
|
|
|
|
|
|
|
sub read_species{ |
|
328
|
|
|
|
|
|
|
# |
|
329
|
|
|
|
|
|
|
############ |
|
330
|
2
|
|
|
2
|
0
|
2
|
my ($spline)=@_; |
|
331
|
2
|
|
|
|
|
1
|
my $species; |
|
332
|
|
|
|
|
|
|
my $genus; |
|
333
|
2
|
|
|
|
|
11
|
($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/; |
|
334
|
2
|
|
|
|
|
11
|
my $make = Bio::Species->new(); |
|
335
|
2
|
|
|
|
|
6
|
$make->classification( ($species,$genus) ); |
|
336
|
2
|
|
|
|
|
8
|
return $make; |
|
337
|
|
|
|
|
|
|
} |
|
338
|
|
|
|
|
|
|
################ |
|
339
|
|
|
|
|
|
|
# |
|
340
|
|
|
|
|
|
|
sub read_dblink{ |
|
341
|
|
|
|
|
|
|
# |
|
342
|
|
|
|
|
|
|
################ |
|
343
|
36
|
|
|
36
|
0
|
31
|
my ($ann,$db,$ref)=@_; |
|
344
|
36
|
100
|
|
|
|
54
|
my @results=$ref ? @$ref : (); |
|
345
|
36
|
|
|
|
|
31
|
foreach my $id(@results){ |
|
346
|
50
|
50
|
|
|
|
83
|
if($id){ |
|
347
|
50
|
|
|
|
|
124
|
$ann->add_Annotation('dblink', |
|
348
|
|
|
|
|
|
|
Bio::Annotation::DBLink->new( |
|
349
|
|
|
|
|
|
|
-database =>$db , |
|
350
|
|
|
|
|
|
|
-primary_id =>$id)); |
|
351
|
|
|
|
|
|
|
} |
|
352
|
|
|
|
|
|
|
} |
|
353
|
36
|
|
|
|
|
43
|
return($ann); |
|
354
|
|
|
|
|
|
|
} |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
################ |
|
357
|
|
|
|
|
|
|
# |
|
358
|
|
|
|
|
|
|
sub read_reference{ |
|
359
|
|
|
|
|
|
|
# |
|
360
|
|
|
|
|
|
|
################ |
|
361
|
2
|
|
|
2
|
0
|
5
|
my ($ann,$db,$results)=@_; |
|
362
|
|
|
|
|
|
|
|
|
363
|
2
|
50
|
|
|
|
4
|
if($results){ |
|
364
|
2
|
|
|
|
|
3
|
chomp($results); |
|
365
|
2
|
|
|
|
|
6
|
my @ids=split(/,/,$results); |
|
366
|
2
|
50
|
|
|
|
6
|
$ann = read_dblink($ann,$db,\@ids) if @ids; |
|
367
|
|
|
|
|
|
|
} |
|
368
|
2
|
|
|
|
|
5
|
return $ann; |
|
369
|
|
|
|
|
|
|
}#endsub |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
################ |
|
372
|
|
|
|
|
|
|
# |
|
373
|
|
|
|
|
|
|
sub add_annotation{ |
|
374
|
|
|
|
|
|
|
# |
|
375
|
|
|
|
|
|
|
################ |
|
376
|
43
|
|
|
43
|
0
|
44
|
my ($ac,$type,$text,$anntype)=@_; |
|
377
|
43
|
|
|
|
|
28
|
my @args; |
|
378
|
|
|
|
|
|
|
|
|
379
|
43
|
50
|
|
|
|
49
|
$anntype = 'SimpleValue' unless $anntype; |
|
380
|
|
|
|
|
|
|
SWITCH : { |
|
381
|
43
|
100
|
|
|
|
31
|
$anntype eq 'SimpleValue' && do { |
|
|
43
|
|
|
|
|
60
|
|
|
382
|
41
|
|
|
|
|
67
|
push(@args, -value => $text, -tagname => $type); |
|
383
|
41
|
|
|
|
|
32
|
last SWITCH; |
|
384
|
|
|
|
|
|
|
}; |
|
385
|
2
|
50
|
|
|
|
4
|
$anntype eq 'Comment' && do { |
|
386
|
2
|
|
|
|
|
4
|
push(@args, -text => $text, -tagname => 'comment'); |
|
387
|
2
|
|
|
|
|
3
|
last SWITCH; |
|
388
|
|
|
|
|
|
|
}; |
|
389
|
|
|
|
|
|
|
} |
|
390
|
43
|
|
|
|
|
92
|
$ac->add_Annotation("Bio::Annotation::$anntype"->new(@args)); |
|
391
|
43
|
|
|
|
|
81
|
return($ac); |
|
392
|
|
|
|
|
|
|
}#endsub |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
################ |
|
395
|
|
|
|
|
|
|
# |
|
396
|
|
|
|
|
|
|
sub add_annotation_ref{ |
|
397
|
|
|
|
|
|
|
# |
|
398
|
|
|
|
|
|
|
################ |
|
399
|
4
|
|
|
4
|
0
|
3
|
my ($ann,$type,$textref)=@_; |
|
400
|
4
|
50
|
|
|
|
10
|
my @text=$textref ? @$textref : (); |
|
401
|
|
|
|
|
|
|
|
|
402
|
4
|
|
|
|
|
5
|
foreach my $text(@text){ |
|
403
|
22
|
|
|
|
|
51
|
$ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text)); |
|
404
|
|
|
|
|
|
|
} |
|
405
|
4
|
|
|
|
|
6
|
return($ann); |
|
406
|
|
|
|
|
|
|
}#endsub |
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
################ |
|
409
|
|
|
|
|
|
|
# |
|
410
|
|
|
|
|
|
|
sub make_unique{ |
|
411
|
|
|
|
|
|
|
# |
|
412
|
|
|
|
|
|
|
############## |
|
413
|
2
|
|
|
2
|
0
|
2
|
my ($ann,$key) = @_; |
|
414
|
|
|
|
|
|
|
|
|
415
|
2
|
|
|
|
|
3
|
my %seen = (); |
|
416
|
2
|
|
|
|
|
4
|
foreach my $dbl ($ann->remove_Annotations($key)) { |
|
417
|
50
|
100
|
|
|
|
59
|
if(!exists($seen{$dbl->as_text()})) { |
|
418
|
43
|
|
|
|
|
46
|
$seen{$dbl->as_text()} = 1; |
|
419
|
43
|
|
|
|
|
55
|
$ann->add_Annotation($dbl); |
|
420
|
|
|
|
|
|
|
} |
|
421
|
|
|
|
|
|
|
} |
|
422
|
2
|
|
|
|
|
7
|
return $ann; |
|
423
|
|
|
|
|
|
|
} |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
################ |
|
426
|
|
|
|
|
|
|
# |
|
427
|
|
|
|
|
|
|
sub next_seq{ |
|
428
|
|
|
|
|
|
|
# |
|
429
|
|
|
|
|
|
|
############## |
|
430
|
3
|
|
|
3
|
1
|
10
|
my ($self, @args)=@_; |
|
431
|
3
|
|
|
|
|
2
|
my (@results,$search,$ref,$cddref); |
|
432
|
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
# LOCUSLINK entries begin w/ >> |
|
434
|
3
|
|
|
|
|
11
|
local $/="\n>>"; |
|
435
|
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
# slurp in a whole entry and return if no more entries |
|
437
|
3
|
100
|
|
|
|
8
|
return unless my $entry = $self->_readline; |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
# strip the leading '>>' if it's the first entry |
|
440
|
2
|
100
|
|
|
|
6
|
if (index($entry,'>>') == 0) { #first entry |
|
441
|
1
|
|
|
|
|
3
|
$entry = substr($entry,2); |
|
442
|
|
|
|
|
|
|
} |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# we aren't interested in obsoleted entries, so we need to loop |
|
445
|
|
|
|
|
|
|
# and skip those until we've found the next not obsoleted |
|
446
|
2
|
|
|
|
|
3
|
my %record = (); |
|
447
|
2
|
|
33
|
|
|
12
|
while($entry && ($entry =~ /\w/)) { |
|
448
|
2
|
50
|
|
|
|
5
|
if (!($entry=~/LOCUSID/)){ |
|
449
|
0
|
|
|
|
|
0
|
$self->throw("No LOCUSID in first line of record. ". |
|
450
|
|
|
|
|
|
|
"Not LocusLink in my book."); |
|
451
|
|
|
|
|
|
|
} |
|
452
|
|
|
|
|
|
|
# see whether it's an obsoleted entry, and if so jump to the next |
|
453
|
|
|
|
|
|
|
# one entry right away |
|
454
|
2
|
50
|
|
|
|
7
|
if($entry =~ /^CURRENT_LOCUSID:/m) { |
|
455
|
|
|
|
|
|
|
# read next entry and continue |
|
456
|
0
|
|
|
|
|
0
|
$entry = $self->_readline; |
|
457
|
0
|
|
|
|
|
0
|
%record = (); |
|
458
|
0
|
|
|
|
|
0
|
next; |
|
459
|
|
|
|
|
|
|
} |
|
460
|
|
|
|
|
|
|
# loop through list of features and get field values |
|
461
|
|
|
|
|
|
|
# place into record hash as array refs |
|
462
|
2
|
|
|
|
|
12
|
foreach my $key (keys %feature_pat_map){ |
|
463
|
114
|
|
|
|
|
89
|
$search=$feature_pat_map{$key}; |
|
464
|
114
|
|
|
|
|
132
|
@results=$self->search_pattern($entry,'FALSE',$search,$search); |
|
465
|
114
|
100
|
|
|
|
237
|
$record{$key} = @results ? [@results] : undef; |
|
466
|
|
|
|
|
|
|
}#endfor |
|
467
|
|
|
|
|
|
|
# terminate loop as this one hasn't been obsoleted |
|
468
|
2
|
|
|
|
|
7
|
last; |
|
469
|
|
|
|
|
|
|
} |
|
470
|
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
# we have reached the end-of-file ... |
|
472
|
2
|
50
|
|
|
|
4
|
return unless %record; |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
# special processing for CDD entries like pfam and smart |
|
475
|
2
|
|
|
|
|
2
|
my ($PRESENT,@keep); |
|
476
|
2
|
|
|
|
|
5
|
foreach my $key(keys %cddprefix){ |
|
477
|
|
|
|
|
|
|
#print "check CDD $key\n"; |
|
478
|
4
|
100
|
|
|
|
7
|
if($record{$key}) { |
|
479
|
2
|
|
|
|
|
3
|
@keep=(); |
|
480
|
2
|
|
|
|
|
2
|
foreach my $list (@{$record{$key}}) { |
|
|
2
|
|
|
|
|
4
|
|
|
481
|
|
|
|
|
|
|
# replace AC with correct AC number |
|
482
|
4
|
|
|
|
|
6
|
push(@keep,$cddprefix{$key}.$list); |
|
483
|
|
|
|
|
|
|
} |
|
484
|
|
|
|
|
|
|
# replace CDD ref with correctly prefixed AC number |
|
485
|
2
|
|
|
|
|
5
|
$record{$key} = [@keep]; |
|
486
|
|
|
|
|
|
|
} |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
# modify CDD references @=(); |
|
489
|
2
|
50
|
|
|
|
3
|
if($record{CDD}) { |
|
490
|
2
|
|
|
|
|
3
|
@keep=(); |
|
491
|
2
|
|
|
|
|
2
|
foreach my $cdd (@{$record{CDD}}) { |
|
|
2
|
|
|
|
|
3
|
|
|
492
|
8
|
|
|
|
|
5
|
$PRESENT = undef; |
|
493
|
8
|
|
|
|
|
9
|
foreach my $key (keys %cddprefix) { |
|
494
|
16
|
100
|
|
|
|
74
|
if ($cdd=~/$key/){ |
|
495
|
4
|
|
|
|
|
2
|
$PRESENT = 1; |
|
496
|
4
|
|
|
|
|
6
|
last; |
|
497
|
|
|
|
|
|
|
} |
|
498
|
|
|
|
|
|
|
} |
|
499
|
8
|
100
|
|
|
|
14
|
push(@keep,$cdd) if(! $PRESENT); |
|
500
|
|
|
|
|
|
|
} |
|
501
|
2
|
|
|
|
|
3
|
$record{CDD} = [@keep]; |
|
502
|
|
|
|
|
|
|
} |
|
503
|
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
# create annotation collection - we'll need it now |
|
505
|
2
|
|
|
|
|
9
|
my $ann = Bio::Annotation::Collection->new(); |
|
506
|
|
|
|
|
|
|
|
|
507
|
2
|
|
|
|
|
7
|
foreach my $field(keys %dbname_map){ |
|
508
|
32
|
|
|
|
|
43
|
$ann=read_dblink($ann,$dbname_map{$field},$record{$field}); |
|
509
|
|
|
|
|
|
|
} |
|
510
|
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# add GO link as an OntologyTerm annotation |
|
512
|
2
|
50
|
|
|
|
6
|
if($record{GO}) { |
|
513
|
2
|
|
|
|
|
3
|
for(my $j = 0; $j < @{$record{GO}}; $j++) { |
|
|
10
|
|
|
|
|
20
|
|
|
514
|
|
|
|
|
|
|
my $goann = Bio::Annotation::OntologyTerm->new( |
|
515
|
|
|
|
|
|
|
-identifier => $record{GO}->[$j], |
|
516
|
|
|
|
|
|
|
-name => $record{GO_DESC}->[$j], |
|
517
|
8
|
|
|
|
|
32
|
-ontology => $record{GO_CAT}->[$j]); |
|
518
|
8
|
|
|
|
|
17
|
$ann->add_Annotation($goann); |
|
519
|
|
|
|
|
|
|
} |
|
520
|
|
|
|
|
|
|
} |
|
521
|
|
|
|
|
|
|
|
|
522
|
2
|
|
|
|
|
5
|
$ann=add_annotation_ref($ann,'URL',$record{LINK}); |
|
523
|
2
|
|
|
|
|
4
|
$ann=add_annotation_ref($ann,'URL',$record{DB_LINK}); |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
# everything else gets a simple tag or comment value annotation |
|
526
|
2
|
|
|
|
|
4
|
foreach my $anntype (keys %anntype_map) { |
|
527
|
4
|
|
|
|
|
3
|
foreach my $key (@{$anntype_map{$anntype}}){ |
|
|
4
|
|
|
|
|
6
|
|
|
528
|
40
|
100
|
|
|
|
59
|
if($record{$key}){ |
|
529
|
27
|
|
|
|
|
21
|
foreach (@{$record{$key}}){ |
|
|
27
|
|
|
|
|
32
|
|
|
530
|
|
|
|
|
|
|
#print "$key\t\t$_\n"; |
|
531
|
43
|
|
|
|
|
47
|
$ann=add_annotation($ann,$key,$_,$anntype); |
|
532
|
|
|
|
|
|
|
} |
|
533
|
|
|
|
|
|
|
} |
|
534
|
|
|
|
|
|
|
} |
|
535
|
|
|
|
|
|
|
} |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
# flatten designated attributes into a scalar value |
|
538
|
2
|
|
|
|
|
6
|
foreach my $field (keys %flatten_tags) { |
|
539
|
16
|
100
|
|
|
|
22
|
if($record{$field}) { |
|
540
|
|
|
|
|
|
|
$record{$field} = join($flatten_tags{$field}, |
|
541
|
12
|
|
|
|
|
10
|
@{$record{$field}}); |
|
|
12
|
|
|
|
|
19
|
|
|
542
|
|
|
|
|
|
|
} |
|
543
|
|
|
|
|
|
|
} |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# annotation that expects the array flattened out |
|
546
|
2
|
|
|
|
|
18
|
$ann=read_reference($ann,'PUBMED',$record{PMID}); |
|
547
|
2
|
50
|
|
|
|
22
|
if($record{ASSEMBLY}) { |
|
548
|
2
|
|
|
|
|
6
|
my @assembly=split(/,/,$record{ASSEMBLY}); |
|
549
|
2
|
|
|
|
|
3
|
$ann=read_dblink($ann,'GenBank',\@assembly); |
|
550
|
|
|
|
|
|
|
} |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# replace fields w/ alternate if original does not exist |
|
553
|
2
|
|
|
|
|
5
|
foreach my $fieldval (keys %alternate_map){ |
|
554
|
4
|
50
|
33
|
|
|
10
|
if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){ |
|
555
|
0
|
|
|
|
|
0
|
$record{$fieldval}=$record{$alternate_map{$fieldval}}; |
|
556
|
|
|
|
|
|
|
} |
|
557
|
|
|
|
|
|
|
} |
|
558
|
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
# presently we can't store types or context of dblinks - therefore |
|
560
|
|
|
|
|
|
|
# we need to remove duplicates that only differ in context |
|
561
|
2
|
|
|
|
|
4
|
make_unique($ann,'dblink'); |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
# create sequence object (i.e., let seq.factory create one) |
|
564
|
|
|
|
|
|
|
my $seq = $self->sequence_factory->create( |
|
565
|
|
|
|
|
|
|
-verbose => $self->verbose(), |
|
566
|
|
|
|
|
|
|
-accession_number => $record{LOCUSID}, |
|
567
|
|
|
|
|
|
|
-desc => $record{OFFICIAL_GENE_NAME}, |
|
568
|
|
|
|
|
|
|
-display_id => $record{OFFICIAL_SYMBOL}, |
|
569
|
2
|
|
|
|
|
5
|
-species => read_species($record{ORGANISM}), |
|
570
|
|
|
|
|
|
|
-annotation => $ann); |
|
571
|
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
# dump out object contents |
|
573
|
|
|
|
|
|
|
# show_obj([$seq]); |
|
574
|
|
|
|
|
|
|
|
|
575
|
2
|
|
|
|
|
27
|
return($seq); |
|
576
|
|
|
|
|
|
|
} |
|
577
|
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
################ |
|
579
|
|
|
|
|
|
|
# |
|
580
|
|
|
|
|
|
|
sub show_obj{ |
|
581
|
|
|
|
|
|
|
# |
|
582
|
|
|
|
|
|
|
################ |
|
583
|
0
|
|
|
0
|
0
|
|
my ($seqlistref)=@_; |
|
584
|
0
|
|
|
|
|
|
my @list=@$seqlistref; |
|
585
|
0
|
|
|
|
|
|
my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' ); |
|
586
|
0
|
|
|
|
|
|
my ($ann,@values,$val); |
|
587
|
|
|
|
|
|
|
|
|
588
|
0
|
|
|
|
|
|
foreach my $seq(@list){ |
|
589
|
0
|
|
|
|
|
|
$out->write_seq($seq); |
|
590
|
0
|
|
|
|
|
|
$ann=$seq->annotation; |
|
591
|
0
|
|
|
|
|
|
foreach my $key ( $ann->get_all_annotation_keys() ) { |
|
592
|
0
|
|
|
|
|
|
@values = $ann->get_Annotations($key); |
|
593
|
0
|
|
|
|
|
|
foreach my $value ( @values ) { |
|
594
|
|
|
|
|
|
|
# value is an Bio::AnnotationI, and defines a "as_text" method |
|
595
|
0
|
|
|
|
|
|
$val=$value->as_text; |
|
596
|
0
|
|
|
|
|
|
print "Annotation ",$key,"\t\t",$val,"\n"; |
|
597
|
|
|
|
|
|
|
} |
|
598
|
|
|
|
|
|
|
} |
|
599
|
|
|
|
|
|
|
} |
|
600
|
|
|
|
|
|
|
}#endsub |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
1; |