line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO::swiss |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Copyright Elia Stupka |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 NAME |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
Bio::SeqIO::swiss - Swissprot sequence input/output stream |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 SYNOPSIS |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
It is probably best not to use this object directly, but |
17
|
|
|
|
|
|
|
rather go through the SeqIO handler system: |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
use Bio::SeqIO; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
$stream = Bio::SeqIO->new(-file => $filename, |
22
|
|
|
|
|
|
|
-format => 'swiss'); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
while ( my $seq = $stream->next_seq() ) { |
25
|
|
|
|
|
|
|
# do something with $seq |
26
|
|
|
|
|
|
|
} |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 DESCRIPTION |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
This object can transform Bio::Seq objects to and from Swiss-Pprot flat |
31
|
|
|
|
|
|
|
file databases. |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
There is a lot of flexibility here about how to dump things which needs |
34
|
|
|
|
|
|
|
to be documented. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head2 GN (Gene name) line management details |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
A Uniprot/Swiss-Prot entry holds information on one protein |
39
|
|
|
|
|
|
|
sequence. If that sequence is identical across genes and species, they |
40
|
|
|
|
|
|
|
are all merged into one entry. This creates complex needs for several |
41
|
|
|
|
|
|
|
annotation fields in swiss-prot format. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
The latest syntax for GN line is described in the user manual: |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
http://www.expasy.ch/sprot/userman.html#GN_line |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
Each of the possibly multiple genes in an entry can have Name, |
48
|
|
|
|
|
|
|
Synonyms (only if there is a name), OrderedLocusNames (names from |
49
|
|
|
|
|
|
|
genomic sequences) and ORFNames (temporary or cosmid names). "Name" |
50
|
|
|
|
|
|
|
here really means "symbol". This complexity is now dealt with the |
51
|
|
|
|
|
|
|
following way: |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
A new Bio::AnnotationI class was created in order to store the |
54
|
|
|
|
|
|
|
data in tag-value pairs. This class (Bio::Annotation::TagTree) |
55
|
|
|
|
|
|
|
is stored in the Bio::Annotation::Collection object and is |
56
|
|
|
|
|
|
|
accessed like all other annotations. The tag name is 'gene_name'. |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
There is a single Bio::Annotation::TagTree per sequence record, which |
59
|
|
|
|
|
|
|
corresponds to the original class that stored this data |
60
|
|
|
|
|
|
|
(Bio::Annotation::StructuredValue). Depending on how we progress |
61
|
|
|
|
|
|
|
this may change to represent each group of gene names. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
For now, to access the gene name tree annotation, one uses the below method: |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
my ($gene) = $seq->annotation->get_Annotations('gene_name'); |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
If you are only interested in displaying the values, value() returns a |
68
|
|
|
|
|
|
|
string with similar formatting. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
There are several ways to get directly at the information you want if you |
71
|
|
|
|
|
|
|
know the element (tag) for the data. For gene names all data is stored with |
72
|
|
|
|
|
|
|
the element-tag pairs: |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
"element1=tag1, tag2, tag3; element2=tag4, tag5;" |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
This normally means the element will be 'Name', 'Synonyms', etc. and the |
77
|
|
|
|
|
|
|
gene names the values. Using findval(), you can do the following: |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# grab a flattened list of all gene names |
80
|
|
|
|
|
|
|
my @names = $ann->findval('Name'); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# or iterated through the nodes and grab the name for each group |
83
|
|
|
|
|
|
|
for my $node ($ann->findnode('gene_name')) { |
84
|
|
|
|
|
|
|
my @names = $node->findval('Name'); |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
The current method for parsing gene name data (and reconstructing gene name |
88
|
|
|
|
|
|
|
output) is very generic. This is somewhat preemptive if, for instance, UniProt |
89
|
|
|
|
|
|
|
decides to update and add another element name to the current ones using the |
90
|
|
|
|
|
|
|
same formatting layout. Under those circumstances, one can iterate through the |
91
|
|
|
|
|
|
|
tag tree in a safe way and retrieve all node data like so. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# retrieve the gene name nodes (groups like names, synonyms, etc). |
94
|
|
|
|
|
|
|
for my $ann ($seq->annotation->get_Annotations('gene_name')) { |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
# each gene name group |
97
|
|
|
|
|
|
|
for my $node ($ann->findnode('gene_name')) { |
98
|
|
|
|
|
|
|
print "Gene name:\n"; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# each gene name node (tag => value pair) |
101
|
|
|
|
|
|
|
for my $n ($node->children) { |
102
|
|
|
|
|
|
|
print "\t".$n->element.": ".$n->children."\n"; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
For more uses see Bio::Annotation::TagTree. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
Since Uniprot/Swiss-Prot format have been around for quite some time, the |
110
|
|
|
|
|
|
|
parser is also able to read in the older GN line syntax where genes |
111
|
|
|
|
|
|
|
are separated by AND and various symbols by OR. The first symbol is |
112
|
|
|
|
|
|
|
taken to be the 'Name' and the remaining ones are stored as 'Synonyms'. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
Also, for UniProt output we support using other Bio::AnnotationI, but in this |
115
|
|
|
|
|
|
|
case we only use the stringified version of the annotation. This is to allow for |
116
|
|
|
|
|
|
|
backwards compatibility with code that previously used |
117
|
|
|
|
|
|
|
Bio::Annotation::SimpleValue or other Bio::AnnotationI classes. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head2 Optional functions |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=over 3 |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=item _show_dna() |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
(output only) shows the dna or not |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=item _post_sort() |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
(output only) provides a sorting func which is applied to the FTHelpers |
130
|
|
|
|
|
|
|
before printing |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=item _id_generation_func() |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
This is function which is called as |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
print "ID ", $func($seq), "\n"; |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
To generate the ID line. If it is not there, it generates a sensible ID |
139
|
|
|
|
|
|
|
line using a number of tools. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
If you want to output annotations in Swissprot format they need to be |
142
|
|
|
|
|
|
|
stored in a Bio::Annotation::Collection object which is accessible |
143
|
|
|
|
|
|
|
through the Bio::SeqI interface method L. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
The following are the names of the keys which are polled from a |
146
|
|
|
|
|
|
|
L object. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
reference - Should contain Bio::Annotation::Reference objects |
149
|
|
|
|
|
|
|
comment - Should contain Bio::Annotation::Comment objects |
150
|
|
|
|
|
|
|
dblink - Should contain Bio::Annotation::DBLink objects |
151
|
|
|
|
|
|
|
gene_name - Should contain Bio::Annotation::SimpleValue object |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=back |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=head1 FEEDBACK |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 Mailing Lists |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this |
160
|
|
|
|
|
|
|
and other Bioperl modules. Send your comments and suggestions, |
161
|
|
|
|
|
|
|
preferably to one of the Bioperl mailing lists. |
162
|
|
|
|
|
|
|
Your participation is much appreciated. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
165
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=head2 Support |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
I |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
174
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
175
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
176
|
|
|
|
|
|
|
with code and data examples if at all possible. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=head2 Reporting Bugs |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
181
|
|
|
|
|
|
|
the bugs and their resolution. |
182
|
|
|
|
|
|
|
Bug reports can be submitted via the web: |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 AUTHOR - Elia Stupka |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
Email elia@tll.org.sg |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head1 APPENDIX |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
193
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=cut |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# Let the code begin... |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
package Bio::SeqIO::swiss; |
200
|
5
|
|
|
5
|
|
680
|
use vars qw(@Unknown_names @Unknown_genus); |
|
5
|
|
|
|
|
14
|
|
|
5
|
|
|
|
|
333
|
|
201
|
5
|
|
|
5
|
|
31
|
use strict; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
145
|
|
202
|
5
|
|
|
5
|
|
780
|
use Bio::SeqIO::FTHelper; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
126
|
|
203
|
5
|
|
|
5
|
|
27
|
use Bio::SeqFeature::Generic; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
94
|
|
204
|
5
|
|
|
5
|
|
855
|
use Bio::Species; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
135
|
|
205
|
5
|
|
|
5
|
|
2539
|
use Bio::Tools::SeqStats; |
|
5
|
|
|
|
|
17
|
|
|
5
|
|
|
|
|
192
|
|
206
|
5
|
|
|
5
|
|
37
|
use Bio::Seq::SeqFactory; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
130
|
|
207
|
5
|
|
|
5
|
|
24
|
use Bio::Annotation::Collection; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
108
|
|
208
|
5
|
|
|
5
|
|
763
|
use Bio::Annotation::Comment; |
|
5
|
|
|
|
|
11
|
|
|
5
|
|
|
|
|
117
|
|
209
|
5
|
|
|
5
|
|
747
|
use Bio::Annotation::Reference; |
|
5
|
|
|
|
|
10
|
|
|
5
|
|
|
|
|
119
|
|
210
|
5
|
|
|
5
|
|
27
|
use Bio::Annotation::DBLink; |
|
5
|
|
|
|
|
9
|
|
|
5
|
|
|
|
|
117
|
|
211
|
5
|
|
|
5
|
|
25
|
use Bio::Annotation::SimpleValue; |
|
5
|
|
|
|
|
9
|
|
|
5
|
|
|
|
|
103
|
|
212
|
5
|
|
|
5
|
|
1166
|
use Bio::Annotation::TagTree; |
|
5
|
|
|
|
|
12
|
|
|
5
|
|
|
|
|
265
|
|
213
|
|
|
|
|
|
|
|
214
|
5
|
|
|
5
|
|
29
|
use base qw(Bio::SeqIO); |
|
5
|
|
|
|
|
63
|
|
|
5
|
|
|
|
|
29607
|
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
our $LINE_LENGTH = 76; |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# this is for doing species name parsing |
219
|
|
|
|
|
|
|
@Unknown_names=('other', 'unidentified', |
220
|
|
|
|
|
|
|
'unknown organism', 'not specified', |
221
|
|
|
|
|
|
|
'not shown', 'Unspecified', 'Unknown', |
222
|
|
|
|
|
|
|
'None', 'unclassified', 'unidentified organism', |
223
|
|
|
|
|
|
|
'not supplied' |
224
|
|
|
|
|
|
|
); |
225
|
|
|
|
|
|
|
# dictionary of synonyms for taxid 32644 |
226
|
|
|
|
|
|
|
# all above can be part of valid species name |
227
|
|
|
|
|
|
|
@Unknown_genus = qw(unknown unclassified uncultured unidentified); |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# if there are any other gene name tags, they are added to the end |
230
|
|
|
|
|
|
|
our @GENE_NAME_ORDER = qw(Name Synonyms OrderedLocusNames ORFNames); |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub _initialize { |
233
|
23
|
|
|
23
|
|
84
|
my($self,@args) = @_; |
234
|
23
|
|
|
|
|
142
|
$self->SUPER::_initialize(@args); |
235
|
|
|
|
|
|
|
# hash for functions for decoding keys. |
236
|
23
|
|
|
|
|
113
|
$self->{'_func_ftunit_hash'} = {}; |
237
|
|
|
|
|
|
|
# sets this to one by default. People can change it |
238
|
23
|
|
|
|
|
141
|
$self->_show_dna(1); |
239
|
23
|
50
|
|
|
|
113
|
if ( ! defined $self->sequence_factory ) { |
240
|
23
|
|
|
|
|
95
|
$self->sequence_factory(Bio::Seq::SeqFactory->new |
241
|
|
|
|
|
|
|
(-verbose => $self->verbose(), |
242
|
|
|
|
|
|
|
-type => 'Bio::Seq::RichSeq')); |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=head2 next_seq |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Title : next_seq |
249
|
|
|
|
|
|
|
Usage : $seq = $stream->next_seq() |
250
|
|
|
|
|
|
|
Function: returns the next sequence in the stream |
251
|
|
|
|
|
|
|
Returns : Bio::Seq object |
252
|
|
|
|
|
|
|
Args : |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=cut |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
sub next_seq { |
257
|
36
|
|
|
36
|
1
|
4420
|
my ($self,@args) = @_; |
258
|
36
|
|
|
|
|
111
|
my ($pseq,$c,$line,$name,$desc,$acc,$seqc,$mol,$div, $sptr,$seq_div, |
259
|
|
|
|
|
|
|
$date,$comment,@date_arr); |
260
|
36
|
|
|
|
|
77
|
my $genename = ""; |
261
|
36
|
|
|
|
|
259
|
my ($annotation, %params, @features) = ( Bio::Annotation::Collection->new()); |
262
|
|
|
|
|
|
|
|
263
|
36
|
|
|
|
|
76
|
local $_; |
264
|
|
|
|
|
|
|
|
265
|
36
|
|
100
|
|
|
211
|
1 while defined($_ = $self->_readline) && /^\s+$/; |
266
|
36
|
100
|
66
|
|
|
259
|
return unless defined $_ && /^ID\s/; |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# fixed to allow _DIVISION to be optional for bug #946 |
269
|
|
|
|
|
|
|
# see bug report for more information |
270
|
|
|
|
|
|
|
# |
271
|
|
|
|
|
|
|
# 9/6/06 Note: Swiss/TrEMBL sequences have no division acc. to UniProt |
272
|
|
|
|
|
|
|
# release notes; this is fixed to simplify the regex parsing |
273
|
|
|
|
|
|
|
# STANDARD (SwissProt) and PRELIMINARY (TrEMBL) added to namespace() |
274
|
31
|
50
|
|
|
|
245
|
unless( m{^ |
275
|
|
|
|
|
|
|
ID \s+ # |
276
|
|
|
|
|
|
|
(\S+) \s+ # $1 entryname |
277
|
|
|
|
|
|
|
([^\s;]+); \s+ # $2 DataClass |
278
|
|
|
|
|
|
|
(?:PRT;)? \s+ # Molecule Type (optional) |
279
|
|
|
|
|
|
|
[0-9]+[ ]AA \. # Sequencelength (capture?) |
280
|
|
|
|
|
|
|
$ |
281
|
|
|
|
|
|
|
}ox ) { |
282
|
|
|
|
|
|
|
# I couldn't find any new current UniProt sequences |
283
|
|
|
|
|
|
|
# that matched this format: |
284
|
|
|
|
|
|
|
# || m/^ID\s+(\S+)\s+(_([^\s_]+))? /ox ) { |
285
|
0
|
|
|
|
|
0
|
$self->throw("swissprot stream with no ID. Not swissprot in my book"); |
286
|
|
|
|
|
|
|
} |
287
|
31
|
|
|
|
|
153
|
($name, $seq_div) = ($1, $2); |
288
|
31
|
100
|
100
|
|
|
257
|
$params{'-namespace'} = |
|
|
100
|
100
|
|
|
|
|
289
|
|
|
|
|
|
|
($seq_div eq 'Reviewed' || $seq_div eq 'STANDARD') ? 'Swiss-Prot' : |
290
|
|
|
|
|
|
|
($seq_div eq 'Unreviewed' || $seq_div eq 'PRELIMINARY') ? 'TrEMBL' : |
291
|
|
|
|
|
|
|
$seq_div; |
292
|
|
|
|
|
|
|
# we shouldn't be setting the division, but for now... |
293
|
31
|
|
|
|
|
147
|
my ($junk, $division) = split q(_), $name; |
294
|
31
|
|
|
|
|
88
|
$params{'-division'} = $division; |
295
|
31
|
|
|
|
|
69
|
$params{'-alphabet'} = 'protein'; |
296
|
|
|
|
|
|
|
# this is important to have the id for display in e.g. FTHelper, otherwise |
297
|
|
|
|
|
|
|
# you won't know which entry caused an error |
298
|
31
|
|
|
|
|
69
|
$params{'-display_id'} = $name; |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
BEFORE_FEATURE_TABLE : |
301
|
31
|
|
|
|
|
99
|
while ( defined($_ = $self->_readline) ) { |
302
|
|
|
|
|
|
|
# Exit at start of Feature table and at the sequence at the |
303
|
|
|
|
|
|
|
# latest HL 05/11/2000 |
304
|
981
|
100
|
|
|
|
2952
|
last if( /^(FT|SQ)/ ); |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# Description line(s) |
307
|
950
|
100
|
|
|
|
7670
|
if (/^DE\s+(\S.*\S)/) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
308
|
54
|
100
|
|
|
|
242
|
$desc .= $desc ? " $1" : $1; |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
#Gene name |
311
|
|
|
|
|
|
|
elsif (/^GN\s+(.*)/) { |
312
|
36
|
100
|
|
|
|
97
|
$genename .= " " if $genename; |
313
|
36
|
|
|
|
|
112
|
$genename .= $1; |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
#accession number(s) |
316
|
|
|
|
|
|
|
elsif ( /^AC\s+(.+)/) { |
317
|
31
|
|
|
|
|
164
|
my @accs = split(/[; ]+/, $1); # allow space in addition |
318
|
|
|
|
|
|
|
$params{'-accession_number'} = shift @accs |
319
|
31
|
50
|
|
|
|
135
|
unless defined $params{'-accession_number'}; |
320
|
31
|
|
|
|
|
64
|
push @{$params{'-secondary_accessions'}}, @accs; |
|
31
|
|
|
|
|
140
|
|
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
#date and sequence version |
323
|
|
|
|
|
|
|
elsif ( /^DT\s+(.*)/ ) { |
324
|
90
|
|
|
|
|
442
|
my $line = $1; |
325
|
90
|
|
|
|
|
281
|
my ($date, $version) = split(' ', $line, 2); |
326
|
90
|
|
|
|
|
162
|
$date =~ tr/,//d; # remove comma if new version |
327
|
90
|
100
|
100
|
|
|
701
|
if ($version =~ /\(Rel\. (\d+), Last sequence update\)/ || # old |
|
|
100
|
100
|
|
|
|
|
328
|
|
|
|
|
|
|
/sequence version (\d+)/) { #new |
329
|
30
|
|
|
|
|
240
|
my $update = Bio::Annotation::SimpleValue->new |
330
|
|
|
|
|
|
|
(-tagname => 'seq_update', |
331
|
|
|
|
|
|
|
-value => $1 |
332
|
|
|
|
|
|
|
); |
333
|
30
|
|
|
|
|
152
|
$annotation->add_Annotation($update); |
334
|
|
|
|
|
|
|
} elsif ($version =~ /\(Rel\. (\d+), Last annotation update\)/ || #old |
335
|
|
|
|
|
|
|
/entry version (\d+)/) { #new |
336
|
30
|
|
|
|
|
96
|
$params{'-version'} = $1; |
337
|
|
|
|
|
|
|
} |
338
|
90
|
|
|
|
|
121
|
push @{$params{'-dates'}}, $date; |
|
90
|
|
|
|
|
368
|
|
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
# Evidence level |
341
|
|
|
|
|
|
|
elsif ( /^PE\s+(.*)/ ) { |
342
|
1
|
|
|
|
|
6
|
my $line = $1; |
343
|
1
|
|
|
|
|
7
|
$line =~ s/;\s*//; # trim trailing semicolon and any spaces at the end of the line |
344
|
1
|
|
|
|
|
12
|
my $evidence = Bio::Annotation::SimpleValue->new |
345
|
|
|
|
|
|
|
(-tagname => 'evidence', |
346
|
|
|
|
|
|
|
-value => $line |
347
|
|
|
|
|
|
|
); |
348
|
1
|
|
|
|
|
6
|
$annotation->add_Annotation($evidence); |
349
|
|
|
|
|
|
|
} |
350
|
|
|
|
|
|
|
# Organism name and phylogenetic information |
351
|
|
|
|
|
|
|
elsif (/^O[SCG]/) { |
352
|
30
|
|
|
|
|
130
|
my $species = $self->_read_swissprot_Species($_); |
353
|
30
|
|
|
|
|
286
|
$params{'-species'}= $species; |
354
|
|
|
|
|
|
|
# now we are one line ahead -- so continue without reading the next |
355
|
|
|
|
|
|
|
# line HL 05/11/2000 |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
# References |
358
|
|
|
|
|
|
|
elsif (/^R/) { |
359
|
30
|
|
|
|
|
150
|
my $refs = $self->_read_swissprot_References($_); |
360
|
30
|
|
|
|
|
97
|
foreach my $r (@$refs) { |
361
|
227
|
|
|
|
|
564
|
$annotation->add_Annotation('reference',$r); |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
# Comments |
365
|
|
|
|
|
|
|
elsif (/^CC\s{3}(.*)/) { |
366
|
30
|
|
|
|
|
148
|
$comment .= $1; |
367
|
30
|
|
|
|
|
65
|
$comment .= "\n"; |
368
|
30
|
|
66
|
|
|
99
|
while (defined ($_ = $self->_readline) && /^CC\s{3}(.*)/ ) { |
369
|
474
|
|
|
|
|
1560
|
$comment .= $1 . "\n"; |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
# note: don't try to process comments here -- they may contain |
373
|
|
|
|
|
|
|
# structure. LP 07/30/2000 |
374
|
|
|
|
|
|
|
|
375
|
30
|
|
|
|
|
306
|
my $commobj = Bio::Annotation::Comment->new(-tagname => 'comment', |
376
|
|
|
|
|
|
|
-text => $comment); |
377
|
30
|
|
|
|
|
128
|
$annotation->add_Annotation('comment',$commobj); |
378
|
30
|
|
|
|
|
64
|
$comment = ""; |
379
|
30
|
|
|
|
|
132
|
$self->_pushback($_); |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
#DBLinks |
382
|
|
|
|
|
|
|
# old regexp |
383
|
|
|
|
|
|
|
# /^DR\s+(\S+)\;\s+(\S+)\;\s+(\S+)[\;\.](.*)$/) { |
384
|
|
|
|
|
|
|
# new regexp from Andreas Kahari bug #1584 |
385
|
|
|
|
|
|
|
elsif (/^DR\s+(\S+)\;\s+(\S+)\;\s+([^;]+)[\;\.](.*)$/) { |
386
|
599
|
|
|
|
|
2292
|
my ($database,$primaryid,$optional,$comment) = ($1,$2,$3,$4); |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
# drop leading and training spaces and trailing . |
389
|
599
|
|
|
|
|
1415
|
$comment =~ s/\.\s*$//; |
390
|
599
|
|
|
|
|
1144
|
$comment =~ s/^\s+//; |
391
|
|
|
|
|
|
|
|
392
|
599
|
|
|
|
|
2226
|
my $dblinkobj = Bio::Annotation::DBLink->new |
393
|
|
|
|
|
|
|
(-database => $database, |
394
|
|
|
|
|
|
|
-primary_id => $primaryid, |
395
|
|
|
|
|
|
|
-optional_id => $optional, |
396
|
|
|
|
|
|
|
-comment => $comment, |
397
|
|
|
|
|
|
|
-tagname => 'dblink', |
398
|
|
|
|
|
|
|
); |
399
|
|
|
|
|
|
|
|
400
|
599
|
|
|
|
|
1705
|
$annotation->add_Annotation('dblink',$dblinkobj); |
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
#keywords |
403
|
|
|
|
|
|
|
elsif ( /^KW\s+(.*)$/ ) { |
404
|
49
|
|
|
|
|
409
|
my @kw = split(/\s*\;\s*/,$1); |
405
|
49
|
50
|
|
|
|
237
|
defined $kw[-1] && $kw[-1] =~ s/\.$//; |
406
|
49
|
|
|
|
|
108
|
push @{$params{'-keywords'}}, @kw; |
|
49
|
|
|
|
|
289
|
|
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
# process and parse the gene name line if there was one (note: we |
410
|
|
|
|
|
|
|
# can't do this above b/c GN may be multi-line and we can't |
411
|
|
|
|
|
|
|
# unequivocally determine whether we've seen the last GN line in |
412
|
|
|
|
|
|
|
# the new format) |
413
|
31
|
100
|
|
|
|
106
|
if ($genename) { |
414
|
30
|
|
|
|
|
78
|
my @stags; |
415
|
30
|
100
|
|
|
|
133
|
if ($genename =~ /\w=\w/) { |
416
|
|
|
|
|
|
|
# new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP) |
417
|
12
|
|
|
|
|
68
|
for my $n (split(m{\s+and\s+},$genename)) { |
418
|
14
|
|
|
|
|
29
|
my @genenames; |
419
|
14
|
|
|
|
|
74
|
for my $section (split(m{\s*;\s*},$n)) { |
420
|
19
|
|
|
|
|
76
|
my ($tag, $rest) = split("=",$section); |
421
|
19
|
|
50
|
|
|
53
|
$rest ||= ''; |
422
|
19
|
|
|
|
|
56
|
for my $val (split(m{\s*,\s*},$rest)) { |
423
|
23
|
|
|
|
|
70
|
push @genenames, [$tag => $val]; |
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
} |
426
|
14
|
|
|
|
|
83
|
push @stags, ['gene_name' => \@genenames]; |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
} else { |
429
|
|
|
|
|
|
|
# old format |
430
|
18
|
|
|
|
|
90
|
for my $section (split(/ AND /, $genename)) { |
431
|
22
|
|
|
|
|
48
|
my @genenames; |
432
|
22
|
|
|
|
|
102
|
$section =~ s/[\(\)\.]//g; |
433
|
22
|
|
|
|
|
99
|
my @names = split(m{\s+OR\s+}, $section); |
434
|
22
|
|
|
|
|
152
|
push @genenames, ['Name' => shift @names]; |
435
|
22
|
|
|
|
|
78
|
push @genenames, map {['Synonyms' => $_]} @names; |
|
25
|
|
|
|
|
70
|
|
436
|
22
|
|
|
|
|
102
|
push @stags, ['gene_name' => \@genenames] |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
} #use Data::Dumper; print Dumper $gn, $genename;# exit; |
439
|
30
|
|
|
|
|
371
|
my $gn = Bio::Annotation::TagTree->new(-tagname => 'gene_name', |
440
|
|
|
|
|
|
|
-value => ['gene_names' => \@stags]); |
441
|
30
|
|
|
|
|
136
|
$annotation->add_Annotation('gene_name', $gn); |
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
FEATURE_TABLE : |
445
|
|
|
|
|
|
|
# if there is no feature table, or if we've got beyond, exit loop or don't |
446
|
|
|
|
|
|
|
# even enter HL 05/11/2000 |
447
|
31
|
|
66
|
|
|
271
|
while (defined $_ && /^FT/ ) { |
448
|
443
|
|
|
|
|
1337
|
my $ftunit = $self->_read_FTHelper_swissprot($_); |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
# process ftunit |
451
|
|
|
|
|
|
|
# when parsing of the line fails we get undef returned |
452
|
443
|
50
|
|
|
|
856
|
if ($ftunit) { |
453
|
|
|
|
|
|
|
push(@features, |
454
|
|
|
|
|
|
|
$ftunit->_generic_seqfeature($self->location_factory(), |
455
|
443
|
|
|
|
|
1373
|
$params{'-seqid'}, "SwissProt")); |
456
|
|
|
|
|
|
|
} else { |
457
|
|
|
|
|
|
|
$self->warn("failed to parse feature table line for seq " . |
458
|
0
|
|
|
|
|
0
|
$params{'-display_id'}. "\n$_"); |
459
|
|
|
|
|
|
|
} |
460
|
443
|
|
|
|
|
1713
|
$_ = $self->_readline; |
461
|
|
|
|
|
|
|
} |
462
|
31
|
|
33
|
|
|
303
|
while ( defined($_) && ! /^SQ/ ) { |
463
|
0
|
|
|
|
|
0
|
$_ = $self->_readline; |
464
|
|
|
|
|
|
|
} |
465
|
31
|
|
|
|
|
74
|
$seqc = ""; |
466
|
31
|
|
|
|
|
96
|
while ( defined ($_ = $self->_readline) ) { |
467
|
242
|
100
|
|
|
|
563
|
last if m{^//}; |
468
|
211
|
|
|
|
|
1428
|
s/[^A-Za-z]//g; |
469
|
211
|
|
|
|
|
673
|
$seqc .= uc($_); |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
|
472
|
31
|
|
|
|
|
183
|
my $seq= $self->sequence_factory->create |
473
|
|
|
|
|
|
|
(-verbose => $self->verbose, |
474
|
|
|
|
|
|
|
%params, |
475
|
|
|
|
|
|
|
-seq => $seqc, |
476
|
|
|
|
|
|
|
-desc => $desc, |
477
|
|
|
|
|
|
|
-features => \@features, |
478
|
|
|
|
|
|
|
-annotation => $annotation, |
479
|
|
|
|
|
|
|
); |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# The annotation doesn't get added by the constructor |
482
|
31
|
|
|
|
|
164
|
$seq->annotation($annotation); |
483
|
|
|
|
|
|
|
|
484
|
31
|
|
|
|
|
416
|
return $seq; |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
=head2 write_seq |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
Title : write_seq |
490
|
|
|
|
|
|
|
Usage : $stream->write_seq($seq) |
491
|
|
|
|
|
|
|
Function: writes the $seq object (must be seq) to the stream |
492
|
|
|
|
|
|
|
Returns : 1 for success and 0 for error |
493
|
|
|
|
|
|
|
Args : array of 1 to n Bio::SeqI objects |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=cut |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub write_seq { |
499
|
4
|
|
|
4
|
1
|
87
|
my ($self,@seqs) = @_; |
500
|
4
|
|
|
|
|
15
|
foreach my $seq ( @seqs ) { |
501
|
4
|
50
|
|
|
|
18
|
$self->throw("Attempting to write with no seq!") unless defined $seq; |
502
|
|
|
|
|
|
|
|
503
|
4
|
50
|
33
|
|
|
36
|
if ( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { |
504
|
0
|
|
|
|
|
0
|
$self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
|
507
|
4
|
|
|
|
|
11
|
my $i; |
508
|
4
|
|
|
|
|
26
|
my $str = $seq->seq; |
509
|
|
|
|
|
|
|
|
510
|
4
|
|
|
|
|
9
|
my $div; |
511
|
4
|
|
66
|
|
|
73
|
my $ns = ($seq->can('namespace')) && $seq->namespace(); |
512
|
4
|
|
|
|
|
23
|
my $len = $seq->length(); |
513
|
|
|
|
|
|
|
|
514
|
4
|
100
|
66
|
|
|
45
|
if ( !$seq->can('division') || ! defined ($div = $seq->division()) ) { |
515
|
1
|
|
|
|
|
4
|
$div = 'UNK'; |
516
|
|
|
|
|
|
|
} |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# namespace dictates database, takes precedent over division. Sorry! |
519
|
4
|
100
|
66
|
|
|
32
|
if (defined($ns) && $ns ne '') { |
520
|
3
|
0
|
|
|
|
17
|
$div = ($ns eq 'Swiss-Prot') ? 'Reviewed' : |
|
|
50
|
|
|
|
|
|
521
|
|
|
|
|
|
|
($ns eq 'TrEMBL') ? 'Unreviewed' : |
522
|
|
|
|
|
|
|
$ns; |
523
|
|
|
|
|
|
|
} else { |
524
|
1
|
|
|
|
|
2
|
$ns = 'Swiss-Prot'; |
525
|
|
|
|
|
|
|
# division not reset; acts as fallback |
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
|
528
|
4
|
50
|
|
|
|
38
|
$self->warn("No whitespace allowed in SWISS-PROT display id [". $seq->display_id. "]") |
529
|
|
|
|
|
|
|
if $seq->display_id =~ /\s/; |
530
|
|
|
|
|
|
|
|
531
|
4
|
|
|
|
|
11
|
my $temp_line; |
532
|
4
|
50
|
|
|
|
19
|
if ( $self->_id_generation_func ) { |
533
|
0
|
|
|
|
|
0
|
$temp_line = &{$self->_id_generation_func}($seq); |
|
0
|
|
|
|
|
0
|
|
534
|
|
|
|
|
|
|
} else { |
535
|
|
|
|
|
|
|
#$temp_line = sprintf ("%10s STANDARD; %3s; %d AA.", |
536
|
|
|
|
|
|
|
# $seq->primary_id()."_".$div,$mol,$len); |
537
|
|
|
|
|
|
|
# Reconstructing the ID relies heavily upon the input source having |
538
|
|
|
|
|
|
|
# been in a format that is parsed as this routine expects it -- that is, |
539
|
|
|
|
|
|
|
# by this module itself. This is bad, I think, and immediately breaks |
540
|
|
|
|
|
|
|
# if e.g. the Bio::DB::GenPept module is used as input. |
541
|
|
|
|
|
|
|
# Hence, switch to display_id(); _every_ sequence is supposed to have |
542
|
|
|
|
|
|
|
# this. HL 2000/09/03 |
543
|
|
|
|
|
|
|
# Changed to reflect ID line changes in UniProt |
544
|
|
|
|
|
|
|
# Oct 2006 - removal of molecule type - see bug 2134 |
545
|
4
|
|
|
|
|
12
|
$temp_line = sprintf ("%-24s%-12s%9d AA.", |
546
|
|
|
|
|
|
|
$seq->display_id(), $div.';', $len); |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
4
|
|
|
|
|
44
|
$self->_print( "ID $temp_line\n"); |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
# if there, write the accession line |
552
|
4
|
|
|
|
|
25
|
local($^W) = 0; # suppressing warnings about uninitialized fields |
553
|
|
|
|
|
|
|
|
554
|
4
|
50
|
|
|
|
26
|
if ( $self->_ac_generation_func ) { |
|
|
50
|
|
|
|
|
|
555
|
0
|
|
|
|
|
0
|
$temp_line = &{$self->_ac_generation_func}($seq); |
|
0
|
|
|
|
|
0
|
|
556
|
0
|
|
|
|
|
0
|
$self->_print( "AC $temp_line\n"); |
557
|
|
|
|
|
|
|
} |
558
|
|
|
|
|
|
|
elsif ($seq->can('accession_number') ) { |
559
|
4
|
|
|
|
|
22
|
my $ac_line = $seq->accession_number; |
560
|
4
|
100
|
|
|
|
30
|
if ($seq->can('get_secondary_accessions') ) { |
561
|
3
|
|
|
|
|
16
|
foreach my $sacc ($seq->get_secondary_accessions) { |
562
|
0
|
|
|
|
|
0
|
$ac_line .= "; ". $sacc;; |
563
|
|
|
|
|
|
|
} |
564
|
3
|
|
|
|
|
11
|
$ac_line .= ";"; |
565
|
|
|
|
|
|
|
} |
566
|
|
|
|
|
|
|
|
567
|
4
|
|
|
|
|
24
|
$self->_write_line_swissprot_regex("AC ","AC ",$ac_line, |
568
|
|
|
|
|
|
|
"\\s\+\|\$",$LINE_LENGTH); |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
# otherwise - cannot print |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# Date lines and sequence versions (changed 6/15/2006) |
574
|
|
|
|
|
|
|
# This is rebuilt from scratch using the current SwissProt/UniProt format |
575
|
4
|
100
|
|
|
|
37
|
if ( $seq->can('get_dates') ) { |
576
|
3
|
|
|
|
|
21
|
my @dates = $seq->get_dates(); |
577
|
3
|
|
|
|
|
9
|
my $ct = 1; |
578
|
3
|
|
|
|
|
15
|
my $seq_version = $seq->version; |
579
|
3
|
|
|
|
|
13
|
my ($update_version) = $seq->annotation->get_Annotations("seq_update"); |
580
|
3
|
|
|
|
|
11
|
foreach my $dt (@dates) { |
581
|
9
|
100
|
|
|
|
41
|
$self->_write_line_swissprot_regex("DT ","DT ", |
582
|
|
|
|
|
|
|
$dt.', integrated into UniProtKB/'.$ns.'.', |
583
|
|
|
|
|
|
|
"\\s\+\|\$",$LINE_LENGTH) if $ct == 1; |
584
|
9
|
100
|
|
|
|
37
|
$self->_write_line_swissprot_regex("DT ","DT ", |
585
|
|
|
|
|
|
|
$dt.", sequence version ".$update_version->display_text.'.', |
586
|
|
|
|
|
|
|
"\\s\+\|\$",$LINE_LENGTH) if $ct == 2; |
587
|
9
|
100
|
|
|
|
38
|
$self->_write_line_swissprot_regex("DT ","DT ", |
588
|
|
|
|
|
|
|
$dt.", entry version $seq_version.", |
589
|
|
|
|
|
|
|
"\\s\+\|\$",$LINE_LENGTH) if $ct == 3; |
590
|
9
|
|
|
|
|
18
|
$ct++; |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
#Definition lines |
595
|
4
|
|
|
|
|
27
|
$self->_write_line_swissprot_regex("DE ","DE ",$seq->desc(),"\\s\+\|\$",$LINE_LENGTH); |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
#Gene name; print out new format |
598
|
4
|
|
|
|
|
17
|
foreach my $gene ( my @genes = $seq->annotation->get_Annotations('gene_name') ) { |
599
|
|
|
|
|
|
|
# gene is a Bio::Annotation::TagTree; |
600
|
3
|
100
|
|
|
|
25
|
if ($gene->isa('Bio::Annotation::TagTree')) { |
601
|
2
|
|
|
|
|
4
|
my @genelines; |
602
|
2
|
|
|
|
|
15
|
for my $node ($gene->findnode('gene_name')) { |
603
|
|
|
|
|
|
|
# check for Name and Synonym first, then the rest get tagged on |
604
|
2
|
|
|
|
|
464
|
my $geneline = "GN "; |
605
|
2
|
|
|
|
|
21
|
my %genedata = $node->hash; |
606
|
2
|
|
|
|
|
104
|
for my $tag (@GENE_NAME_ORDER) { |
607
|
8
|
100
|
|
|
|
25
|
if (exists $genedata{$tag}) { |
608
|
|
|
|
|
|
|
$geneline .= (ref $genedata{$tag} eq 'ARRAY') ? |
609
|
2
|
50
|
|
|
|
18
|
"$tag=".join(', ',@{$genedata{$tag}})."; " : |
|
0
|
|
|
|
|
0
|
|
610
|
|
|
|
|
|
|
"$tag=$genedata{$tag}; "; |
611
|
2
|
|
|
|
|
8
|
delete $genedata{$tag}; |
612
|
|
|
|
|
|
|
} |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
# add rest |
615
|
2
|
|
|
|
|
11
|
for my $tag (sort keys %genedata) { |
616
|
|
|
|
|
|
|
$geneline .= (ref $genedata{$tag} eq 'ARRAY') ? |
617
|
0
|
0
|
|
|
|
0
|
"$tag=".join(', ',@{$genedata{$tag}})."; " : |
|
0
|
|
|
|
|
0
|
|
618
|
|
|
|
|
|
|
"$tag=$genedata{$tag}; "; |
619
|
0
|
|
|
|
|
0
|
delete $genedata{$tag}; |
620
|
|
|
|
|
|
|
} |
621
|
2
|
|
|
|
|
12
|
push @genelines, "$geneline\n"; |
622
|
|
|
|
|
|
|
} |
623
|
2
|
|
|
|
|
14
|
$self->_print(join("GN and\n",@genelines)); |
624
|
|
|
|
|
|
|
} else { # fall back to getting stringified output |
625
|
1
|
|
|
|
|
4
|
$self->_write_line_swissprot_regex("GN ","GN ", |
626
|
|
|
|
|
|
|
$gene->display_text, |
627
|
|
|
|
|
|
|
"\\s\+\|\$", |
628
|
|
|
|
|
|
|
$LINE_LENGTH); |
629
|
|
|
|
|
|
|
} |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
# Organism lines |
633
|
4
|
100
|
66
|
|
|
48
|
if ($seq->can('species') && (my $spec = $seq->species)) { |
634
|
3
|
|
|
|
|
25
|
my @class = $spec->classification(); |
635
|
3
|
|
|
|
|
6
|
shift(@class); |
636
|
3
|
|
|
|
|
17
|
my $species = $spec->species; |
637
|
3
|
|
|
|
|
11
|
my $genus = $spec->genus; |
638
|
3
|
|
|
|
|
20
|
my $OS = $spec->scientific_name; |
639
|
3
|
50
|
|
|
|
17
|
if ($class[-1] =~ /viruses/i) { |
640
|
0
|
|
|
|
|
0
|
$OS = $species; |
641
|
0
|
0
|
|
|
|
0
|
$OS .= " ". $spec->sub_species if $spec->sub_species; |
642
|
|
|
|
|
|
|
} |
643
|
3
|
|
|
|
|
19
|
foreach (($spec->variant, $spec->common_name)) { |
644
|
3
|
50
|
|
|
|
13
|
$OS .= " ($_)" if $_; |
645
|
|
|
|
|
|
|
} |
646
|
3
|
|
|
|
|
28
|
$self->_print( "OS $OS.\n"); |
647
|
3
|
|
|
|
|
19
|
my $OC = join('; ', reverse(@class)) .'.'; |
648
|
3
|
|
|
|
|
19
|
$self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",$LINE_LENGTH); |
649
|
3
|
50
|
|
|
|
18
|
if ($spec->organelle) { |
650
|
0
|
|
|
|
|
0
|
$self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",$LINE_LENGTH); |
651
|
|
|
|
|
|
|
} |
652
|
3
|
50
|
|
|
|
16
|
if ($spec->ncbi_taxid) { |
653
|
3
|
|
|
|
|
13
|
$self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n"); |
654
|
|
|
|
|
|
|
} |
655
|
|
|
|
|
|
|
} |
656
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
# Reference lines |
658
|
4
|
|
|
|
|
14
|
my $t = 1; |
659
|
4
|
|
|
|
|
29
|
foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { |
660
|
3
|
|
|
|
|
19
|
$self->_print( "RN [$t]\n"); |
661
|
|
|
|
|
|
|
# changed by lorenz 08/03/00 |
662
|
|
|
|
|
|
|
# j.gilbert and h.lapp agreed that the rp line in swissprot seems |
663
|
|
|
|
|
|
|
# more like a comment than a parseable value, so print it as is |
664
|
3
|
50
|
|
|
|
20
|
if ($ref->rp) { |
665
|
3
|
|
|
|
|
17
|
$self->_write_line_swissprot_regex("RP ","RP ",$ref->rp, |
666
|
|
|
|
|
|
|
"\\s\+\|\$",$LINE_LENGTH); |
667
|
|
|
|
|
|
|
} |
668
|
3
|
100
|
|
|
|
20
|
if ($ref->comment) { |
669
|
2
|
|
|
|
|
6
|
$self->_write_line_swissprot_regex("RC ","RC ",$ref->comment, |
670
|
|
|
|
|
|
|
"\\s\+\|\$",$LINE_LENGTH); |
671
|
|
|
|
|
|
|
} |
672
|
3
|
50
|
33
|
|
|
17
|
if ($ref->medline or $ref->pubmed or $ref->doi) { |
|
|
|
33
|
|
|
|
|
673
|
|
|
|
|
|
|
# new RX format in swissprot LP 09/17/00 |
674
|
|
|
|
|
|
|
# RX line can now have a DOI, Heikki 13 Feb 2008 |
675
|
|
|
|
|
|
|
|
676
|
0
|
|
|
|
|
0
|
my $line; |
677
|
0
|
0
|
|
|
|
0
|
$line .= "MEDLINE=". $ref->medline. '; ' if $ref->medline; |
678
|
0
|
0
|
|
|
|
0
|
$line .= "PubMed=". $ref->pubmed. '; ' if $ref->pubmed; |
679
|
0
|
0
|
|
|
|
0
|
$line .= "DOI=". $ref->doi. '; ' if $ref->doi; |
680
|
0
|
|
|
|
|
0
|
chop $line; |
681
|
|
|
|
|
|
|
|
682
|
0
|
|
|
|
|
0
|
$self->_write_line_swissprot_regex("RX ","RX ", |
683
|
|
|
|
|
|
|
$line, |
684
|
|
|
|
|
|
|
"\\s\+\|\$",$LINE_LENGTH); |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
} |
687
|
3
|
50
|
|
|
|
17
|
my $author = $ref->authors .';' if($ref->authors); |
688
|
3
|
50
|
|
|
|
16
|
my $title = $ref->title .';' if( $ref->title); |
689
|
3
|
50
|
|
|
|
16
|
my $rg = $ref->rg . ';' if $ref->rg; |
690
|
3
|
|
|
|
|
40
|
$author =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#' |
691
|
|
|
|
|
|
|
|
692
|
3
|
50
|
|
|
|
14
|
$self->_write_line_swissprot_regex("RG ","RG ",$rg,"\\s\+\|\$",$LINE_LENGTH) if $rg; |
693
|
3
|
50
|
|
|
|
18
|
$self->_write_line_swissprot_regex("RA ","RA ",$author,"\\s\+\|\$",$LINE_LENGTH) if $author; |
694
|
3
|
50
|
|
|
|
11
|
$self->_write_line_swissprot_regex("RT ","RT ",$title,'[\s\-]+|$',$LINE_LENGTH) if $title; |
695
|
3
|
|
|
|
|
16
|
$self->_write_line_swissprot_regex("RL ","RL ",$ref->location,"\\s\+\|\$",$LINE_LENGTH); |
696
|
3
|
|
|
|
|
15
|
$t++; |
697
|
|
|
|
|
|
|
} |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
# Comment lines |
700
|
|
|
|
|
|
|
|
701
|
4
|
|
|
|
|
17
|
foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { |
702
|
3
|
|
|
|
|
15
|
foreach my $cline (split ("\n", $comment->text)) { |
703
|
48
|
|
|
|
|
81
|
while (length $cline > 74) { |
704
|
0
|
|
|
|
|
0
|
$self->_print("CC ",(substr $cline,0,74),"\n"); |
705
|
0
|
|
|
|
|
0
|
$cline = substr $cline,74; |
706
|
|
|
|
|
|
|
} |
707
|
48
|
|
|
|
|
77
|
$self->_print("CC ",$cline,"\n"); |
708
|
|
|
|
|
|
|
} |
709
|
|
|
|
|
|
|
} |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
# Database xref lines |
712
|
|
|
|
|
|
|
|
713
|
4
|
|
|
|
|
18
|
foreach my $dblink ( $seq->annotation->get_Annotations('dblink') ) { |
714
|
27
|
|
|
|
|
56
|
my ($primary_id) = $dblink->primary_id; |
715
|
|
|
|
|
|
|
|
716
|
27
|
100
|
66
|
|
|
48
|
if (defined($dblink->comment) && ($dblink->comment) ) { |
|
|
50
|
|
|
|
|
|
717
|
12
|
|
|
|
|
19
|
$self->_print("DR ",$dblink->database,"; ",$primary_id,"; ", |
718
|
|
|
|
|
|
|
$dblink->optional_id,"; ",$dblink->comment,".\n"); |
719
|
|
|
|
|
|
|
} elsif ($dblink->optional_id) { |
720
|
15
|
|
|
|
|
39
|
$self->_print("DR ",$dblink->database,"; ", |
721
|
|
|
|
|
|
|
$primary_id,"; ", |
722
|
|
|
|
|
|
|
$dblink->optional_id,".\n"); |
723
|
|
|
|
|
|
|
} else { |
724
|
0
|
|
|
|
|
0
|
$self->_print("DR ",$dblink->database, |
725
|
|
|
|
|
|
|
"; ",$primary_id,"; ","-.\n"); |
726
|
|
|
|
|
|
|
} |
727
|
|
|
|
|
|
|
} |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# Evidence lines |
730
|
|
|
|
|
|
|
|
731
|
4
|
|
|
|
|
22
|
foreach my $evidence ( $seq->annotation->get_Annotations('evidence') ) { |
732
|
0
|
|
|
|
|
0
|
$self->_print("PE ",$evidence->value,";\n"); |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
# if there, write the kw line |
736
|
|
|
|
|
|
|
{ |
737
|
4
|
|
|
|
|
9
|
my $kw; |
|
4
|
|
|
|
|
6
|
|
738
|
4
|
50
|
|
|
|
19
|
if ( my $func = $self->_kw_generation_func ) { |
|
|
100
|
|
|
|
|
|
739
|
0
|
|
|
|
|
0
|
$kw = &{$func}($seq); |
|
0
|
|
|
|
|
0
|
|
740
|
|
|
|
|
|
|
} elsif ( $seq->can('keywords') ) { |
741
|
3
|
|
|
|
|
15
|
$kw = $seq->keywords; |
742
|
3
|
50
|
|
|
|
24
|
if ( ref($kw) =~ /ARRAY/i ) { |
743
|
0
|
|
|
|
|
0
|
$kw = join("; ", @$kw); |
744
|
|
|
|
|
|
|
} |
745
|
3
|
50
|
33
|
|
|
31
|
$kw .= '.' if $kw and $kw !~ /\.$/ ; |
746
|
|
|
|
|
|
|
} |
747
|
4
|
|
|
|
|
51
|
$kw =~ s/([\w\.]) (\w)/$1#$2/g; # add word wrap protection char '#' |
748
|
4
|
100
|
|
|
|
23
|
$self->_write_line_swissprot_regex("KW ","KW ", |
749
|
|
|
|
|
|
|
$kw, "\\s\+\|\$",$LINE_LENGTH) |
750
|
|
|
|
|
|
|
if $kw; |
751
|
|
|
|
|
|
|
} |
752
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
#Check if there is seqfeatures before printing the FT line |
754
|
4
|
50
|
|
|
|
47
|
my @feats = $seq->can('top_SeqFeatures') ? $seq->top_SeqFeatures : (); |
755
|
4
|
100
|
|
|
|
15
|
if ($feats[0]) { |
756
|
3
|
50
|
|
|
|
14
|
if ( defined $self->_post_sort ) { |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
# we need to read things into an array. Process. Sort them. Print 'em |
759
|
|
|
|
|
|
|
|
760
|
0
|
|
|
|
|
0
|
my $post_sort_func = $self->_post_sort(); |
761
|
0
|
|
|
|
|
0
|
my @fth; |
762
|
|
|
|
|
|
|
|
763
|
0
|
|
|
|
|
0
|
foreach my $sf ( @feats ) { |
764
|
0
|
|
|
|
|
0
|
push(@fth,Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq)); |
765
|
|
|
|
|
|
|
} |
766
|
0
|
|
|
|
|
0
|
@fth = sort { &$post_sort_func($a,$b) } @fth; |
|
0
|
|
|
|
|
0
|
|
767
|
|
|
|
|
|
|
|
768
|
0
|
|
|
|
|
0
|
foreach my $fth ( @fth ) { |
769
|
0
|
|
|
|
|
0
|
$self->_print_swissprot_FTHelper($fth); |
770
|
|
|
|
|
|
|
} |
771
|
|
|
|
|
|
|
} else { |
772
|
|
|
|
|
|
|
# not post sorted. And so we can print as we get them. |
773
|
|
|
|
|
|
|
# lower memory load... |
774
|
|
|
|
|
|
|
|
775
|
3
|
|
|
|
|
9
|
foreach my $sf ( @feats ) { |
776
|
9
|
|
|
|
|
35
|
my @fth = Bio::SeqIO::FTHelper::from_SeqFeature($sf,$seq); |
777
|
9
|
|
|
|
|
18
|
foreach my $fth ( @fth ) { |
778
|
9
|
50
|
|
|
|
32
|
if ( ! $fth->isa('Bio::SeqIO::FTHelper') ) { |
779
|
0
|
|
|
|
|
0
|
$sf->throw("Cannot process FTHelper... $fth"); |
780
|
|
|
|
|
|
|
} |
781
|
9
|
|
|
|
|
32
|
$self->_print_swissprot_FTHelper($fth); |
782
|
|
|
|
|
|
|
} |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
} |
785
|
|
|
|
|
|
|
|
786
|
3
|
50
|
|
|
|
14
|
if ( $self->_show_dna() == 0 ) { |
787
|
0
|
|
|
|
|
0
|
return; |
788
|
|
|
|
|
|
|
} |
789
|
|
|
|
|
|
|
} |
790
|
|
|
|
|
|
|
# finished printing features. |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
# molecular weight |
793
|
4
|
|
|
|
|
9
|
my $mw = ${Bio::Tools::SeqStats->get_mol_wt($seq->primary_seq)}[0]; |
|
4
|
|
|
|
|
21
|
|
794
|
|
|
|
|
|
|
# checksum |
795
|
|
|
|
|
|
|
# was crc32 checksum, changed it to crc64 |
796
|
4
|
|
|
|
|
27
|
my $crc64 = $self->_crc64(\$str); |
797
|
4
|
|
|
|
|
112
|
$self->_print( sprintf("SQ SEQUENCE %4d AA; %d MW; %16s CRC64;\n", |
798
|
|
|
|
|
|
|
$len,$mw,$crc64)); |
799
|
4
|
|
|
|
|
25
|
$self->_print( " "); |
800
|
4
|
|
|
|
|
8
|
my $linepos; |
801
|
4
|
|
|
|
|
33
|
for ($i = 0; $i < length($str); $i += 10) { |
802
|
164
|
|
|
|
|
389
|
$self->_print( " ", substr($str,$i,10)); |
803
|
164
|
|
|
|
|
226
|
$linepos += 11; |
804
|
164
|
100
|
66
|
|
|
428
|
if ( ($i+10)%60 == 0 && (($i+10) < length($str))) { |
805
|
24
|
|
|
|
|
48
|
$self->_print( "\n "); |
806
|
|
|
|
|
|
|
} |
807
|
|
|
|
|
|
|
} |
808
|
4
|
|
|
|
|
18
|
$self->_print( "\n//\n"); |
809
|
|
|
|
|
|
|
|
810
|
4
|
50
|
33
|
|
|
22
|
$self->flush if $self->_flush_on_write && defined $self->_fh; |
811
|
4
|
|
|
|
|
73
|
return 1; |
812
|
|
|
|
|
|
|
} |
813
|
|
|
|
|
|
|
} |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
# Thanks to James Gilbert for the following two. LP 08/01/2000 |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
=head2 _generateCRCTable |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
Title : _generateCRCTable |
820
|
|
|
|
|
|
|
Usage : |
821
|
|
|
|
|
|
|
Function: |
822
|
|
|
|
|
|
|
Example : |
823
|
|
|
|
|
|
|
Returns : |
824
|
|
|
|
|
|
|
Args : |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
=cut |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
sub _generateCRCTable { |
830
|
|
|
|
|
|
|
# 10001000001010010010001110000100 |
831
|
|
|
|
|
|
|
# 32 |
832
|
0
|
|
|
0
|
|
0
|
my $poly = 0xEDB88320; |
833
|
0
|
|
|
|
|
0
|
my ($self) = shift; |
834
|
|
|
|
|
|
|
|
835
|
0
|
|
|
|
|
0
|
$self->{'_crcTable'} = []; |
836
|
0
|
|
|
|
|
0
|
foreach my $i (0..255) { |
837
|
0
|
|
|
|
|
0
|
my $crc = $i; |
838
|
0
|
|
|
|
|
0
|
for (my $j=8; $j > 0; $j--) { |
839
|
0
|
0
|
|
|
|
0
|
if ($crc & 1) { |
840
|
0
|
|
|
|
|
0
|
$crc = ($crc >> 1) ^ $poly; |
841
|
|
|
|
|
|
|
} else { |
842
|
0
|
|
|
|
|
0
|
$crc >>= 1; |
843
|
|
|
|
|
|
|
} |
844
|
|
|
|
|
|
|
} |
845
|
0
|
|
|
|
|
0
|
${$self->{'_crcTable'}}[$i] = $crc; |
|
0
|
|
|
|
|
0
|
|
846
|
|
|
|
|
|
|
} |
847
|
|
|
|
|
|
|
} |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
=head2 _crc32 |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
Title : _crc32 |
853
|
|
|
|
|
|
|
Usage : |
854
|
|
|
|
|
|
|
Function: |
855
|
|
|
|
|
|
|
Example : |
856
|
|
|
|
|
|
|
Returns : |
857
|
|
|
|
|
|
|
Args : |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
=cut |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
sub _crc32 { |
863
|
0
|
|
|
0
|
|
0
|
my( $self, $str ) = @_; |
864
|
|
|
|
|
|
|
|
865
|
0
|
0
|
|
|
|
0
|
$self->throw("Argument to crc32() must be ref to scalar") |
866
|
|
|
|
|
|
|
unless ref($str) eq 'SCALAR'; |
867
|
|
|
|
|
|
|
|
868
|
0
|
0
|
|
|
|
0
|
$self->_generateCRCTable() unless exists $self->{'_crcTable'}; |
869
|
|
|
|
|
|
|
|
870
|
0
|
|
|
|
|
0
|
my $len = length($$str); |
871
|
|
|
|
|
|
|
|
872
|
0
|
|
|
|
|
0
|
my $crc = 0xFFFFFFFF; |
873
|
0
|
|
|
|
|
0
|
for (my $i = 0; $i < $len; $i++) { |
874
|
|
|
|
|
|
|
# Get upper case value of each letter |
875
|
0
|
|
|
|
|
0
|
my $int = ord uc substr $$str, $i, 1; |
876
|
|
|
|
|
|
|
$crc = (($crc >> 8) & 0x00FFFFFF) ^ |
877
|
0
|
|
|
|
|
0
|
${$self->{'_crcTable'}}[ ($crc ^ $int) & 0xFF ]; |
|
0
|
|
|
|
|
0
|
|
878
|
|
|
|
|
|
|
} |
879
|
0
|
|
|
|
|
0
|
return $crc; |
880
|
|
|
|
|
|
|
} |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
=head2 _crc64 |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
Title : _crc64 |
885
|
|
|
|
|
|
|
Usage : |
886
|
|
|
|
|
|
|
Function: |
887
|
|
|
|
|
|
|
Example : |
888
|
|
|
|
|
|
|
Returns : |
889
|
|
|
|
|
|
|
Args : |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
=cut |
893
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
sub _crc64{ |
895
|
4
|
|
|
4
|
|
16
|
my ($self, $sequence) = @_; |
896
|
4
|
|
|
|
|
10
|
my $POLY64REVh = 0xd8000000; |
897
|
4
|
|
|
|
|
11
|
my @CRCTableh = 256; |
898
|
4
|
|
|
|
|
8
|
my @CRCTablel = 256; |
899
|
4
|
|
|
|
|
8
|
my $initialized; |
900
|
|
|
|
|
|
|
|
901
|
4
|
|
|
|
|
10
|
my $seq = $$sequence; |
902
|
|
|
|
|
|
|
|
903
|
4
|
|
|
|
|
6
|
my $crcl = 0; |
904
|
4
|
|
|
|
|
9
|
my $crch = 0; |
905
|
4
|
50
|
|
|
|
14
|
if (!$initialized) { |
906
|
4
|
|
|
|
|
6
|
$initialized = 1; |
907
|
4
|
|
|
|
|
19
|
for (my $i=0; $i<256; $i++) { |
908
|
1024
|
|
|
|
|
1048
|
my $partl = $i; |
909
|
1024
|
|
|
|
|
930
|
my $parth = 0; |
910
|
1024
|
|
|
|
|
1443
|
for (my $j=0; $j<8; $j++) { |
911
|
8192
|
|
|
|
|
8502
|
my $rflag = $partl & 1; |
912
|
8192
|
|
|
|
|
7437
|
$partl >>= 1; |
913
|
8192
|
50
|
|
|
|
10322
|
$partl |= (1 << 31) if $parth & 1; |
914
|
8192
|
|
|
|
|
7402
|
$parth >>= 1; |
915
|
8192
|
100
|
|
|
|
14621
|
$parth ^= $POLY64REVh if $rflag; |
916
|
|
|
|
|
|
|
} |
917
|
1024
|
|
|
|
|
1259
|
$CRCTableh[$i] = $parth; |
918
|
1024
|
|
|
|
|
1617
|
$CRCTablel[$i] = $partl; |
919
|
|
|
|
|
|
|
} |
920
|
|
|
|
|
|
|
} |
921
|
|
|
|
|
|
|
|
922
|
4
|
|
|
|
|
476
|
foreach (split '', $seq) { |
923
|
1636
|
|
|
|
|
1710
|
my $shr = ($crch & 0xFF) << 24; |
924
|
1636
|
|
|
|
|
1581
|
my $temp1h = $crch >> 8; |
925
|
1636
|
|
|
|
|
1562
|
my $temp1l = ($crcl >> 8) | $shr; |
926
|
1636
|
|
|
|
|
2062
|
my $tableindex = ($crcl ^ (unpack "C", $_)) & 0xFF; |
927
|
1636
|
|
|
|
|
1859
|
$crch = $temp1h ^ $CRCTableh[$tableindex]; |
928
|
1636
|
|
|
|
|
2050
|
$crcl = $temp1l ^ $CRCTablel[$tableindex]; |
929
|
|
|
|
|
|
|
} |
930
|
4
|
|
|
|
|
148
|
my $crc64 = sprintf("%08X%08X", $crch, $crcl); |
931
|
4
|
|
|
|
|
87
|
return $crc64; |
932
|
|
|
|
|
|
|
} |
933
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
=head2 _print_swissprot_FTHelper |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
Title : _print_swissprot_FTHelper |
937
|
|
|
|
|
|
|
Usage : |
938
|
|
|
|
|
|
|
Function: |
939
|
|
|
|
|
|
|
Example : |
940
|
|
|
|
|
|
|
Returns : |
941
|
|
|
|
|
|
|
Args : |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
=cut |
945
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
sub _print_swissprot_FTHelper { |
947
|
9
|
|
|
9
|
|
22
|
my ($self,$fth,$always_quote) = @_; |
948
|
9
|
|
50
|
|
|
118
|
$always_quote ||= 0; |
949
|
9
|
|
|
|
|
21
|
my ($start,$end) = ('?', '?'); |
950
|
|
|
|
|
|
|
|
951
|
9
|
50
|
33
|
|
|
53
|
if ( ! ref $fth || ! $fth->isa('Bio::SeqIO::FTHelper') ) { |
952
|
0
|
|
|
|
|
0
|
$fth->warn("$fth is not a FTHelper class. ". |
953
|
|
|
|
|
|
|
"Attempting to print, but there could be tears!"); |
954
|
|
|
|
|
|
|
} |
955
|
9
|
|
|
|
|
17
|
my $desc = ""; |
956
|
|
|
|
|
|
|
|
957
|
9
|
|
|
|
|
20
|
for my $tag ( qw(description gene note product) ) { |
958
|
9
|
50
|
|
|
|
26
|
if ( exists $fth->field->{$tag} ) { |
959
|
9
|
|
|
|
|
15
|
$desc = @{$fth->field->{$tag}}[0]."."; |
|
9
|
|
|
|
|
24
|
|
960
|
9
|
|
|
|
|
20
|
last; |
961
|
|
|
|
|
|
|
} |
962
|
|
|
|
|
|
|
} |
963
|
9
|
|
|
|
|
57
|
$desc =~ s/\.$//; |
964
|
|
|
|
|
|
|
|
965
|
9
|
|
|
|
|
21
|
my $ftid = ""; |
966
|
9
|
50
|
|
|
|
28
|
if ( exists $fth->field->{'FTId'} ) { |
967
|
0
|
|
|
|
|
0
|
$ftid = @{$fth->field->{'FTId'}}[0]. '.'; |
|
0
|
|
|
|
|
0
|
|
968
|
|
|
|
|
|
|
} |
969
|
|
|
|
|
|
|
|
970
|
9
|
|
|
|
|
26
|
my $key =substr($fth->key,0,8); |
971
|
9
|
|
|
|
|
23
|
my $loc = $fth->loc; |
972
|
9
|
100
|
|
|
|
59
|
if ( $loc =~ /(\?|\d+|\>\d+|<\d+)?\.\.(\?|\d+|<\d+|>\d+)?/ ) { |
|
|
50
|
|
|
|
|
|
973
|
6
|
50
|
|
|
|
27
|
$start = $1 if defined $1; |
974
|
6
|
50
|
|
|
|
31
|
$end = $2 if defined $2; |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
# to_FTString only returns one value when start == end, #JB955 |
977
|
|
|
|
|
|
|
# so if no match is found, assume it is both start and end #JB955 |
978
|
|
|
|
|
|
|
} elsif ( $loc =~ /join\((\d+)((?:,\d+)+)?\)/) { |
979
|
0
|
|
|
|
|
0
|
my @y = ($1); |
980
|
0
|
0
|
|
|
|
0
|
if ( defined( my $m = $2) ) { |
981
|
0
|
|
|
|
|
0
|
$m =~ s/^\,//; |
982
|
0
|
|
|
|
|
0
|
push @y, split(/,/,$m); |
983
|
|
|
|
|
|
|
} |
984
|
0
|
|
|
|
|
0
|
for my $x ( @y ) { |
985
|
0
|
|
|
|
|
0
|
$self->_write_line_swissprot_regex( |
986
|
|
|
|
|
|
|
sprintf("FT %-8s %6s %6s ", |
987
|
|
|
|
|
|
|
$key, |
988
|
|
|
|
|
|
|
$x ,$x), |
989
|
|
|
|
|
|
|
"FT ", |
990
|
|
|
|
|
|
|
$desc.'.','\s+|$',$LINE_LENGTH); |
991
|
|
|
|
|
|
|
} |
992
|
0
|
|
|
|
|
0
|
return; |
993
|
|
|
|
|
|
|
} else { |
994
|
3
|
|
|
|
|
13
|
$start = $end = $fth->loc; |
995
|
|
|
|
|
|
|
} |
996
|
9
|
50
|
|
|
|
19
|
if ($desc) { |
997
|
9
|
|
|
|
|
86
|
$self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s ", |
998
|
|
|
|
|
|
|
$key, |
999
|
|
|
|
|
|
|
$start ,$end), |
1000
|
|
|
|
|
|
|
"FT ", |
1001
|
|
|
|
|
|
|
$desc. '.', '\s+|$', $LINE_LENGTH); |
1002
|
|
|
|
|
|
|
} else { #HELIX and STRAND do not have descriptions |
1003
|
0
|
|
|
|
|
0
|
$self->_write_line_swissprot_regex(sprintf("FT %-8s %6s %6s", |
1004
|
|
|
|
|
|
|
$key, |
1005
|
|
|
|
|
|
|
$start ,$end), |
1006
|
|
|
|
|
|
|
"FT ", |
1007
|
|
|
|
|
|
|
' ', '\s+|$', $LINE_LENGTH); |
1008
|
|
|
|
|
|
|
} |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
|
1011
|
9
|
50
|
|
|
|
64
|
if ($ftid) { |
1012
|
0
|
|
|
|
|
0
|
$self->_write_line_swissprot_regex("FT ", |
1013
|
|
|
|
|
|
|
"FT ", |
1014
|
|
|
|
|
|
|
"/FTId=$ftid",'.|$',$LINE_LENGTH); |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
} |
1019
|
|
|
|
|
|
|
#' |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
=head2 _read_swissprot_References |
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
Title : _read_swissprot_References |
1024
|
|
|
|
|
|
|
Usage : |
1025
|
|
|
|
|
|
|
Function: Reads references from swissprot format. Internal function really |
1026
|
|
|
|
|
|
|
Example : |
1027
|
|
|
|
|
|
|
Returns : |
1028
|
|
|
|
|
|
|
Args : |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
=cut |
1032
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
sub _read_swissprot_References{ |
1034
|
30
|
|
|
30
|
|
97
|
my ($self,$line) = @_; |
1035
|
30
|
|
|
|
|
104
|
my ($b1, $b2, $rp, $rg, $title, $loc, $au, $med, $com, $pubmed, $doi); |
1036
|
30
|
|
|
|
|
0
|
my @refs; |
1037
|
30
|
|
|
|
|
71
|
local $_ = $line; |
1038
|
30
|
|
|
|
|
91
|
while ( defined $_ ) { |
1039
|
1813
|
100
|
100
|
|
|
10990
|
if ( /^[^R]/ || /^RN/ ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1040
|
257
|
100
|
|
|
|
473
|
if ( $rp ) { |
1041
|
227
|
100
|
|
|
|
417
|
$rg =~ s/;\s*$//g if defined($rg); |
1042
|
227
|
100
|
|
|
|
335
|
if (defined($au)) { |
1043
|
223
|
|
|
|
|
850
|
$au =~ s/;\s*$//; |
1044
|
|
|
|
|
|
|
} else { |
1045
|
4
|
|
|
|
|
10
|
$au = $rg; |
1046
|
|
|
|
|
|
|
} |
1047
|
227
|
100
|
|
|
|
883
|
$title =~ s/;\s*$//g if defined($title); |
1048
|
227
|
|
|
|
|
1483
|
push @refs, Bio::Annotation::Reference->new |
1049
|
|
|
|
|
|
|
(-title => $title, |
1050
|
|
|
|
|
|
|
-start => $b1, |
1051
|
|
|
|
|
|
|
-end => $b2, |
1052
|
|
|
|
|
|
|
-authors => $au, |
1053
|
|
|
|
|
|
|
-location=> $loc, |
1054
|
|
|
|
|
|
|
-medline => $med, |
1055
|
|
|
|
|
|
|
-pubmed => $pubmed, |
1056
|
|
|
|
|
|
|
-doi => $doi, |
1057
|
|
|
|
|
|
|
-comment => $com, |
1058
|
|
|
|
|
|
|
-rp => $rp, |
1059
|
|
|
|
|
|
|
-rg => $rg, |
1060
|
|
|
|
|
|
|
-tagname => 'reference', |
1061
|
|
|
|
|
|
|
); |
1062
|
|
|
|
|
|
|
# reset state for the next reference |
1063
|
227
|
|
|
|
|
514
|
$rp = ''; |
1064
|
|
|
|
|
|
|
} |
1065
|
257
|
100
|
|
|
|
658
|
if (index($_,'R') != 0) { |
1066
|
30
|
|
|
|
|
149
|
$self->_pushback($_); # want this line to go back on the list |
1067
|
30
|
|
|
|
|
91
|
last; # may be the safest exit point HL 05/11/2000 |
1068
|
|
|
|
|
|
|
} |
1069
|
|
|
|
|
|
|
# don't forget to reset the state for the next reference |
1070
|
227
|
|
|
|
|
390
|
$b1 = $b2 = $rg = $med = $com = $pubmed = $doi = undef; |
1071
|
227
|
|
|
|
|
284
|
$title = $loc = $au = undef; |
1072
|
|
|
|
|
|
|
} elsif ( /^RP\s{3}(.+? OF (\d+)-(\d+).*)/) { |
1073
|
35
|
|
|
|
|
133
|
$rp .= $1; |
1074
|
35
|
|
|
|
|
77
|
$b1 = $2; |
1075
|
35
|
|
|
|
|
93
|
$b2 = $3; |
1076
|
|
|
|
|
|
|
} elsif ( /^RP\s{3}(.*)/) { |
1077
|
198
|
100
|
|
|
|
388
|
if ($rp) { |
1078
|
6
|
|
|
|
|
23
|
$rp .= " ".$1; |
1079
|
|
|
|
|
|
|
} else { |
1080
|
192
|
|
|
|
|
508
|
$rp = $1; |
1081
|
|
|
|
|
|
|
} |
1082
|
|
|
|
|
|
|
} elsif (/^RX\s{3}(.*)/) { # each reference can have only one RX line |
1083
|
177
|
|
|
|
|
389
|
my $line = $1; |
1084
|
177
|
100
|
|
|
|
705
|
$med = $1 if $line =~ /MEDLINE=(\d+);/; |
1085
|
177
|
100
|
|
|
|
639
|
$pubmed = $1 if $line =~ /PubMed=(\d+);/; |
1086
|
177
|
100
|
|
|
|
443
|
$doi = $1 if $line =~ /DOI=(.+);/; |
1087
|
|
|
|
|
|
|
} elsif ( /^RA\s{3}(.*)/ ) { |
1088
|
382
|
100
|
|
|
|
1132
|
$au .= $au ? " $1" : $1; |
1089
|
|
|
|
|
|
|
} elsif ( /^RG\s{3}(.*)/ ) { |
1090
|
16
|
50
|
|
|
|
65
|
$rg .= $rg ? " $1" : $1; |
1091
|
|
|
|
|
|
|
} elsif ( /^RT\s{3}(.*)/ ) { |
1092
|
381
|
100
|
|
|
|
626
|
if ($title) { |
1093
|
177
|
|
|
|
|
317
|
my $tline = $1; |
1094
|
177
|
100
|
|
|
|
745
|
$title .= ($title =~ /[\w;,:\?!]$/) ? " $tline" : $tline; |
1095
|
|
|
|
|
|
|
} else { |
1096
|
204
|
|
|
|
|
377
|
$title = $1; |
1097
|
|
|
|
|
|
|
} |
1098
|
|
|
|
|
|
|
} elsif (/^RL\s{3}(.*)/ ) { |
1099
|
227
|
50
|
|
|
|
661
|
$loc .= $loc ? " $1" : $1; |
1100
|
|
|
|
|
|
|
} elsif ( /^RC\s{3}(.*)/ ) { |
1101
|
140
|
50
|
|
|
|
432
|
$com .= $com ? " $1" : $1; |
1102
|
|
|
|
|
|
|
} |
1103
|
1783
|
|
|
|
|
3386
|
$_ = $self->_readline; |
1104
|
|
|
|
|
|
|
} |
1105
|
30
|
|
|
|
|
129
|
return \@refs; |
1106
|
|
|
|
|
|
|
} |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
=head2 _read_swissprot_Species |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
Title : _read_swissprot_Species |
1112
|
|
|
|
|
|
|
Usage : |
1113
|
|
|
|
|
|
|
Function: Reads the swissprot Organism species and classification |
1114
|
|
|
|
|
|
|
lines. |
1115
|
|
|
|
|
|
|
Able to deal with unconventional species names. |
1116
|
|
|
|
|
|
|
Example : OS Unknown prokaryotic organism |
1117
|
|
|
|
|
|
|
$genus = undef ; $species = Unknown prokaryotic organism |
1118
|
|
|
|
|
|
|
Returns : A Bio::Species object |
1119
|
|
|
|
|
|
|
Args : |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
=cut |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
sub _read_swissprot_Species { |
1124
|
30
|
|
|
30
|
|
76
|
my( $self,$line ) = @_; |
1125
|
30
|
|
|
|
|
47
|
my $org; |
1126
|
30
|
|
|
|
|
75
|
local $_ = $line; |
1127
|
|
|
|
|
|
|
|
1128
|
30
|
|
|
|
|
76
|
my( $sub_species, $species, $genus, $common, $variant, $ncbi_taxid, $sci_name, $class_lines, $descr ); |
1129
|
30
|
|
|
|
|
53
|
my $osline = ""; |
1130
|
30
|
|
|
|
|
44
|
my $do_genus_check = 1; |
1131
|
30
|
|
|
|
|
95
|
while ( defined $_ ) { |
1132
|
183
|
100
|
|
|
|
526
|
last unless /^O[SCGX]/; |
1133
|
|
|
|
|
|
|
# believe it or not, but OS may come multiple times -- at this time |
1134
|
|
|
|
|
|
|
# we can't capture multiple species |
1135
|
153
|
100
|
100
|
|
|
978
|
if (/^OS\s+(\S.+)/ && (! defined($sci_name))) { |
|
|
100
|
100
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
1136
|
30
|
50
|
|
|
|
92
|
$osline .= " " if $osline; |
1137
|
30
|
|
|
|
|
107
|
$osline .= $1; |
1138
|
30
|
50
|
|
|
|
291
|
if ($osline =~ s/(,|, and|\.)$//) { |
1139
|
|
|
|
|
|
|
# OS lines are usually like: |
1140
|
|
|
|
|
|
|
# Homo sapiens (human) |
1141
|
|
|
|
|
|
|
# where we have $sci_name followed by $descr (common name) in |
1142
|
|
|
|
|
|
|
# brackets, but we can also have: |
1143
|
|
|
|
|
|
|
# Venerupis (Ruditapes) philippinarum |
1144
|
|
|
|
|
|
|
# where we have brackets but they don't indicate a $descr |
1145
|
30
|
50
|
|
|
|
147
|
if ($osline =~ /[^\(\)]+\(.+\)[^\(\)]+$/) { |
1146
|
|
|
|
|
|
|
#*** Danger! no idea if this will pick up some syntaxes for |
1147
|
|
|
|
|
|
|
# common names as well) |
1148
|
0
|
|
|
|
|
0
|
$sci_name = $osline; |
1149
|
0
|
|
|
|
|
0
|
$sci_name =~ s/\.$//; |
1150
|
0
|
|
|
|
|
0
|
$descr = ''; |
1151
|
0
|
|
|
|
|
0
|
$do_genus_check = 0; |
1152
|
|
|
|
|
|
|
} else { |
1153
|
30
|
|
|
|
|
166
|
($sci_name, $descr) = $osline =~ /(\S[^\(]+)(.*)/; |
1154
|
|
|
|
|
|
|
} |
1155
|
30
|
|
|
|
|
133
|
$sci_name =~ s/\s+$//; |
1156
|
|
|
|
|
|
|
|
1157
|
30
|
|
|
|
|
153
|
while ($descr =~ /\(([^\)]+)\)/g) { |
1158
|
13
|
|
|
|
|
33
|
my $item = $1; |
1159
|
|
|
|
|
|
|
# strain etc may not necessarily come first (yes, swissprot |
1160
|
|
|
|
|
|
|
# is messy) |
1161
|
13
|
50
|
33
|
|
|
323
|
if ((! defined($variant)) && |
|
|
50
|
33
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
(($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) || |
1163
|
|
|
|
|
|
|
($item =~ /^(biovar|pv\.|type\s+)/))) { |
1164
|
0
|
|
|
|
|
0
|
$variant = $item; |
1165
|
|
|
|
|
|
|
} elsif ($item =~ s/^subsp\.\s+//) { |
1166
|
0
|
0
|
|
|
|
0
|
if (! $sub_species) { |
|
|
0
|
|
|
|
|
|
1167
|
0
|
|
|
|
|
0
|
$sub_species = $item; |
1168
|
|
|
|
|
|
|
} elsif (! $variant) { |
1169
|
0
|
|
|
|
|
0
|
$variant = $item; |
1170
|
|
|
|
|
|
|
} |
1171
|
|
|
|
|
|
|
} elsif (! defined($common)) { |
1172
|
|
|
|
|
|
|
# we're only interested in the first common name |
1173
|
13
|
|
|
|
|
28
|
$common = $item; |
1174
|
13
|
50
|
33
|
|
|
82
|
if ((index($common, '(') >= 0) && |
1175
|
|
|
|
|
|
|
(index($common, ')') < 0)) { |
1176
|
0
|
|
|
|
|
0
|
$common .= ')'; |
1177
|
|
|
|
|
|
|
} |
1178
|
|
|
|
|
|
|
} |
1179
|
|
|
|
|
|
|
} |
1180
|
|
|
|
|
|
|
} |
1181
|
|
|
|
|
|
|
} elsif (s/^OC\s+(\S.+)$//) { |
1182
|
62
|
|
|
|
|
168
|
$class_lines .= $1; |
1183
|
|
|
|
|
|
|
} elsif (/^OG\s+(.*)/) { |
1184
|
0
|
|
|
|
|
0
|
$org = $1; |
1185
|
|
|
|
|
|
|
} elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) { |
1186
|
29
|
|
|
|
|
74
|
my $taxstring = $1; |
1187
|
|
|
|
|
|
|
# we only keep the first one and ignore all others |
1188
|
29
|
50
|
|
|
|
150
|
if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) { |
1189
|
29
|
|
|
|
|
78
|
$ncbi_taxid = $1; |
1190
|
|
|
|
|
|
|
} else { |
1191
|
0
|
|
|
|
|
0
|
$self->throw("$taxstring doesn't look like NCBI_TaxID"); |
1192
|
|
|
|
|
|
|
} |
1193
|
|
|
|
|
|
|
} |
1194
|
153
|
|
|
|
|
317
|
$_ = $self->_readline; |
1195
|
|
|
|
|
|
|
} |
1196
|
30
|
|
|
|
|
141
|
$self->_pushback($_); # pushback the last line because we need it |
1197
|
|
|
|
|
|
|
|
1198
|
30
|
50
|
|
|
|
98
|
$sci_name || return; |
1199
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
# if the organism belongs to taxid 32644 then no Bio::Species object. |
1201
|
30
|
50
|
|
|
|
89
|
return if grep { $_ eq $sci_name } @Unknown_names; |
|
330
|
|
|
|
|
524
|
|
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
# Convert data in classification lines into classification array. |
1204
|
|
|
|
|
|
|
# Remove trailing . then split on ';' or '.;' so that classification that is 2 |
1205
|
|
|
|
|
|
|
# or more words will still get matched, use map() to remove trailing/leading/intervening |
1206
|
|
|
|
|
|
|
# spaces |
1207
|
30
|
|
|
|
|
170
|
$class_lines=~s/\.\s*$//; |
1208
|
30
|
|
|
|
|
332
|
my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /[;\.]*;/, $class_lines; |
|
298
|
|
|
|
|
569
|
|
|
298
|
|
|
|
|
446
|
|
|
298
|
|
|
|
|
355
|
|
|
298
|
|
|
|
|
429
|
|
1209
|
|
|
|
|
|
|
|
1210
|
30
|
50
|
|
|
|
187
|
if ($class[0] =~ /viruses/i) { |
|
|
50
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
# viruses have different OS/OC syntax |
1212
|
0
|
|
|
|
|
0
|
my @virusnames = split(/\s+/, $sci_name); |
1213
|
0
|
0
|
|
|
|
0
|
$species = (@virusnames > 1) ? pop(@virusnames) : ''; |
1214
|
0
|
|
|
|
|
0
|
$genus = join(" ", @virusnames); |
1215
|
0
|
|
|
|
|
0
|
$sub_species = $descr; |
1216
|
|
|
|
|
|
|
} elsif ($do_genus_check) { |
1217
|
|
|
|
|
|
|
# do we have a genus? |
1218
|
30
|
|
|
|
|
80
|
my $possible_genus = $class[-1]; |
1219
|
30
|
50
|
|
|
|
125
|
$possible_genus .= "|$class[-2]" if $class[-2]; |
1220
|
30
|
50
|
|
|
|
1129
|
if ($sci_name =~ /^($possible_genus)/) { |
1221
|
30
|
|
|
|
|
104
|
$genus = $1; |
1222
|
30
|
|
|
|
|
374
|
($species) = $sci_name =~ /^$genus\s+(.+)/; |
1223
|
|
|
|
|
|
|
} else { |
1224
|
0
|
|
|
|
|
0
|
$species = $sci_name; |
1225
|
|
|
|
|
|
|
} |
1226
|
|
|
|
|
|
|
# is this organism of rank species or is it lower? |
1227
|
|
|
|
|
|
|
# (doesn't catch everything, but at least the guess isn't dangerous) |
1228
|
30
|
50
|
33
|
|
|
250
|
if ($species && $species =~ /subsp\.|var\./) { |
1229
|
0
|
|
|
|
|
0
|
($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/; |
1230
|
|
|
|
|
|
|
} |
1231
|
|
|
|
|
|
|
} |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
# Bio::Species array needs array in Species -> Kingdom direction |
1234
|
30
|
50
|
|
|
|
109
|
unless ($class[-1] eq $sci_name) { |
1235
|
30
|
|
|
|
|
79
|
push(@class, $sci_name); |
1236
|
|
|
|
|
|
|
} |
1237
|
30
|
|
|
|
|
61
|
@class = reverse @class; |
1238
|
|
|
|
|
|
|
|
1239
|
30
|
|
|
|
|
238
|
my $taxon = Bio::Species->new(); |
1240
|
30
|
|
|
|
|
149
|
$taxon->scientific_name($sci_name); |
1241
|
30
|
|
|
|
|
125
|
$taxon->classification(@class); |
1242
|
30
|
100
|
|
|
|
138
|
$taxon->common_name($common) if $common; |
1243
|
30
|
50
|
|
|
|
89
|
$taxon->sub_species($sub_species) if $sub_species; |
1244
|
30
|
50
|
|
|
|
88
|
$taxon->organelle($org) if $org; |
1245
|
30
|
100
|
|
|
|
186
|
$taxon->ncbi_taxid($ncbi_taxid) if $ncbi_taxid; |
1246
|
30
|
50
|
|
|
|
80
|
$taxon->variant($variant) if $variant; |
1247
|
|
|
|
|
|
|
|
1248
|
|
|
|
|
|
|
# done |
1249
|
30
|
|
|
|
|
208
|
return $taxon; |
1250
|
|
|
|
|
|
|
} |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
=head2 _filehandle |
1253
|
|
|
|
|
|
|
|
1254
|
|
|
|
|
|
|
Title : _filehandle |
1255
|
|
|
|
|
|
|
Usage : $obj->_filehandle($newval) |
1256
|
|
|
|
|
|
|
Function: |
1257
|
|
|
|
|
|
|
Example : |
1258
|
|
|
|
|
|
|
Returns : value of _filehandle |
1259
|
|
|
|
|
|
|
Args : newvalue (optional) |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
=cut |
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
# inherited from SeqIO.pm ! HL 05/11/2000 |
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
=head2 _read_FTHelper_swissprot |
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
Title : _read_FTHelper_swissprot |
1269
|
|
|
|
|
|
|
Usage : _read_FTHelper_swissprot(\$buffer) |
1270
|
|
|
|
|
|
|
Function: reads the next FT key line |
1271
|
|
|
|
|
|
|
Example : |
1272
|
|
|
|
|
|
|
Returns : Bio::SeqIO::FTHelper object |
1273
|
|
|
|
|
|
|
Args : |
1274
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
=cut |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
sub _read_FTHelper_swissprot { |
1279
|
443
|
|
|
443
|
|
802
|
my ($self,$line ) = @_; |
1280
|
|
|
|
|
|
|
# initial version implemented by HL 05/10/2000 |
1281
|
|
|
|
|
|
|
# FIXME this may not be perfect, so please review |
1282
|
|
|
|
|
|
|
# lots of cleaning up by JES 2004/07/01, still may not be perfect =) |
1283
|
|
|
|
|
|
|
# FTId now sepated from description as a qualifier |
1284
|
|
|
|
|
|
|
|
1285
|
443
|
|
|
|
|
720
|
local $_ = $line; |
1286
|
443
|
|
|
|
|
640
|
my ($key, # The key of the feature |
1287
|
|
|
|
|
|
|
$loc, # The location line from the feature |
1288
|
|
|
|
|
|
|
$desc, # The descriptive text |
1289
|
|
|
|
|
|
|
$ftid, # feature Id is like a qualifier but there can be only one of them |
1290
|
|
|
|
|
|
|
); |
1291
|
443
|
50
|
|
|
|
2154
|
if ( m/^FT\s{3}(\w+)\s+([\d\?\<]+)\s+([\d\?\>]+)\s*(.*)$/ox) { |
1292
|
443
|
|
|
|
|
1529
|
$key = $1; |
1293
|
443
|
|
|
|
|
713
|
my $loc1 = $2; |
1294
|
443
|
|
|
|
|
641
|
my $loc2 = $3; |
1295
|
443
|
|
|
|
|
922
|
$loc = "$loc1..$loc2"; |
1296
|
443
|
100
|
66
|
|
|
1913
|
if ($4 && (length($4) > 0)) { |
1297
|
384
|
|
|
|
|
662
|
$desc = $4; |
1298
|
384
|
|
|
|
|
725
|
chomp($desc); |
1299
|
|
|
|
|
|
|
} else { |
1300
|
59
|
|
|
|
|
96
|
$desc = ""; |
1301
|
|
|
|
|
|
|
} |
1302
|
|
|
|
|
|
|
} |
1303
|
|
|
|
|
|
|
|
1304
|
443
|
|
66
|
|
|
1099
|
while ( defined($_ = $self->_readline) && /^FT\s{20,}(\S.*)$/ ) { |
1305
|
312
|
|
|
|
|
951
|
my $continuation_line = $1; |
1306
|
312
|
100
|
|
|
|
1256
|
if ( $continuation_line =~ /.FTId=(.*)\./ ) { |
|
|
50
|
|
|
|
|
|
1307
|
272
|
|
|
|
|
570
|
$ftid=$1; |
1308
|
|
|
|
|
|
|
} |
1309
|
|
|
|
|
|
|
elsif ( $desc) { |
1310
|
40
|
|
|
|
|
115
|
$desc .= " $continuation_line"; |
1311
|
|
|
|
|
|
|
} else { |
1312
|
0
|
|
|
|
|
0
|
$desc = $continuation_line; |
1313
|
|
|
|
|
|
|
} |
1314
|
312
|
|
|
|
|
855
|
chomp $desc; |
1315
|
|
|
|
|
|
|
} |
1316
|
443
|
|
|
|
|
1351
|
$self->_pushback($_); |
1317
|
443
|
50
|
|
|
|
855
|
unless( $key ) { |
1318
|
|
|
|
|
|
|
# No feature key. What's this? |
1319
|
0
|
|
|
|
|
0
|
$self->warn("No feature key in putative feature table line: $line"); |
1320
|
0
|
|
|
|
|
0
|
return; |
1321
|
|
|
|
|
|
|
} |
1322
|
|
|
|
|
|
|
|
1323
|
|
|
|
|
|
|
# Make the new FTHelper object |
1324
|
443
|
|
|
|
|
1109
|
my $out = Bio::SeqIO::FTHelper->new(-verbose => $self->verbose()); |
1325
|
443
|
|
|
|
|
1331
|
$out->key($key); |
1326
|
443
|
|
|
|
|
1070
|
$out->loc($loc); |
1327
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
# store the description if there is one |
1329
|
443
|
100
|
66
|
|
|
1460
|
if ( $desc && length($desc) ) { |
1330
|
384
|
|
|
|
|
1749
|
$desc =~ s/\.$//; |
1331
|
384
|
|
|
|
|
733
|
push(@{$out->field->{"description"}}, $desc); |
|
384
|
|
|
|
|
842
|
|
1332
|
|
|
|
|
|
|
} |
1333
|
|
|
|
|
|
|
# Store the qualifier i.e. FTId |
1334
|
443
|
100
|
|
|
|
984
|
if ( $ftid ) { |
1335
|
272
|
|
|
|
|
340
|
push(@{$out->field->{"FTId"}}, $ftid); |
|
272
|
|
|
|
|
517
|
|
1336
|
|
|
|
|
|
|
} |
1337
|
443
|
|
|
|
|
1064
|
return $out; |
1338
|
|
|
|
|
|
|
} |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
=head2 _write_line_swissprot |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
Title : _write_line_swissprot |
1344
|
|
|
|
|
|
|
Usage : |
1345
|
|
|
|
|
|
|
Function: internal function |
1346
|
|
|
|
|
|
|
Example : |
1347
|
|
|
|
|
|
|
Returns : |
1348
|
|
|
|
|
|
|
Args : |
1349
|
|
|
|
|
|
|
|
1350
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
=cut |
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
sub _write_line_swissprot{ |
1354
|
0
|
|
|
0
|
|
0
|
my ($self,$pre1,$pre2,$line,$length) = @_; |
1355
|
|
|
|
|
|
|
|
1356
|
0
|
0
|
|
|
|
0
|
$length || $self->throw( "Miscalled write_line_swissprot without length. Programming error!"); |
1357
|
0
|
|
|
|
|
0
|
my $subl = $length - length $pre2; |
1358
|
0
|
|
|
|
|
0
|
my $linel = length $line; |
1359
|
0
|
|
|
|
|
0
|
my $i; |
1360
|
|
|
|
|
|
|
|
1361
|
0
|
|
|
|
|
0
|
my $sub = substr($line,0,$length - length $pre1); |
1362
|
|
|
|
|
|
|
|
1363
|
0
|
|
|
|
|
0
|
$self->_print( "$pre1$sub\n"); |
1364
|
|
|
|
|
|
|
|
1365
|
0
|
|
|
|
|
0
|
for ($i= ($length - length $pre1);$i < $linel;) { |
1366
|
0
|
|
|
|
|
0
|
$sub = substr($line,$i,($subl)); |
1367
|
0
|
|
|
|
|
0
|
$self->_print( "$pre2$sub\n"); |
1368
|
0
|
|
|
|
|
0
|
$i += $subl; |
1369
|
|
|
|
|
|
|
} |
1370
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
} |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
=head2 _write_line_swissprot_regex |
1374
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
Title : _write_line_swissprot_regex |
1376
|
|
|
|
|
|
|
Usage : |
1377
|
|
|
|
|
|
|
Function: internal function for writing lines of specified |
1378
|
|
|
|
|
|
|
length, with different first and the next line |
1379
|
|
|
|
|
|
|
left hand headers and split at specific points in the |
1380
|
|
|
|
|
|
|
text |
1381
|
|
|
|
|
|
|
Example : |
1382
|
|
|
|
|
|
|
Returns : nothing |
1383
|
|
|
|
|
|
|
Args : file handle, first header, second header, text-line, regex for line breaks, total line length |
1384
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
|
1386
|
|
|
|
|
|
|
=cut |
1387
|
|
|
|
|
|
|
|
1388
|
|
|
|
|
|
|
sub _write_line_swissprot_regex { |
1389
|
44
|
|
|
44
|
|
108
|
my ($self,$pre1,$pre2,$line,$regex,$length) = @_; |
1390
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
#print STDOUT "Going to print with $line!\n"; |
1392
|
|
|
|
|
|
|
|
1393
|
44
|
50
|
|
|
|
88
|
$length || $self->throw( "Miscalled write_line_swissprot without length. Programming error!"); |
1394
|
|
|
|
|
|
|
|
1395
|
44
|
50
|
|
|
|
94
|
if ( length $pre1 != length $pre2 ) { |
1396
|
0
|
|
|
|
|
0
|
$self->warn( "len 1 is ". length ($pre1) . " len 2 is ". length ($pre2) . "\n"); |
1397
|
0
|
|
|
|
|
0
|
$self->throw( "Programming error - cannot called write_line_swissprot_regex with different length \npre1 ($pre1) and \npre2 ($pre2) tags!"); |
1398
|
|
|
|
|
|
|
} |
1399
|
|
|
|
|
|
|
|
1400
|
44
|
|
|
|
|
82
|
my $subl = $length - (length $pre1) -1 ; |
1401
|
|
|
|
|
|
|
|
1402
|
44
|
|
|
|
|
56
|
my $first_line = 1; |
1403
|
44
|
|
|
|
|
544
|
while ($line =~ m/(.{1,$subl})($regex)/g) { |
1404
|
54
|
|
|
|
|
170
|
my $s = $1.$2; |
1405
|
54
|
100
|
100
|
|
|
269
|
$s =~ s/([\w\.])#(\w)/$1 $2/g # remove word wrap protection char '#' |
1406
|
|
|
|
|
|
|
if $pre1 eq "RA " or $pre1 eq "KW "; |
1407
|
|
|
|
|
|
|
# remove annoying extra spaces at the end of the wrapped lines |
1408
|
54
|
100
|
|
|
|
147
|
substr($s, -1, 1, '') if substr($s, -1, 1) eq ' '; |
1409
|
54
|
100
|
|
|
|
101
|
if ($first_line) { |
1410
|
44
|
|
|
|
|
169
|
$self->_print( "$pre1$s\n"); |
1411
|
44
|
|
|
|
|
259
|
$first_line = 0; |
1412
|
|
|
|
|
|
|
} else { |
1413
|
10
|
|
|
|
|
41
|
$self->_print( "$pre2$s\n"); |
1414
|
|
|
|
|
|
|
} |
1415
|
|
|
|
|
|
|
|
1416
|
|
|
|
|
|
|
} |
1417
|
|
|
|
|
|
|
} |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
=head2 _post_sort |
1420
|
|
|
|
|
|
|
|
1421
|
|
|
|
|
|
|
Title : _post_sort |
1422
|
|
|
|
|
|
|
Usage : $obj->_post_sort($newval) |
1423
|
|
|
|
|
|
|
Function: |
1424
|
|
|
|
|
|
|
Returns : value of _post_sort |
1425
|
|
|
|
|
|
|
Args : newvalue (optional) |
1426
|
|
|
|
|
|
|
|
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
=cut |
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
sub _post_sort{ |
1431
|
3
|
|
|
3
|
|
7
|
my $obj = shift; |
1432
|
3
|
50
|
|
|
|
19
|
if ( @_ ) { |
1433
|
0
|
|
|
|
|
0
|
my $value = shift; |
1434
|
0
|
|
|
|
|
0
|
$obj->{'_post_sort'} = $value; |
1435
|
|
|
|
|
|
|
} |
1436
|
3
|
|
|
|
|
12
|
return $obj->{'_post_sort'}; |
1437
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
} |
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
=head2 _show_dna |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
Title : _show_dna |
1443
|
|
|
|
|
|
|
Usage : $obj->_show_dna($newval) |
1444
|
|
|
|
|
|
|
Function: |
1445
|
|
|
|
|
|
|
Returns : value of _show_dna |
1446
|
|
|
|
|
|
|
Args : newvalue (optional) |
1447
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
|
1449
|
|
|
|
|
|
|
=cut |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
sub _show_dna{ |
1452
|
26
|
|
|
26
|
|
61
|
my $obj = shift; |
1453
|
26
|
100
|
|
|
|
97
|
if ( @_ ) { |
1454
|
23
|
|
|
|
|
49
|
my $value = shift; |
1455
|
23
|
|
|
|
|
71
|
$obj->{'_show_dna'} = $value; |
1456
|
|
|
|
|
|
|
} |
1457
|
26
|
|
|
|
|
74
|
return $obj->{'_show_dna'}; |
1458
|
|
|
|
|
|
|
|
1459
|
|
|
|
|
|
|
} |
1460
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
=head2 _id_generation_func |
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
Title : _id_generation_func |
1464
|
|
|
|
|
|
|
Usage : $obj->_id_generation_func($newval) |
1465
|
|
|
|
|
|
|
Function: |
1466
|
|
|
|
|
|
|
Returns : value of _id_generation_func |
1467
|
|
|
|
|
|
|
Args : newvalue (optional) |
1468
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
=cut |
1471
|
|
|
|
|
|
|
|
1472
|
|
|
|
|
|
|
sub _id_generation_func{ |
1473
|
4
|
|
|
4
|
|
9
|
my $obj = shift; |
1474
|
4
|
50
|
|
|
|
13
|
if ( @_ ) { |
1475
|
0
|
|
|
|
|
0
|
my $value = shift; |
1476
|
0
|
|
|
|
|
0
|
$obj->{'_id_generation_func'} = $value; |
1477
|
|
|
|
|
|
|
} |
1478
|
4
|
|
|
|
|
16
|
return $obj->{'_id_generation_func'}; |
1479
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
} |
1481
|
|
|
|
|
|
|
|
1482
|
|
|
|
|
|
|
=head2 _ac_generation_func |
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
Title : _ac_generation_func |
1485
|
|
|
|
|
|
|
Usage : $obj->_ac_generation_func($newval) |
1486
|
|
|
|
|
|
|
Function: |
1487
|
|
|
|
|
|
|
Returns : value of _ac_generation_func |
1488
|
|
|
|
|
|
|
Args : newvalue (optional) |
1489
|
|
|
|
|
|
|
|
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
=cut |
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
sub _ac_generation_func{ |
1494
|
4
|
|
|
4
|
|
17
|
my $obj = shift; |
1495
|
4
|
50
|
|
|
|
18
|
if ( @_ ) { |
1496
|
0
|
|
|
|
|
0
|
my $value = shift; |
1497
|
0
|
|
|
|
|
0
|
$obj->{'_ac_generation_func'} = $value; |
1498
|
|
|
|
|
|
|
} |
1499
|
4
|
|
|
|
|
47
|
return $obj->{'_ac_generation_func'}; |
1500
|
|
|
|
|
|
|
|
1501
|
|
|
|
|
|
|
} |
1502
|
|
|
|
|
|
|
|
1503
|
|
|
|
|
|
|
=head2 _sv_generation_func |
1504
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
Title : _sv_generation_func |
1506
|
|
|
|
|
|
|
Usage : $obj->_sv_generation_func($newval) |
1507
|
|
|
|
|
|
|
Function: |
1508
|
|
|
|
|
|
|
Returns : value of _sv_generation_func |
1509
|
|
|
|
|
|
|
Args : newvalue (optional) |
1510
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
=cut |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
sub _sv_generation_func{ |
1515
|
0
|
|
|
0
|
|
0
|
my $obj = shift; |
1516
|
0
|
0
|
|
|
|
0
|
if ( @_ ) { |
1517
|
0
|
|
|
|
|
0
|
my $value = shift; |
1518
|
0
|
|
|
|
|
0
|
$obj->{'_sv_generation_func'} = $value; |
1519
|
|
|
|
|
|
|
} |
1520
|
0
|
|
|
|
|
0
|
return $obj->{'_sv_generation_func'}; |
1521
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
} |
1523
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
=head2 _kw_generation_func |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
Title : _kw_generation_func |
1527
|
|
|
|
|
|
|
Usage : $obj->_kw_generation_func($newval) |
1528
|
|
|
|
|
|
|
Function: |
1529
|
|
|
|
|
|
|
Returns : value of _kw_generation_func |
1530
|
|
|
|
|
|
|
Args : newvalue (optional) |
1531
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
|
1533
|
|
|
|
|
|
|
=cut |
1534
|
|
|
|
|
|
|
|
1535
|
|
|
|
|
|
|
sub _kw_generation_func{ |
1536
|
4
|
|
|
4
|
|
10
|
my $obj = shift; |
1537
|
4
|
50
|
|
|
|
18
|
if ( @_ ) { |
1538
|
0
|
|
|
|
|
0
|
my $value = shift; |
1539
|
0
|
|
|
|
|
0
|
$obj->{'_kw_generation_func'} = $value; |
1540
|
|
|
|
|
|
|
} |
1541
|
4
|
|
|
|
|
44
|
return $obj->{'_kw_generation_func'}; |
1542
|
|
|
|
|
|
|
|
1543
|
|
|
|
|
|
|
} |
1544
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
1; |