line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::DB::NCBIHelper |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Jason Stajich |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Jason Stajich |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
# |
14
|
|
|
|
|
|
|
# Interfaces with new WebDBSeqI interface |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 NAME |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
Bio::DB::NCBIHelper - A collection of routines useful for queries to |
19
|
|
|
|
|
|
|
NCBI databases. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# Do not use this module directly. |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# get a Bio::DB::NCBIHelper object somehow |
26
|
|
|
|
|
|
|
my $seqio = $db->get_Stream_by_acc(['J00522']); |
27
|
|
|
|
|
|
|
while ( my $seq = $seqio->next_seq ) { |
28
|
|
|
|
|
|
|
# process seq |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=head1 DESCRIPTION |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Provides a single place to setup some common methods for querying NCBI |
34
|
|
|
|
|
|
|
web databases. This module just centralizes the methods for |
35
|
|
|
|
|
|
|
constructing a URL for querying NCBI GenBank and NCBI GenPept and the |
36
|
|
|
|
|
|
|
common HTML stripping done in L<postprocess_data>(). |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
The base NCBI query URL used is: |
39
|
|
|
|
|
|
|
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head1 FEEDBACK |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=head2 Mailing Lists |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
User feedback is an integral part of the |
46
|
|
|
|
|
|
|
evolution of this and other Bioperl modules. Send |
47
|
|
|
|
|
|
|
your comments and suggestions preferably to one |
48
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation |
49
|
|
|
|
|
|
|
is much appreciated. |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
52
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head2 Support |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
I<bioperl-l@bioperl.org> |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
61
|
|
|
|
|
|
|
responsive experts will be able to look at the problem and quickly |
62
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
63
|
|
|
|
|
|
|
with code and data examples if at all possible. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=head2 Reporting Bugs |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to |
68
|
|
|
|
|
|
|
help us keep track of the bugs and their resolution. |
69
|
|
|
|
|
|
|
Bug reports can be submitted via the web. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=head1 AUTHOR - Jason Stajich |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
Email jason@bioperl.org |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head1 APPENDIX |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
The rest of the documentation details each of the |
80
|
|
|
|
|
|
|
object methods. Internal methods are usually |
81
|
|
|
|
|
|
|
preceded with a _ |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=cut |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
# Let the code begin... |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
package Bio::DB::NCBIHelper; |
88
|
3
|
|
|
3
|
|
19
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
82
|
|
89
|
|
|
|
|
|
|
|
90
|
3
|
|
|
3
|
|
732
|
use Bio::DB::Query::GenBank; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
78
|
|
91
|
3
|
|
|
3
|
|
19
|
use HTTP::Request::Common; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
147
|
|
92
|
3
|
|
|
3
|
|
16
|
use URI; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
48
|
|
93
|
3
|
|
|
3
|
|
13
|
use Bio::Root::IO; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
51
|
|
94
|
3
|
|
|
3
|
|
301
|
use Bio::DB::RefSeq; |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
76
|
|
95
|
3
|
|
|
3
|
|
21
|
use URI::Escape qw(uri_unescape); |
|
3
|
|
|
|
|
7
|
|
|
3
|
|
|
|
|
151
|
|
96
|
|
|
|
|
|
|
|
97
|
3
|
|
|
3
|
|
16
|
use base qw(Bio::DB::WebDBSeqI Bio::Root::Root); |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
4572
|
|
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
# Host that serves every NCBI E-utilities endpoint used by this module.
our $HOSTBASE = 'https://eutils.ncbi.nlm.nih.gov';

# Entry limit (not referenced by any sub in this file; kept for callers
# and subclasses that read it).
our $MAX_ENTRIES = 19000;

# Number of seconds reported by delay_policy().
our $REQUEST_DELAY = 3;

# Retrieval mode => [ HTTP method, CGI path appended to $HOSTBASE ].
# get_request() reads element 0 to decide between POST and GET and
# element 1 to build the URL.
our %CGILOCATION = (
    batch   => [ post => '/entrez/eutils/epost.fcgi' ],
    query   => [ get  => '/entrez/eutils/efetch.fcgi' ],
    single  => [ get  => '/entrez/eutils/efetch.fcgi' ],
    version => [ get  => '/entrez/eutils/efetch.fcgi' ],
    gi      => [ get  => '/entrez/eutils/efetch.fcgi' ],
    webenv  => [ get  => '/entrez/eutils/efetch.fcgi' ],
);

# Requested retrieval format => name of the corresponding SeqIO format;
# consulted by request_format().
our %FORMATMAP = (
    gb          => 'genbank',
    gp          => 'genbank',
    fasta       => 'fasta',
    'asn.1'     => 'entrezgene',
    gbwithparts => 'genbank',
);

# Value returned by default_format().
our $DEFAULTFORMAT = 'gb';
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head2 new |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Title : new |
122
|
|
|
|
|
|
|
Usage : |
123
|
|
|
|
|
|
|
Function: the new way to make modules a little more lightweight |
124
|
|
|
|
|
|
|
Returns : |
125
|
|
|
|
|
|
|
Args : |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=cut |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub new {
    # Constructor: delegates object creation to the parent class, then
    # applies the NCBI-specific named arguments through their accessors.
    #
    # Recognised named arguments: -seq_start, -seq_stop, -no_redirect,
    # -redirect_refseq, -complexity, -strand.
    # Returns the new object.
    my ( $class, @args ) = @_;
    my $self = $class->SUPER::new(@args);

    my ( $seq_start, $seq_stop, $no_redirect, $redirect, $complexity, $strand )
        = $self->_rearrange(
            [ qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND) ],
            @args
        );

    # Each of these is applied only when a true value was supplied.
    $self->seq_start($seq_start)      if $seq_start;
    $self->seq_stop($seq_stop)        if $seq_stop;
    $self->no_redirect($no_redirect)  if $no_redirect;
    $self->redirect_refseq($redirect) if $redirect;
    $self->strand($strand)            if $strand;

    # Complexity is tested with defined() because 0 is an accepted value;
    # values outside 0..4 are skipped here rather than passed on.
    if ( defined $complexity && $complexity >= 0 && $complexity <= 4 ) {
        $self->complexity($complexity);
    }

    return $self;
}
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=head2 get_params |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
Title : get_params |
156
|
|
|
|
|
|
|
Usage : my %params = $self->get_params($mode) |
157
|
|
|
|
|
|
|
Function: returns key,value pairs to be passed to NCBI database |
158
|
|
|
|
|
|
|
for either 'batch' or 'single' sequence retrieval method |
159
|
|
|
|
|
|
|
Returns : a key,value pair hash |
160
|
|
|
|
|
|
|
Args : 'single' or 'batch' mode for retrieval |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=cut |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
sub get_params {
    # Placeholder that subclasses are expected to override; this base
    # version ignores the retrieval mode argument and always throws.
    my $self = shift;
    return $self->throw("subclass did not implement get_params");
}
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head2 default_format |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
Title : default_format |
172
|
|
|
|
|
|
|
Usage : my $format = $self->default_format |
173
|
|
|
|
|
|
|
Function: returns default sequence format for this module |
174
|
|
|
|
|
|
|
Returns : string |
175
|
|
|
|
|
|
|
Args : none |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=cut |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
sub default_format {
    # Returns the package-level default retrieval format ($DEFAULTFORMAT).
    # Takes no arguments beyond the invocant, which is not used.
    my $format = $DEFAULTFORMAT;
    return $format;
}
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head2 get_request |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Title : get_request |
186
|
|
|
|
|
|
|
Usage : my $url = $self->get_request |
187
|
|
|
|
|
|
|
Function: HTTP::Request |
188
|
|
|
|
|
|
|
Returns : |
189
|
|
|
|
|
|
|
Args : %qualifiers = a hash of qualifiers (ids, format, etc) |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=cut |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub get_request {
    # Build the HTTP::Request used to fetch records.
    #
    # Named arguments (all optional unless noted):
    #   -mode        retrieval mode, a key of %CGILOCATION; defaults to 'single'
    #   -uids        a single id or an array reference of ids
    #   -query       a query object; one that can('cookie') is fetched by
    #                WebEnv/query_key, otherwise by its ids()
    #   -format      retrieval format; defaults to the first element of
    #                request_format()
    #   -seq_start, -seq_stop, -strand, -complexity
    #                copied into the request parameters when set
    # Unless the mode is 'webenv', one of -uids or -query must be defined.
    #
    # Returns a GET request. For a mode whose method is 'post' the ids are
    # first posted, the response is handed to _parse_response(), and the
    # method calls itself again in 'webenv' mode to build the GET request.
    # Throws when get_params() yields nothing for the mode, or when neither
    # ids nor a query were supplied.
    my ( $self, @qualifiers ) = @_;
    my ( $mode, $uids, $format, $query, $seq_start, $seq_stop, $strand,
        $complexity )
        = $self->_rearrange(
            [qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
            @qualifiers
        );

    # Apply the default before lower-casing: lc() of an undefined value
    # yields '', which made the old "not defined" test unreachable.
    $mode = ( defined $mode && $mode ne '' ) ? lc $mode : 'single';
    ($format) = $self->request_format() unless defined $format;

    my %params = $self->get_params($mode);
    if ( !%params ) {
        $self->throw(
            "must specify a valid retrieval mode 'single' or 'batch' not '$mode'"
        );
    }

    my $url = URI->new( $HOSTBASE . $CGILOCATION{$mode}[1] );
    unless ( $mode eq 'webenv' || defined $uids || defined $query ) {
        $self->throw("Must specify a query or list of uids to fetch");
    }

    if ( $query && $query->can('cookie') ) {
        @params{ 'WebEnv', 'query_key' } = $query->cookie;
        $params{'db'} = $query->db;
    }
    elsif ($query) {
        $params{'id'} = join ',', $query->ids;
    }

    # for batch retrieval, non-query style
    elsif ( $mode eq 'webenv' && $self->can('cookie') ) {
        @params{ 'WebEnv', 'query_key' } = $self->cookie;
    }
    elsif ($uids) {
        if ( ref($uids) =~ /array/i ) {
            $uids = join( ",", @$uids );
        }
        $params{'id'} = $uids;
    }

    $seq_start && ( $params{'seq_start'} = $seq_start );
    $seq_stop  && ( $params{'seq_stop'}  = $seq_stop );
    $strand    && ( $params{'strand'}    = $strand );

    if ( defined $complexity && ( $seq_start || $seq_stop || $strand ) ) {
        $self->warn(
            "Complexity set to $complexity; seq_start and seq_stop may not work!"
        ) if ( $complexity != 1 && ( $seq_start || $seq_stop ) );

        # $strand may be undefined here, so test that before comparing.
        $self->warn(
            "Complexity set to 0; expect strange results with strand set to 2"
        ) if ( $complexity == 0
            && defined $strand
            && $strand == 2
            && $format eq 'fasta' );
    }
    defined $complexity && ( $params{'complexity'} = $complexity );
    $params{'rettype'} = $format unless $mode eq 'batch';

    # for now, 'post' is batch retrieval
    if ( $CGILOCATION{$mode}[0] eq 'post' ) {

        # The proxy credentials must be on the request before it is sent;
        # previously they were added to the response, which had no effect.
        my $post = POST $url, [%params];
        $post->proxy_authorization_basic( $self->authentication )
            if ( $self->authentication );
        my $response = $self->ua->request($post);

        # NOTE(review): the response status is not checked before parsing;
        # confirm whether a failed POST should throw here.
        $self->_parse_response( $response->content );

        my %qualifiers = (
            '-mode'       => 'webenv',
            '-seq_start'  => $seq_start,
            '-seq_stop'   => $seq_stop,
            '-strand'     => $strand,
            '-complexity' => $complexity,
            '-format'     => $format
        );
        return $self->get_request(%qualifiers);
    }
    else {
        $url->query_form(%params);
        return GET $url;
    }
}
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head2 get_Stream_by_batch |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Title : get_Stream_by_batch |
272
|
|
|
|
|
|
|
Usage : $seq = $db->get_Stream_by_batch($ref); |
273
|
|
|
|
|
|
|
Function: Retrieves Seq objects from Entrez 'en masse', rather than one |
274
|
|
|
|
|
|
|
at a time. For large numbers of sequences, this is far superior |
275
|
|
|
|
|
|
|
to get_Stream_by_id or get_Stream_by_acc. |
276
|
|
|
|
|
|
|
Example : |
277
|
|
|
|
|
|
|
Returns : a Bio::SeqIO stream object |
278
|
|
|
|
|
|
|
Args : $ref : either an array reference, a filename, or a filehandle |
279
|
|
|
|
|
|
|
from which to get the list of unique ids/accession numbers. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
NOTE: deprecated API. Use get_Stream_by_id() instead. |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
*get_Stream_by_batch = sub {
    # Deprecated entry point: reports the deprecation through
    # $self->deprecated() and then forwards every argument unchanged to
    # get_Stream_by_id(), returning whatever that returns.
    my ( $self, @args ) = @_;
    $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
    return $self->get_Stream_by_id(@args);
};
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
=head2 get_Stream_by_query |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
Title : get_Stream_by_query |
294
|
|
|
|
|
|
|
Usage : $seq = $db->get_Stream_by_query($query); |
295
|
|
|
|
|
|
|
Function: Retrieves Seq objects from Entrez 'en masse', rather than one |
296
|
|
|
|
|
|
|
at a time. For large numbers of sequences, this is far superior |
297
|
|
|
|
|
|
|
to get_Stream_by_id and get_Stream_by_acc. |
298
|
|
|
|
|
|
|
Example : |
299
|
|
|
|
|
|
|
Returns : a Bio::SeqIO stream object |
300
|
|
|
|
|
|
|
Args : An Entrez query string or a Bio::DB::Query::GenBank object. |
301
|
|
|
|
|
|
|
It is suggested that you create a Bio::DB::Query::GenBank object and get |
302
|
|
|
|
|
|
|
the entry count before you fetch a potentially large stream. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=cut |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
sub get_Stream_by_query {
    # Fetch a sequence stream for a query.
    #
    # $query may be an object that responds to query(); anything else is
    # handed to Bio::DB::Query::GenBank->new() to build one first.
    # Returns the result of get_seq_stream() called in 'query' mode.
    my ( $self, $query ) = @_;

    my $is_query_object = ref $query && $query->can('query');
    $query = Bio::DB::Query::GenBank->new($query) if !$is_query_object;

    return $self->get_seq_stream( '-query' => $query, '-mode' => 'query' );
}
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=head2 postprocess_data |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
Title : postprocess_data |
317
|
|
|
|
|
|
|
Usage : $self->postprocess_data ( 'type' => 'string', |
318
|
|
|
|
|
|
|
'location' => \$datastr ); |
319
|
|
|
|
|
|
|
Function: Process downloaded data before loading into a Bio::SeqIO. This |
320
|
|
|
|
|
|
|
works for Genbank and Genpept, other classes should override |
321
|
|
|
|
|
|
|
it with their own method. |
322
|
|
|
|
|
|
|
Returns : void |
323
|
|
|
|
|
|
|
Args : hash with two keys: |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
'type' can be 'string' or 'file' |
326
|
|
|
|
|
|
|
'location' either file location or string reference containing data |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=cut |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
0
|
1
|
|
sub postprocess_data {
    # Deliberate no-op: kept so that postprocessing can be added at a
    # future date. Arguments are ignored and nothing is returned.
    return;
}
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=head2 request_format |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
Title : request_format |
338
|
|
|
|
|
|
|
Usage : my ($req_format, $ioformat) = $self->request_format; |
339
|
|
|
|
|
|
|
$self->request_format("genbank"); |
340
|
|
|
|
|
|
|
$self->request_format("fasta"); |
341
|
|
|
|
|
|
|
Function: Get/Set sequence format retrieval. The get-form will normally not |
342
|
|
|
|
|
|
|
be used outside of this and derived modules. |
343
|
|
|
|
|
|
|
Returns : Array of two strings, the first representing the format for |
344
|
|
|
|
|
|
|
retrieval, and the second specifying the corresponding SeqIO format. |
345
|
|
|
|
|
|
|
Args : $format = sequence format |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
=cut |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
sub request_format {
    # Get/set the retrieval format.
    #
    # Args    : $value (optional) - format name; it is lower-cased before
    #           being stored.
    # Returns : a two-element list: the retrieval format, and the SeqIO
    #           format it maps to in %FORMATMAP. A name that has no entry
    #           in %FORMATMAP is used unchanged for both elements.
    #
    # When called as a getter before any format has been set, the pair is
    # derived from default_format() instead of dereferencing an undefined
    # value (which is fatal under "use strict").
    my ( $self, $value ) = @_;

    if ( defined $value ) {
        my $rettype = lc $value;
        my $ioformat
            = defined $FORMATMAP{$rettype} ? $FORMATMAP{$rettype} : $rettype;
        $self->{'_format'} = [ $rettype, $ioformat ];
    }

    if ( !defined $self->{'_format'} ) {

        # Nothing stored yet: report the default without storing it.
        my $default = lc $self->default_format;
        my @fallback = (
            $default,
            defined $FORMATMAP{$default} ? $FORMATMAP{$default} : $default
        );
        return @fallback;
    }

    return @{ $self->{'_format'} };
}
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=head2 redirect_refseq |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
Title : redirect_refseq |
369
|
|
|
|
|
|
|
Usage : $db->redirect_refseq(1) |
370
|
|
|
|
|
|
|
Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq |
371
|
|
|
|
|
|
|
Returns : Boolean value |
372
|
|
|
|
|
|
|
Args : Boolean value (optional) |
373
|
|
|
|
|
|
|
Throws : 'unparseable output exception' |
374
|
|
|
|
|
|
|
Note : This replaces 'no_redirect' as a more straightforward flag to |
375
|
|
|
|
|
|
|
redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface) |
376
|
|
|
|
|
|
|
instead of retrieving the NCBI records |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=cut |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
sub redirect_refseq {
    # Get/set the flag read by _check_id() to decide whether RefSeq-looking
    # ids are redirected to Bio::DB::RefSeq.
    #
    # Args    : new value (optional); any supplied argument is stored,
    #           including false ones.
    # Returns : the stored value.
    my ( $self, @args ) = @_;
    $self->{'_redirect_refseq'} = $args[0] if @args;
    return $self->{'_redirect_refseq'};
}
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head2 complexity |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Title : complexity |
389
|
|
|
|
|
|
|
Usage : $db->complexity(3) |
390
|
|
|
|
|
|
|
Function: get/set complexity value |
391
|
|
|
|
|
|
|
Returns : value from 0-4 indicating level of complexity |
392
|
|
|
|
|
|
|
Args : value from 0-4 (optional); if unset server assumes 1 |
393
|
|
|
|
|
|
|
Throws : if arg is not an integer or falls outside of noted range above |
394
|
|
|
|
|
|
|
Note : From efetch docs, the complexity regulates the display: |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
0 - get the whole blob |
397
|
|
|
|
|
|
|
1 - get the bioseq for gi of interest (default in Entrez) |
398
|
|
|
|
|
|
|
2 - get the minimal bioseq-set containing the gi of interest |
399
|
|
|
|
|
|
|
3 - get the minimal nuc-prot containing the gi of interest |
400
|
|
|
|
|
|
|
4 - get the minimal pub-set containing the gi of interest |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=cut |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
sub complexity {
    # Get/set the complexity value.
    #
    # Args    : $comp (optional) - integer from 0 to 4; 0 is accepted
    #           because the argument is tested with defined().
    # Returns : the stored complexity value.
    # Throws  : when a defined argument is not all digits or is outside 0..4.
    my $self = shift;
    my ($comp) = @_;

    return $self->{'_complexity'} unless defined $comp;

    my $in_range = $comp =~ /^\d+$/ && $comp >= 0 && $comp <= 4;
    $self->throw("Complexity value must be integer between 0 and 4")
        unless $in_range;

    $self->{'_complexity'} = $comp;
    return $self->{'_complexity'};
}
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head2 strand |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Title : strand |
417
|
|
|
|
|
|
|
Usage : $db->strand(1) |
418
|
|
|
|
|
|
|
Function: get/set strand value |
419
|
|
|
|
|
|
|
Returns : strand value if set |
420
|
|
|
|
|
|
|
Args : value of 1 (plus) or 2 (minus); if unset server assumes 1 |
421
|
|
|
|
|
|
|
Throws : if arg is not an integer or is not 1 or 2 |
422
|
|
|
|
|
|
|
Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus 0 = not relevant. |
423
|
|
|
|
|
|
|
We should probably add in some functionality to convert over in the future. |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=cut |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
sub strand {
    # Get/set the strand value.
    #
    # Args    : $str (optional) - 1 (plus strand) or 2 (minus strand).
    #           A false argument (undef, 0, '') is treated as a plain get.
    # Returns : the stored strand value.
    # Throws  : when a true argument is not all digits or is not 1 or 2.
    my ( $self, $str ) = @_;

    return $self->{'_strand'} if !$str;

    my $is_valid = $str =~ /^\d+$/ && $str >= 1 && $str <= 2;
    if ( !$is_valid ) {
        $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set");
    }

    $self->{'_strand'} = $str;
    return $self->{'_strand'};
}
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
=head2 seq_start |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
Title : seq_start |
440
|
|
|
|
|
|
|
Usage : $db->seq_start(123) |
441
|
|
|
|
|
|
|
Function: get/set sequence start location |
442
|
|
|
|
|
|
|
Returns : sequence start value if set |
443
|
|
|
|
|
|
|
Args : integer; if unset server assumes 1 |
444
|
|
|
|
|
|
|
Throws : if arg is not an integer |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=cut |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub seq_start {
    # Get/set the sequence start location.
    #
    # Args    : $start (optional) - string of digits. A false argument
    #           (undef, 0, '') is treated as a plain get.
    # Returns : the stored start value.
    # Throws  : when a true argument is not made up of digits only.
    my ( $self, $start ) = @_;

    if ($start) {
        unless ( $start =~ /^\d+$/ ) {
            $self->throw("seq_start() must be integer value if set");
        }
        $self->{'_seq_start'} = $start;
    }

    return $self->{'_seq_start'};
}
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=head2 seq_stop |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
Title : seq_stop |
461
|
|
|
|
|
|
|
Usage : $db->seq_stop(456) |
462
|
|
|
|
|
|
|
Function: get/set sequence stop (end) location |
463
|
|
|
|
|
|
|
Returns : sequence stop (end) value if set |
464
|
|
|
|
|
|
|
Args : integer; if unset server assumes 1 |
465
|
|
|
|
|
|
|
Throws : if arg is not an integer |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=cut |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
sub seq_stop {
    # Get/set the sequence stop (end) location.
    #
    # Args    : $stop (optional) - string of digits. A false argument
    #           (undef, 0, '') is treated as a plain get.
    # Returns : the stored stop value.
    # Throws  : when a true argument is not made up of digits only.
    my $self = shift;
    my ($stop) = @_;

    return $self->{'_seq_stop'} unless $stop;

    if ( $stop !~ /^\d+$/ ) {
        $self->throw("seq_stop() must be integer if set");
    }
    $self->{'_seq_stop'} = $stop;

    return $self->{'_seq_stop'};
}
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=head2 Bio::DB::WebDBSeqI methods |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
Overriding WebDBSeqI method to help newbies to retrieve sequences |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
=head2 get_Stream_by_acc |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
Title : get_Stream_by_acc |
486
|
|
|
|
|
|
|
Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]); |
487
|
|
|
|
|
|
|
Function: gets a series of Seq objects by accession numbers |
488
|
|
|
|
|
|
|
Returns : a Bio::SeqIO stream object |
489
|
|
|
|
|
|
|
Args : $ref : a reference to an array of accession numbers for |
490
|
|
|
|
|
|
|
the desired sequence entries |
491
|
|
|
|
|
|
|
Note : For GenBank, this just calls the same code for get_Stream_by_id() |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=cut |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
sub get_Stream_by_acc {
    # Fetch a sequence stream for the given accession(s).
    #
    # The ids are first passed to _check_id(). When that returns a
    # Bio::DB::RefSeq object the stream is requested from it; otherwise the
    # stream is requested from this object. Either way get_seq_stream() is
    # called with the ids in 'single' mode and its result is returned.
    my ( $self, $ids ) = @_;

    my $newdb = $self->_check_id($ids);
    my $use_refseq
        = defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq');
    my $source = $use_refseq ? $newdb : $self;

    return $source->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
}
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=head2 _check_id |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
Title : _check_id |
509
|
|
|
|
|
|
|
Usage : |
510
|
|
|
|
|
|
|
Function: |
511
|
|
|
|
|
|
|
Returns : a Bio::DB::RefSeq reference or throws |
512
|
|
|
|
|
|
|
Args : $id(s), $string |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=cut |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub _check_id {
    # Screen the requested id(s) before retrieval.
    #
    # Args    : $ids - the id(s) to check; the value is matched as a string.
    # Returns : a new Bio::DB::RefSeq object when redirect_refseq() is true
    #           and the ids match /N._/; otherwise nothing.
    # Throws  : when the ids contain "NT_".
    #
    # NOTE(review): get_Stream_by_acc() is documented as taking an array
    # reference; such a value is matched here in its stringified form, so
    # neither check sees the individual ids. Confirm the intended handling
    # before changing it.
    my ( $self, $ids ) = @_;

    # NT contigs can not be retrieved. The trailing space on the first
    # fragment keeps "regular" and "database" apart in the message.
    $self->throw(
        "NT_ contigs are whole chromosome files which are not part of regular "
          . "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/." )
        if $ids =~ /NT_/;

    # Asking for a RefSeq from EMBL/GenBank
    if ( $self->redirect_refseq && $ids =~ /N._/ ) {
        $self->warn(
            "[$ids] is not a normal sequence database but a RefSeq entry."
              . " Redirecting the request.\n" )
            if $self->verbose >= 0;
        return Bio::DB::RefSeq->new();
    }

    # No redirection: return nothing explicitly instead of leaking the
    # value of the last condition evaluated.
    return;
}
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head2 delay_policy |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Title : delay_policy |
540
|
|
|
|
|
|
|
Usage : $secs = $self->delay_policy |
541
|
|
|
|
|
|
|
Function: NCBI requests a delay of 3 seconds between requests. This method |
542
|
|
|
|
|
|
|
implements that policy. |
543
|
|
|
|
|
|
|
Returns : number of seconds to delay |
544
|
|
|
|
|
|
|
Args : none |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
=cut |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
sub delay_policy {
    # Returns the number of seconds to wait between requests, taken from
    # the package variable $REQUEST_DELAY. The invocant is not used.
    my ($self) = @_;
    return $REQUEST_DELAY;
}
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
=head2 cookie |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
Title : cookie |
556
|
|
|
|
|
|
|
Usage : ($cookie,$querynum) = $db->cookie |
557
|
|
|
|
|
|
|
Function: return the NCBI query cookie, this information is used by |
558
|
|
|
|
|
|
|
Bio::DB::GenBank in conjunction with efetch, ripped from |
559
|
|
|
|
|
|
|
Bio::DB::Query::GenBank |
560
|
|
|
|
|
|
|
Returns : list of (cookie,querynum) |
561
|
|
|
|
|
|
|
Args : none |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=cut |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
sub cookie {
    # Get/set the stored cookie and query number.
    #
    # With no arguments: returns the list (cookie, querynum).
    # With arguments: stores the first as the cookie and the second as the
    # query number; the value of the final assignment is what the call
    # evaluates to, as before.
    my ( $self, @new ) = @_;

    unless (@new) {
        return @{$self}{qw(_cookie _querynum)};
    }

    my ( $cookie, $querynum ) = @new;
    $self->{'_cookie'}   = $cookie;
    $self->{'_querynum'} = $querynum;
}
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=head2 _parse_response |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
Title : _parse_response |
579
|
|
|
|
|
|
|
Usage : $db->_parse_response($content) |
580
|
|
|
|
|
|
|
Function: parse out response for cookie, this is a trimmed-down version |
581
|
|
|
|
|
|
|
of _parse_response from Bio::DB::Query::GenBank |
582
|
|
|
|
|
|
|
Returns : empty |
583
|
|
|
|
|
|
|
Args : $content - the response content to parse |
584
|
|
|
|
|
|
|
Throws : 'unparseable output exception' |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
=cut |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
sub _parse_response {
    # Extract the WebEnv cookie and query key from an E-utilities XML
    # response and store them through cookie().
    #
    # Args    : $content - the response body as a string.
    # Returns : whatever the cookie() setter returns.
    # Throws  : when the response carries an <OutputMessage> element.
    #
    # Every pattern is anchored on its XML element name. Without those
    # anchors any non-empty response matched the warning and error
    # patterns, so the method always threw before reaching the cookie.
    my ( $self, $content ) = @_;

    if ( my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s ) {
        $self->warn("Warning(s) from GenBank: $warning\n");
    }
    if ( my ($error) = $content =~ /<OutputMessage>([^<]+)/ ) {
        $self->throw("Error from Genbank: $error");
    }

    my ($cookie)   = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
    my ($querykey) = $content =~ m!<QueryKey>(\d+)!;

    return $self->cookie( uri_unescape($cookie), $querykey );
}
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=head2 no_redirect |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
Title : no_redirect |
605
|
|
|
|
|
|
|
Usage : $db->no_redirect($content) |
606
|
|
|
|
|
|
|
Function: DEPRECATED - Used to indicate that Bio::DB::GenBank instance retrieves |
607
|
|
|
|
|
|
|
possible RefSeqs from EBI instead; default behavior is now to |
608
|
|
|
|
|
|
|
retrieve directly from NCBI |
609
|
|
|
|
|
|
|
Returns : None |
610
|
|
|
|
|
|
|
Args : None |
611
|
|
|
|
|
|
|
Throws : Method is deprecated in favor of positive flag method 'redirect_refseq' |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
=cut |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
sub no_redirect {
    # Deprecated: always throws, pointing the caller at redirect_refseq().
    # Any arguments after the invocant are ignored.
    my $self = shift;

    my $message
        = "Use of no_redirect() is deprecated. Bio::DB::GenBank default is to always\n"
        . "retrieve from NCBI. In order to redirect possible RefSeqs to EBI, set\n"
        . "redirect_refseq flag to 1";

    return $self->throw($message);
}
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
1; |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
__END__ |