File Coverage

Bio/DB/Query/GenBank.pm
Criterion Covered Total %
statement 30 110 27.2
branch 0 36 0.0
condition 0 6 0.0
subroutine 10 22 45.4
pod 3 9 33.3
total 43 183 23.5


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Query::GenBank.pm
3             #
4             # Please direct questions and support issues to <bioperl-l@bioperl.org>
5             #
6             # Cared for by Lincoln Stein
7             #
8             # Copyright Lincoln Stein
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13             #
14              
15             =head1 NAME
16              
17             Bio::DB::Query::GenBank - Build a GenBank Entrez Query
18              
19             =head1 SYNOPSIS
20              
21             use Bio::DB::Query::GenBank;
22             use Bio::DB::GenBank;
23              
24             my $query_string = 'Oryza[Organism] AND EST[Keyword]';
25             my $query = Bio::DB::Query::GenBank->new(-db => 'nucleotide',
26             -query => $query_string,
27             -mindate => '2001',
28             -maxdate => '2002');
29              
30             print $query->count,"\n";
31              
32             # get a Genbank database handle
33             my $gb = Bio::DB::GenBank->new();
34             my $stream = $gb->get_Stream_by_query($query);
35             while (my $seq = $stream->next_seq) {
36             # do something with the sequence object
37             }
38              
39             # initialize the list yourself
40             my $query = Bio::DB::Query::GenBank->new(-ids=>[195052,2981014,11127914]);
41              
42              
43             =head1 DESCRIPTION
44              
45             This class encapsulates NCBI Entrez queries. It can be used to store
46             a list of GI numbers, to translate an Entrez query expression into a
47             list of GI numbers, or to count the number of entries that would be
48             returned by a query. Once created, the query object can be passed to
49             a Bio::DB::GenBank object in order to retrieve the entries
50             corresponding to the query.
51              
52             =head1 FEEDBACK
53              
54             =head2 Mailing Lists
55              
56             User feedback is an integral part of the
57             evolution of this and other Bioperl modules. Send
58             your comments and suggestions preferably to one
59             of the Bioperl mailing lists. Your participation
60             is much appreciated.
61              
62             bioperl-l@bioperl.org - General discussion
63             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
64              
65             =head2 Support
66              
67             Please direct usage questions or support issues to the mailing list:
68              
69             I<bioperl-l@bioperl.org>
70              
71             rather than to the module maintainer directly. Many experienced and
72             responsive experts will be able to look at the problem and quickly
73             address it. Please include a thorough description of the problem
74             with code and data examples if at all possible.
75              
76             =head2 Reporting Bugs
77              
78             Report bugs to the Bioperl bug tracking system to help us keep track
79             of the bugs and their resolution. Bug reports can be submitted via the
80             web:
81              
82             https://github.com/bioperl/bioperl-live/issues
83              
84             =head1 AUTHOR - Lincoln Stein
85              
86             Email lstein@cshl.org
87              
88             =head1 APPENDIX
89              
90             The rest of the documentation details each of the
91             object methods. Internal methods are usually
92             preceded with a _
93              
94             =cut
95              
96             # Let the code begin...
97              
98             package Bio::DB::Query::GenBank;
99 3     3   9 use strict;
  3         3  
  3         69  
100 3     3   442 use URI::Escape 'uri_unescape';
  3         1043  
  3         133  
101 3     3   10 use Bio::DB::NCBIHelper;
  3         4  
  3         64  
102              
103              
104             #use constant EPOST => $Bio::DB::NCBIHelper::HOSTBASE . '/entrez/eutils/epost.fcgi';
105             #use constant ESEARCH => $Bio::DB::NCBIHelper::HOSTBASE . '/entrez/eutils/esearch.fcgi';
106             # referencing the 'our' variable $Bio::DB::NCBIHelper::HOSTBASE doesn't seem to work in
107             # a constant definition in perl 5.10.1 or 5.16.3
108 3     3   8 use constant EPOST => '/entrez/eutils/epost.fcgi';
  3         3  
  3         169  
109 3     3   10 use constant ESEARCH => '/entrez/eutils/esearch.fcgi';
  3         4  
  3         108  
110 3     3   9 use constant DEFAULT_DB => 'protein';
  3         4  
  3         104  
111 3     3   12 use constant MAXENTRY => 100;
  3         3  
  3         100  
112              
113 3     3   9 use vars qw(@ATTRIBUTES);
  3         3  
  3         98  
114              
115 3     3   9 use base qw(Bio::DB::Query::WebQuery);
  3         2  
  3         1003  
116              
117             BEGIN {
118 3     3   8 @ATTRIBUTES = qw(db reldate mindate maxdate datetype maxids);
119 3         5 for my $method (@ATTRIBUTES) {
120 18 0   0 0 2444 eval <
  0 0   0 0    
  0 0   0 0    
  0 0   0 0    
  0 0   0 0    
  0 0   0 0    
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
  0            
121             sub $method {
122             my \$self = shift;
123             my \$d = \$self->{'_$method'};
124             \$self->{'_$method'} = shift if \@_;
125             \$d;
126             }
127             END
128             }
129             }
130              
131             =head2 new
132              
133             Title : new
134             Usage : $db = Bio::DB::Query::GenBank->new(@args)
135             Function: create new query object
136             Returns : new query object
137             Args : -db database (see below for allowable values)
138             -query query string
139             -mindate minimum date to retrieve from (YYYY/MM/DD)
140             -maxdate maximum date to retrieve from (YYYY/MM/DD)
141             -reldate relative date to retrieve from (days)
142             -datetype date field to use ('edat' or 'mdat')
143             -ids array ref of gids (overrides query)
144             -maxids the maximum number of IDs you wish to collect
145             (defaults to 100)
146              
147             This method creates a new query object. Typically you will specify a
148             -db and a -query argument, possibly modified by -mindate, -maxdate, or
149             -reldate. -mindate and -maxdate specify minimum and maximum dates for
150             entries you are interested in retrieving, expressed in the form
151             YYYY/MM/DD. -reldate is used to fetch entries that are more recent
152             than the indicated number of days.
153              
154             If you provide an array reference of IDs in -ids, the query will be
155             ignored and the list of IDs will be used when the query is passed to a
156             Bio::DB::GenBank object's get_Stream_by_query() method. A variety of
157             IDs are automatically recognized, including GI numbers, Accession
158             numbers, Accession.version numbers and locus names.
159              
160             By default, the query will collect only the first 100 IDs and will
161             generate an exception if you call the ids() method and the query
162             returned more than that number. To increase this maximum, set -maxids
163             to a number larger than the number of IDs you expect to obtain. This
164             only affects the list of IDs you obtain when you call the ids()
165             method, and does not affect in any way the number of entries you
166             receive when you generate a SeqIO stream from the query.
167              
168             -db option values:
169              
170             The most commonly used databases are:
171              
172             protein
173             nucleotide
174             nuccore
175             nucgss
176             nucest
177             unigene
178              
179             An up to date list of database names supported by NCBI eUtils is
180             always available at:
181             https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?
182              
183             However, note that not all of these databases return datatypes that
184             are parsable by Bio::DB::GenBank
185              
186             =cut
187              
188             sub new {
189 0     0 1   my $class = shift;
190 0           my $self = $class->SUPER::new(@_);
191 0           my ($query,$db,$reldate,$mindate,$maxdate,$datetype,$ids,$maxids)
192             = $self->_rearrange([qw(QUERY DB RELDATE MINDATE MAXDATE DATETYPE IDS MAXIDS)],@_);
193 0   0       $self->db($db || DEFAULT_DB);
194 0 0         $reldate && $self->reldate($reldate);
195 0 0         $mindate && $self->mindate($mindate);
196 0 0         $maxdate && $self->maxdate($maxdate);
197 0 0         $maxids && $self->maxids($maxids);
198 0   0       $datetype ||= 'mdat';
199 0 0         $datetype && $self->datetype($datetype);
200 0           $self;
201             }
202              
203             =head2 cookie
204              
205             Title : cookie
206             Usage : ($cookie,$querynum) = $db->cookie
207             Function: return the NCBI query cookie
208             Returns : list of (cookie,querynum)
209             Args : none
210              
211             NOTE: this information is used by Bio::DB::GenBank in
212             conjunction with efetch.
213              
214             =cut
215              
216             sub cookie {
217 0     0 1   my $self = shift;
218 0 0         if (@_) {
219 0           $self->{'_cookie'} = shift;
220 0           $self->{'_querynum'} = shift;
221             }
222              
223             else {
224 0           $self->_run_query;
225 0           @{$self}{qw(_cookie _querynum)};
  0            
226             }
227             }
228              
229             =head2 _request_parameters
230              
231             Title : _request_parameters
232             Usage : ($method,$base,@params) = $db->_request_parameters
233             Function: return information needed to construct the request
234             Returns : list of method, url base and key=>value pairs
235             Args : none
236              
237             =cut
238              
239             sub _request_parameters {
240 0     0     my $self = shift;
241 0           my ($method,$base);
242 0 0         my @params = map {eval("\$self->$_") ? ($_ => eval("\$self->$_")) : () } @ATTRIBUTES;
  0            
243 0           push @params,('usehistory'=>'y','tool'=>'bioperl');
244 0           $method = 'get';
245            
246 0           $base = $Bio::DB::NCBIHelper::HOSTBASE.ESEARCH; # this seems to need to be dynamic
247 0           push @params,('term' => $self->query);
248             # Providing 'retmax' limits queries to 500 sequences ?? I don't think so LS
249 0   0       push @params,('retmax' => $self->maxids || MAXENTRY);
250              
251             # And actually, it seems that we need 'retstart' equal to 0 ?? I don't think so LS
252             # push @params, ('retstart' => 0);
253              
254 0           ($method,$base,@params);
255             }
256              
257              
258             =head2 count
259              
260             Title : count
261             Usage : $count = $db->count;
262             Function: return count of number of entries retrieved by query
263             Returns : integer
264             Args : none
265              
266             Returns the number of entries that are matched by the query.
267              
268             =cut
269              
270             sub count {
271 0     0 1   my $self = shift;
272 0 0         if (@_) {
273 0           my $d = $self->{'_count'};
274 0           $self->{'_count'} = shift;
275 0           return $d;
276             }
277             else {
278 0           $self->_run_query;
279 0           return $self->{'_count'};
280             }
281             }
282              
283             =head2 ids
284              
285             Title : ids
286             Usage : @ids = $db->ids([@ids])
287             Function: get/set matching ids
288             Returns : array of sequence ids
289             Args : (optional) array ref with new set of ids
290              
291             =cut
292              
293             =head2 query
294              
295             Title : query
296             Usage : $query = $db->query([$query])
297             Function: get/set query string
298             Returns : string
299             Args : (optional) new query string
300              
301             =cut
302              
303             =head2 _parse_response
304              
305             Title : _parse_response
306             Usage : $db->_parse_response($content)
307             Function: parse out response
308             Returns : empty
309             Args : response content to parse (string)
310             Throws : 'unparseable output exception'
311              
312             =cut
313              
314             sub _parse_response {
315 0     0     my $self = shift;
316 0           my $content = shift;
317 0 0         if (my ($warning) = $content =~ m!(.+)!s) {
318 0           $self->warn("Warning(s) from GenBank: $warning\n");
319             }
320 0 0         if (my ($error) = $content =~ /([^<]+)/) {
321 0           $self->throw("Error from Genbank: $error");
322             }
323              
324 0           my ($count) = $content =~ /(\d+)/;
325 0           my ($max) = $content =~ /(\d+)/;
326 0           my $truncated = $count > $max;
327 0           $self->count($count);
328 0 0         if (!$truncated) {
329 0           my @ids = $content =~ /(\d+)/g;
330 0           $self->ids(\@ids);
331             } else {
332 0           $self->debug("ids truncated at $max\n");
333             }
334 0           $self->_truncated($truncated);
335 0           my ($cookie) = $content =~ m!(\S+)!;
336 0           my ($querykey) = $content =~ m!(\d+)!;
337 0           $self->cookie(uri_unescape($cookie),$querykey);
338             }
339              
340             =head2 _generate_id_string
341              
342             Title : _generate_id_string
343             Usage : $string = $db->_generate_id_string
344             Function: joins IDs together in string (possibly implementation-dependent)
345             Returns : string of concatenated IDs
346             Args : array ref of ids (normally passed into the constructor)
347              
348             =cut
349              
350             sub _generate_id_string {
351 0     0     my ($self, $ids) = @_;
352             # this attempts to separate out accs (alphanumeric) from UIDs (numeric only)
353             # recent changes to esearch has wrought this upon us.. cjf 4/19/07
354             return sprintf('%s',join('|',map {
355 0 0         ($_ =~ m{^\d+$}) ? $_.'[UID]' : $_.'[PACC]'
  0            
356             } @$ids));
357             }
358              
359             1;