File Coverage

Bio/DB/NCBIHelper.pm
Criterion Covered Total %
statement 49 141 34.7
branch 9 90 10.0
condition 1 60 1.6
subroutine 14 27 51.8
pod 16 16 100.0
total 89 334 26.6


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::NCBIHelper
3             #
4             # Please direct questions and support issues to <bioperl-l@bioperl.org>
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13             #
14             # Interfaces with new WebDBSeqI interface
15              
16             =head1 NAME
17              
18             Bio::DB::NCBIHelper - A collection of routines useful for queries to
19             NCBI databases.
20              
21             =head1 SYNOPSIS
22              
23             # Do not use this module directly.
24              
25             # get a Bio::DB::NCBIHelper object somehow
26             my $seqio = $db->get_Stream_by_acc(['J00522']);
27             foreach my $seq ( $seqio->next_seq ) {
28             # process seq
29             }
30              
31             =head1 DESCRIPTION
32              
33             Provides a single place to setup some common methods for querying NCBI
34             web databases. This module just centralizes the methods for
35             constructing a URL for querying NCBI GenBank and NCBI GenPept and the
36             common HTML stripping done in L</postprocess_data>().
37              
38             The base NCBI query URL used is:
39             https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi
40              
41             =head1 FEEDBACK
42              
43             =head2 Mailing Lists
44              
45             User feedback is an integral part of the
46             evolution of this and other Bioperl modules. Send
47             your comments and suggestions preferably to one
48             of the Bioperl mailing lists. Your participation
49             is much appreciated.
50              
51             bioperl-l@bioperl.org - General discussion
52             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53              
54             =head2 Support
55              
56             Please direct usage questions or support issues to the mailing list:
57              
58             I<bioperl-l@bioperl.org>
59              
60             rather than to the module maintainer directly. Many experienced and
61             responsive experts will be able to look at the problem and quickly
62             address it. Please include a thorough description of the problem
63             with code and data examples if at all possible.
64              
65             =head2 Reporting Bugs
66              
67             Report bugs to the Bioperl bug tracking system to
68             help us keep track of the bugs and their resolution.
69             Bug reports can be submitted via the web.
70              
71             https://github.com/bioperl/bioperl-live/issues
72              
73             =head1 AUTHOR - Jason Stajich
74              
75             Email jason@bioperl.org
76              
77             =head1 APPENDIX
78              
79             The rest of the documentation details each of the
80             object methods. Internal methods are usually
81             preceded with a _
82              
83             =cut
84              
85             # Let the code begin...
86              
87             package Bio::DB::NCBIHelper;
88 3     3   13 use strict;
  3         3  
  3         67  
89              
90 3     3   897 use Bio::DB::Query::GenBank;
  3         4  
  3         70  
91 3     3   13 use HTTP::Request::Common;
  3         3  
  3         131  
92 3     3   11 use URI;
  3         3  
  3         39  
93 3     3   10 use Bio::Root::IO;
  3         4  
  3         46  
94 3     3   312 use Bio::DB::RefSeq;
  3         4  
  3         52  
95 3     3   10 use URI::Escape qw(uri_unescape);
  3         3  
  3         114  
96              
97 3     3   11 use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
  3         3  
  3         4162  
98              
99             our $HOSTBASE = 'https://eutils.ncbi.nlm.nih.gov';
100             our $MAX_ENTRIES = 19000;
101             our $REQUEST_DELAY = 3;
102             our %CGILOCATION = (
103             'batch' => [ 'post' => '/entrez/eutils/epost.fcgi' ],
104             'query' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
105             'single' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
106             'version' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
107             'gi' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
108             'webenv' => [ 'get' => '/entrez/eutils/efetch.fcgi' ]
109             );
110             our %FORMATMAP = (
111             'gb' => 'genbank',
112             'gp' => 'genbank',
113             'fasta' => 'fasta',
114             'asn.1' => 'entrezgene',
115             'gbwithparts' => 'genbank',
116             );
117             our $DEFAULTFORMAT = 'gb';
118              
119             =head2 new
120              
121             Title : new
122             Usage :
123             Function: the new way to make modules a little more lightweight
124             Returns :
125             Args :
126              
127             =cut
128              
129             sub new {
130 1     1 1 18 my ( $class, @args ) = @_;
131 1         9 my $self = $class->SUPER::new(@args);
132 1         4 my ($seq_start, $seq_stop, $no_redirect,
133             $redirect, $complexity, $strand
134             )
135             = $self->_rearrange(
136             [ qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND) ],
137             @args
138             );
139 1 50       5 $seq_start && $self->seq_start($seq_start);
140 1 50       2 $seq_stop && $self->seq_stop($seq_stop);
141 1 50       3 $no_redirect && $self->no_redirect($no_redirect);
142 1 50       7 $redirect && $self->redirect_refseq($redirect);
143 1 50       2 $strand && $self->strand($strand);
144              
145             # adjust statement to accept zero value
146 1 0 0     3 defined $complexity
      33        
147             && ( $complexity >= 0 && $complexity <= 4 )
148             && $self->complexity($complexity);
149 1         7 return $self;
150             }
151              
152              
153             =head2 get_params
154              
155             Title : get_params
156             Usage : my %params = $self->get_params($mode)
157             Function: returns key,value pairs to be passed to NCBI database
158             for either 'batch' or 'single' sequence retrieval method
159             Returns : a key,value pair hash
160             Args : 'single' or 'batch' mode for retrieval
161              
162             =cut
163              
164             sub get_params {
165 0     0 1 0 my ($self, $mode) = @_;
166 0         0 $self->throw("subclass did not implement get_params");
167             }
168              
169             =head2 default_format
170              
171             Title : default_format
172             Usage : my $format = $self->default_format
173             Function: returns default sequence format for this module
174             Returns : string
175             Args : none
176              
177             =cut
178              
179             sub default_format {
180 0     0 1 0 return $DEFAULTFORMAT;
181             }
182              
183             =head2 get_request
184              
185             Title : get_request
186             Usage : my $url = $self->get_request
187             Function: builds the HTTP request used to query NCBI
188             Returns : HTTP::Request object
189             Args : %qualifiers = a hash of qualifiers (ids, format, etc)
190              
191             =cut
192              
193             sub get_request {
194 0     0 1 0 my ( $self, @qualifiers ) = @_;
195 0         0 my ( $mode, $uids, $format, $query, $seq_start, $seq_stop, $strand,
196             $complexity )
197             = $self->_rearrange(
198             [qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
199             @qualifiers );
200 0         0 $mode = lc $mode;
201 0 0       0 ($format) = $self->request_format() unless ( defined $format );
202 0 0 0     0 if ( !defined $mode || $mode eq '' ) { $mode = 'single'; }
  0         0  
203 0         0 my %params = $self->get_params($mode);
204 0 0       0 if ( !%params ) {
205 0         0 $self->throw(
206             "must specify a valid retrieval mode 'single' or 'batch' not '$mode'"
207             );
208             }
209 0         0 my $url = URI->new( $HOSTBASE . $CGILOCATION{$mode}[1] );
210 0 0 0     0 unless ( $mode eq 'webenv' || defined $uids || defined $query ) {
      0        
211 0         0 $self->throw("Must specify a query or list of uids to fetch");
212             }
213 0 0 0     0 if ( $query && $query->can('cookie') ) {
    0 0        
    0          
    0          
214 0         0 @params{ 'WebEnv', 'query_key' } = $query->cookie;
215 0         0 $params{'db'} = $query->db;
216             }
217             elsif ($query) {
218 0         0 $params{'id'} = join ',', $query->ids;
219             }
220              
221             # for batch retrieval, non-query style
222             elsif ( $mode eq 'webenv' && $self->can('cookie') ) {
223 0         0 @params{ 'WebEnv', 'query_key' } = $self->cookie;
224             }
225             elsif ($uids) {
226 0 0       0 if ( ref($uids) =~ /array/i ) {
227 0         0 $uids = join( ",", @$uids );
228             }
229 0         0 $params{'id'} = $uids;
230             }
231 0 0       0 $seq_start && ( $params{'seq_start'} = $seq_start );
232 0 0       0 $seq_stop && ( $params{'seq_stop'} = $seq_stop );
233 0 0       0 $strand && ( $params{'strand'} = $strand );
234 0 0 0     0 if ( defined $complexity && ( $seq_start || $seq_stop || $strand ) ) {
      0        
235 0 0 0     0 $self->warn(
      0        
236             "Complexity set to $complexity; seq_start and seq_stop may not work!"
237             ) if ( $complexity != 1 && ( $seq_start || $seq_stop ) );
238 0 0 0     0 $self->warn(
      0        
239             "Complexity set to 0; expect strange results with strand set to 2"
240             ) if ( $complexity == 0 && $strand == 2 && $format eq 'fasta' );
241             }
242 0 0       0 defined $complexity && ( $params{'complexity'} = $complexity );
243 0 0       0 $params{'rettype'} = $format unless $mode eq 'batch';
244              
245             # for now, 'post' is batch retrieval
246 0 0       0 if ( $CGILOCATION{$mode}[0] eq 'post' ) {
247 0         0 my $response = $self->ua->request( POST $url, [%params] );
248 0 0       0 $response->proxy_authorization_basic( $self->authentication )
249             if ( $self->authentication );
250 0         0 $self->_parse_response( $response->content );
251 0         0 my ( $cookie, $querykey ) = $self->cookie;
252 0         0 my %qualifiers = (
253             '-mode' => 'webenv',
254             '-seq_start' => $seq_start,
255             '-seq_stop' => $seq_stop,
256             '-strand' => $strand,
257             '-complexity' => $complexity,
258             '-format' => $format
259             );
260 0         0 return $self->get_request(%qualifiers);
261             }
262             else {
263 0         0 $url->query_form(%params);
264 0         0 return GET $url;
265             }
266             }
267              
268              
269             =head2 get_Stream_by_batch
270              
271             Title : get_Stream_by_batch
272             Usage : $seq = $db->get_Stream_by_batch($ref);
273             Function: Retrieves Seq objects from Entrez 'en masse', rather than one
274             at a time. For large numbers of sequences, this is far superior
275             to get_Stream_by_id or get_Stream_by_acc.
276             Example :
277             Returns : a Bio::SeqIO stream object
278             Args : $ref : either an array reference, a filename, or a filehandle
279             from which to get the list of unique ids/accession numbers.
280              
281             NOTE: deprecated API. Use get_Stream_by_id() instead.
282              
283             =cut
284              
285             *get_Stream_by_batch = sub {
286 0     0   0 my $self = shift;
287 0         0 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
288 0         0 $self->get_Stream_by_id(@_)
289             };
290              
291             =head2 get_Stream_by_query
292              
293             Title : get_Stream_by_query
294             Usage : $seq = $db->get_Stream_by_query($query);
295             Function: Retrieves Seq objects from Entrez 'en masse', rather than one
296             at a time. For large numbers of sequences, this is far superior
297             to get_Stream_by_id and get_Stream_by_acc.
298             Example :
299             Returns : a Bio::SeqIO stream object
300             Args : An Entrez query string or a Bio::DB::Query::GenBank object.
301             It is suggested that you create a Bio::DB::Query::GenBank object and get
302             the entry count before you fetch a potentially large stream.
303              
304             =cut
305              
306             sub get_Stream_by_query {
307 0     0 1 0 my ($self, $query) = @_;
308 0 0 0     0 unless (ref $query && $query->can('query')) {
309 0         0 $query = Bio::DB::Query::GenBank->new($query);
310             }
311 0         0 return $self->get_seq_stream('-query' => $query, '-mode'=>'query');
312             }
313              
314             =head2 postprocess_data
315              
316             Title : postprocess_data
317             Usage : $self->postprocess_data ( 'type' => 'string',
318             'location' => \$datastr );
319             Function: Process downloaded data before loading into a Bio::SeqIO. This
320             works for Genbank and Genpept, other classes should override
321             it with their own method.
322             Returns : void
323             Args : hash with two keys:
324              
325             'type' can be 'string' or 'file'
326             'location' either file location or string reference containing data
327              
328             =cut
329              
330       0 1   sub postprocess_data {
331             # retain this in case postprocessing is needed at a future date
332             }
333              
334              
335             =head2 request_format
336              
337             Title : request_format
338             Usage : my ($req_format, $ioformat) = $self->request_format;
339             $self->request_format("genbank");
340             $self->request_format("fasta");
341             Function: Get/Set sequence format retrieval. The get-form will normally not
342             be used outside of this and derived modules.
343             Returns : Array of two strings, the first representing the format for
344             retrieval, and the second specifying the corresponding SeqIO format.
345             Args : $format = sequence format
346              
347             =cut
348              
349             sub request_format {
350 1     1 1 2 my ( $self, $value ) = @_;
351 1 50       2 if ( defined $value ) {
352 1         1 $value = lc $value;
353 1 50       3 if ( defined $FORMATMAP{$value} ) {
354 1         3 $self->{'_format'} = [ $value, $FORMATMAP{$value} ];
355             }
356             else {
357             # Try to fall back to a default. Alternatively, we could throw
358             # an exception
359 0         0 $self->{'_format'} = [ $value, $value ];
360             }
361             }
362 1         1 return @{ $self->{'_format'} };
  1         2  
363             }
364              
365              
366             =head2 redirect_refseq
367              
368             Title : redirect_refseq
369             Usage : $db->redirect_refseq(1)
370             Function: simple getter/setter which redirects RefSeqs to use Bio::DB::RefSeq
371             Returns : Boolean value
372             Args : Boolean value (optional)
373             Throws : 'unparseable output exception'
374             Note : This replaces 'no_redirect' as a more straightforward flag to
375             redirect possible RefSeqs to use Bio::DB::RefSeq (EBI interface)
376             instead of retrieving the NCBI records
377              
378             =cut
379              
380             sub redirect_refseq {
381 1     1 1 2 my $self = shift;
382 1 50       3 return $self->{'_redirect_refseq'} = shift if @_;
383 0         0 return $self->{'_redirect_refseq'};
384             }
385              
386             =head2 complexity
387              
388             Title : complexity
389             Usage : $db->complexity(3)
390             Function: get/set complexity value
391             Returns : value from 0-4 indicating level of complexity
392             Args : value from 0-4 (optional); if unset server assumes 1
393             Throws : if arg is not an integer or falls outside of noted range above
394             Note : From efetch docs, the complexity regulates the display:
395              
396             0 - get the whole blob
397             1 - get the bioseq for gi of interest (default in Entrez)
398             2 - get the minimal bioseq-set containing the gi of interest
399             3 - get the minimal nuc-prot containing the gi of interest
400             4 - get the minimal pub-set containing the gi of interest
401              
402             =cut
403              
404             sub complexity {
405 0     0 1 0 my ( $self, $comp ) = @_;
406 0 0       0 if ( defined $comp ) {
407 0 0 0     0 $self->throw("Complexity value must be integer between 0 and 4")
      0        
408             if $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
409 0         0 $self->{'_complexity'} = $comp;
410             }
411 0         0 return $self->{'_complexity'};
412             }
413              
414             =head2 strand
415              
416             Title : strand
417             Usage : $db->strand(1)
418             Function: get/set strand value
419             Returns : strand value if set
420             Args : value of 1 (plus) or 2 (minus); if unset server assumes 1
421             Throws : if arg is not an integer or is not 1 or 2
422             Note : This differs from BioPerl's use of strand: 1 = plus, -1 = minus 0 = not relevant.
423             We should probably add in some functionality to convert over in the future.
424              
425             =cut
426              
427             sub strand {
428 0     0 1 0 my ($self, $str) = @_;
429 0 0       0 if ($str) {
430 0 0 0     0 $self->throw("strand() must be integer value of 1 (plus strand) or 2 (minus strand) if set") if
      0        
431             $str !~ /^\d+$/ || $str < 1 || $str > 2;
432 0         0 $self->{'_strand'} = $str;
433             }
434 0         0 return $self->{'_strand'};
435             }
436              
437             =head2 seq_start
438              
439             Title : seq_start
440             Usage : $db->seq_start(123)
441             Function: get/set sequence start location
442             Returns : sequence start value if set
443             Args : integer; if unset server assumes 1
444             Throws : if arg is not an integer
445              
446             =cut
447              
448             sub seq_start {
449 0     0 1 0 my ($self, $start) = @_;
450 0 0       0 if ($start) {
451 0 0       0 $self->throw("seq_start() must be integer value if set") if
452             $start !~ /^\d+$/;
453 0         0 $self->{'_seq_start'} = $start;
454             }
455 0         0 return $self->{'_seq_start'};
456             }
457              
458             =head2 seq_stop
459              
460             Title : seq_stop
461             Usage : $db->seq_stop(456)
462             Function: get/set sequence stop (end) location
463             Returns : sequence stop (end) value if set
464             Args : integer; if unset server assumes 1
465             Throws : if arg is not an integer
466              
467             =cut
468              
469             sub seq_stop {
470 0     0 1 0 my ($self, $stop) = @_;
471 0 0       0 if ($stop) {
472 0 0       0 $self->throw("seq_stop() must be integer if set") if
473             $stop !~ /^\d+$/;
474 0         0 $self->{'_seq_stop'} = $stop;
475             }
476 0         0 return $self->{'_seq_stop'};
477             }
478              
479             =head2 Bio::DB::WebDBSeqI methods
480              
481             Overriding WebDBSeqI method to help newbies to retrieve sequences
482              
483             =head2 get_Stream_by_acc
484              
485             Title : get_Stream_by_acc
486             Usage : $seq = $db->get_Stream_by_acc([$acc1, $acc2]);
487             Function: gets a series of Seq objects by accession numbers
488             Returns : a Bio::SeqIO stream object
489             Args : $ref : a reference to an array of accession numbers for
490             the desired sequence entries
491             Note : For GenBank, this just calls the same code for get_Stream_by_id()
492              
493             =cut
494              
495             sub get_Stream_by_acc {
496 1     1 1 2 my ( $self, $ids ) = @_;
497 1         4 my $newdb = $self->_check_id($ids);
498 0 0 0     0 if ( defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq') ) {
      0        
499 0         0 return $newdb->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
500             }
501             else {
502 0         0 return $self->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
503             }
504             }
505              
506             =head2 _check_id
507              
508             Title : _check_id
509             Usage :
510             Function:
511             Returns : a Bio::DB::RefSeq reference or throws
512             Args : $id(s), $string
513              
514             =cut
515              
516             sub _check_id {
517 1     1   2 my ( $self, $ids ) = @_;
518              
519             # NT contigs can not be retrieved
520 1 50       9 $self->throw("NT_ contigs are whole chromosome files which are not part of regular"
521             . "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
522             if $ids =~ /NT_/;
523              
524             # Asking for a RefSeq from EMBL/GenBank
525 0 0       0 if ( $self->redirect_refseq ) {
526 0 0       0 if ( $ids =~ /N._/ ) {
527 0 0       0 $self->warn(
528             "[$ids] is not a normal sequence database but a RefSeq entry."
529             . " Redirecting the request.\n" )
530             if $self->verbose >= 0;
531 0         0 return Bio::DB::RefSeq->new();
532             }
533             }
534             }
535              
536              
537             =head2 delay_policy
538              
539             Title : delay_policy
540             Usage : $secs = $self->delay_policy
541             Function: NCBI requests a delay of 3 seconds between requests. This method
542             implements that policy.
543             Returns : number of seconds to delay
544             Args : none
545              
546             =cut
547              
548             sub delay_policy {
549 1     1 1 1 my $self = shift;
550 1         2 return $REQUEST_DELAY;
551             }
552              
553             =head2 cookie
554              
555             Title : cookie
556             Usage : ($cookie,$querynum) = $db->cookie
557             Function: return the NCBI query cookie, this information is used by
558             Bio::DB::GenBank in conjunction with efetch, ripped from
559             Bio::DB::Query::GenBank
560             Returns : list of (cookie,querynum)
561             Args : none
562              
563             =cut
564              
565             sub cookie {
566 0     0 1   my $self = shift;
567 0 0         if (@_) {
568 0           $self->{'_cookie'} = shift;
569 0           $self->{'_querynum'} = shift;
570             }
571             else {
572 0           return @{$self}{qw(_cookie _querynum)};
  0            
573             }
574             }
575              
576             =head2 _parse_response
577              
578             Title : _parse_response
579             Usage : $db->_parse_response($content)
580             Function: parse out response for cookie, this is a trimmed-down version
581             of _parse_response from Bio::DB::Query::GenBank
582             Returns : empty
583             Args : $content - response content string to parse
584             Throws : 'unparseable output exception'
585              
586             =cut
587              
588             sub _parse_response {
589 0     0     my $self = shift;
590 0           my $content = shift;
591 0 0         if ( my ($warning) = $content =~ m!(.+)!s ) {
592 0           $self->warn("Warning(s) from GenBank: $warning\n");
593             }
594 0 0         if ( my ($error) = $content =~ /([^<]+)/ ) {
595 0           $self->throw("Error from Genbank: $error");
596             }
597 0           my ($cookie) = $content =~ m!(\S+)!;
598 0           my ($querykey) = $content =~ m!(\d+)!;
599 0           $self->cookie( uri_unescape($cookie), $querykey );
600             }
601              
602             =head2 no_redirect
603              
604             Title : no_redirect
605             Usage : $db->no_redirect($content)
606             Function: DEPRECATED - Used to indicate that Bio::DB::GenBank instance retrieves
607             possible RefSeqs from EBI instead; default behavior is now to
608             retrieve directly from NCBI
609             Returns : None
610             Args : None
611             Throws : Method is deprecated in favor of positive flag method 'redirect_refseq'
612              
613             =cut
614              
615             sub no_redirect {
616             shift->throw(
617 0     0 1   "Use of no_redirect() is deprecated. Bio::DB::GenBank default is to always\n".
618             "retrieve from NCBI. In order to redirect possible RefSeqs to EBI, set\n".
619             "redirect_refseq flag to 1");
620             }
621              
622             1;
623              
624             __END__