File Coverage

Bio/DB/Taxonomy.pm
Criterion Covered Total %
statement 61 84 72.6
branch 13 22 59.0
condition 15 24 62.5
subroutine 9 15 60.0
pod 8 8 100.0
total 106 153 69.2


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::DB::Taxonomy
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::DB::Taxonomy - Access to a taxonomy database
17              
18             =head1 SYNOPSIS
19              
20             use Bio::DB::Taxonomy;
21             my $db = Bio::DB::Taxonomy->new(-source => 'entrez');
22             # use NCBI Entrez over HTTP
23             my $taxonid = $db->get_taxonid('Homo sapiens');
24              
25             # get a taxon
26             my $taxon = $db->get_taxon(-taxonid => $taxonid);
27              
28             =head1 DESCRIPTION
29              
30             This is a front end module for access to a taxonomy database.
31              
32             =head1 FEEDBACK
33              
34             =head2 Mailing Lists
35              
36             User feedback is an integral part of the evolution of this and other
37             Bioperl modules. Send your comments and suggestions preferably to
38             the Bioperl mailing list. Your participation is much appreciated.
39              
40             bioperl-l@bioperl.org - General discussion
41             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42              
43             =head2 Support
44              
45             Please direct usage questions or support issues to the mailing list:
46              
47             I
48              
49             rather than to the module maintainer directly. Many experienced and
50             reponsive experts will be able look at the problem and quickly
51             address it. Please include a thorough description of the problem
52             with code and data examples if at all possible.
53              
54             =head2 Reporting Bugs
55              
56             Report bugs to the Bioperl bug tracking system to help us keep track
57             of the bugs and their resolution. Bug reports can be submitted via
58             the web:
59              
60             https://github.com/bioperl/bioperl-live/issues
61              
62             =head1 AUTHOR - Jason Stajich
63              
64             Email jason-at-bioperl.org
65              
66             =head1 CONTRIBUTORS
67              
68             Sendu Bala: bix@sendu.me.uk
69              
70             =head1 APPENDIX
71              
72             The rest of the documentation details each of the object methods.
73             Internal methods are usually preceded with a _
74              
75             =cut
76              
77             # Let the code begin...
78              
79             package Bio::DB::Taxonomy;
80 44     44   989 use vars qw($DefaultSource $TAXON_IIDS);
  44         47  
  44         2824  
81 44     44   236 use strict;
  44         49  
  44         690  
82 44     44   13437 use Bio::Tree::Tree;
  44         78  
  44         1180  
83              
84 44     44   213 use base qw(Bio::Root::Root);
  44         52  
  44         32948  
85              
86             $DefaultSource = 'entrez';
87             $TAXON_IIDS = {};
88              
89              
90             =head2 new
91              
92             Title : new
93             Usage : my $obj = Bio::DB::Taxonomy->new(-source => 'entrez');
94             Function: Builds a new Bio::DB::Taxonomy object.
95             Returns : an instance of Bio::DB::Taxonomy
96             Args : -source => which database source 'entrez' (NCBI taxonomy online),
97             'flatfile' (local NCBI taxonomy), 'greengenes' (local
98             GreenGenes taxonomy), 'silva' (local Silva taxonomy), or
99             'list' (Do-It-Yourself taxonomy)
100              
101             =cut
102              
103             sub new {
104 510     510 1 1402 my($class,@args) = @_;
105              
106 510 100       2001 if( $class =~ /Bio::DB::Taxonomy::(\S+)/ ) {
107 256         843 my ($self) = $class->SUPER::new(@args);
108 256         833 $self->_initialize(@args);
109 256         582 return $self;
110             } else {
111 254         907 my %param = @args;
112 254         809 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
  506         1233  
113 254   33     879 my $source = $param{'-source'} || $DefaultSource;
114              
115 254         406 $source = "\L$source"; # normalize capitalization to lower case
116              
117             # normalize capitalization
118 254 50       800 return unless( $class->_load_tax_module($source) );
119 254         1785 return "Bio::DB::Taxonomy::$source"->new(@args);
120             }
121             }
122              
123              
124             # empty for now
125       256     sub _initialize { }
126              
127              
128             =head2 get_num_taxa
129              
130             Title : get_num_taxa
131             Usage : my $num = $db->get_num_taxa();
132             Function: Get the number of taxa stored in the database.
133             Returns : A number
134             Args : None
135              
136             =cut
137              
138             sub get_num_taxa {
139 0     0 1 0 shift->throw_not_implemented();
140             }
141              
142              
143             =head2 get_taxon
144              
145             Title : get_taxon
146             Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid);
147             Function: Get a Bio::Taxon object from the database.
148             Returns : Bio::Taxon object
149             Args : just a single value which is the database id, OR named args:
150             -taxonid => taxonomy id (to query by taxonid)
151             OR
152             -name => string (to query by a taxonomy name: common name,
153             scientific name, etc)
154              
155             =cut
156              
157             sub get_taxon {
158 0     0 1 0 shift->throw_not_implemented();
159             }
160              
161             *get_Taxonomy_Node = \&get_taxon;
162              
163              
164             =head2 get_taxonids
165              
166             Title : get_taxonids
167             Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
168             Function: Searches for a taxonid (typically ncbi_taxon_id) based on a query
169             string. Note that multiple taxonids can match to the same supplied
170             name.
171             Returns : array of integer ids in list context, one of these in scalar context
172             Args : string representing the taxon's name
173              
174             =cut
175              
176             sub get_taxonids {
177 0     0 1 0 shift->throw_not_implemented();
178             }
179              
180             *get_taxonid = \&get_taxonids;
181             *get_taxaid = \&get_taxonids;
182              
183              
184             =head2 get_tree
185              
186             Title : get_tree
187             Usage : my $tree = $db->get_tree(@species_names);
188             Function: Generate a tree comprised of the full lineages of all the supplied
189             species names. The nodes for the requested species are given
190             name('supplied') values corresponding to the supplied name, such that
191             they can be identified if the real species name in the database
192             (stored under node_name()) is different. The nodes are also given an
193             arbitrary branch length of 1.
194             Returns : Bio::Tree::Tree
195             Args : A list of species names (strings) to include in the tree.
196              
197             =cut
198              
199             sub get_tree {
200 0     0 1 0 my ($self, @species_names) = @_;
201            
202             # the full lineages of the species are merged into a single tree
203 0         0 my $tree;
204 0         0 for my $name (@species_names) {
205 0         0 my @ids = $self->get_taxonids($name);
206 0 0       0 if (not scalar @ids) {
207 0         0 $self->throw("Could not find species $name in the taxonomy");
208             }
209 0         0 for my $id (@ids) {
210 0         0 my $node = $self->get_taxon(-taxonid => $id);
211 0         0 $node->name('supplied', $name);
212 0 0       0 if ($tree) {
213 0         0 $tree->merge_lineage($node);
214             } else {
215 0         0 $tree = Bio::Tree::Tree->new(-verbose => $self->verbose, -node => $node);
216             }
217             }
218             }
219              
220             # add arbitrary branch length
221 0         0 for my $node ($tree->get_nodes) {
222 0         0 $node->branch_length(1);
223             }
224            
225 0         0 return $tree;
226             }
227              
228              
229             =head2 ancestor
230              
231             Title : ancestor
232             Usage : my $ancestor_taxon = $db->ancestor($taxon);
233             Function: Retrieve the full ancestor taxon of a supplied Taxon from the
234             database.
235             Returns : Bio::Taxon
236             Args : Bio::Taxon (that was retrieved from this database)
237              
238             =cut
239              
240             sub ancestor {
241 0     0 1 0 shift->throw_not_implemented();
242             }
243              
244              
245             =head2 each_Descendent
246              
247             Title : each_Descendent
248             Usage : my @taxa = $db->each_Descendent($taxon);
249             Function: Get all the descendents of the supplied Taxon (but not their
250             descendents, ie. not a recursive fetchall).
251             Returns : Array of Bio::Taxon objects
252             Args : Bio::Taxon (that was retrieved from this database)
253              
254             =cut
255              
256             sub each_Descendent {
257 0     0 1 0 shift->throw_not_implemented();
258             }
259              
260              
261             =head2 get_all_Descendents
262              
263             Title : get_all_Descendents
264             Usage : my @taxa = $db->get_all_Descendents($taxon);
265             Function: Like each_Descendent(), but do a recursive fetchall
266             Returns : Array of Bio::Taxon objects
267             Args : Bio::Taxon (that was retrieved from this database)
268              
269             =cut
270              
271             sub get_all_Descendents {
272 14     14 1 13 my ($self, $taxon) = @_;
273 14         13 my @taxa;
274 14         18 foreach my $desc_taxon ($self->each_Descendent($taxon)) {
275 12         27 push @taxa, ($desc_taxon, $self->get_all_Descendents($desc_taxon));
276             }
277 14         25 return @taxa;
278             }
279              
280              
281             =head2 _load_tax_module
282              
283             Title : _load_tax_module
284             Usage : *INTERNAL Bio::DB::Taxonomy stuff*
285             Function: Loads up (like use) a module at run time on demand
286              
287             =cut
288              
289             sub _load_tax_module {
290 254     254   377 my ($self, $source) = @_;
291 254         551 my $module = "Bio::DB::Taxonomy::" . $source;
292 254         277 my $ok;
293              
294 254         368 eval { $ok = $self->_load_module($module) };
  254         926  
295 254 50       714 if ( $@ ) {
296 0         0 print STDERR $@;
297 0         0 print STDERR <
298             $self: $source cannot be found
299             Exception $@
300             For more information about the Bio::DB::Taxonomy system please see
301             the Bio::DB::Taxonomy docs. This includes ways of checking for
302             formats at compile time, not run time.
303             END
304             ;
305             }
306 254         717 return $ok;
307             }
308              
309              
310             =head2 _handle_internal_id
311              
312             Title : _handle_internal_id
313             Usage : *INTERNAL Bio::DB::Taxonomy stuff*
314             Function: Add an internal ID to a taxon object, ensuring that the taxon gets
315             the same internal ID, regardless of which database it is retrieved
316             from.
317             Returns : The assigned internal ID
318             Args : * A Bio::Taxon
319             * An optional boolean to decide whether or not to try and do the job
320             using scientific name & rank in addition to taxon ID. This is
321             useful if your IDs are not comparable to that of other databases,
322             e.g. if they are arbitrary, as in the case of Bio::DB::Taxonomy::list.
323             CAVEAT: will handle ambiguous names within a database fine, but not
324             across multiple databases.
325              
326             =cut
327              
328             sub _handle_internal_id {
329 6873     6873   6778 my ($self, $taxon, $try_name) = @_;
330 6873 50 33     30980 $self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
331              
332 6873   50     10131 my $taxid = $taxon->id || return;
333 6873   100     10905 my $name = $taxon->scientific_name || '';
334 6873   100     11474 my $rank = $taxon->rank || 'no rank';
335 6873 50       14579 my $dbh = $try_name ? $taxon->db_handle : 'any';
336              
337 6873         34056 my $iid = $TAXON_IIDS->{taxids}->{$dbh}->{$taxid};
338 6873 100 66     23942 if ( (not defined $iid) && $try_name && $name && exists $TAXON_IIDS->{names}->{$name}) {
      100        
      66        
339             # Search for a suitable IID based on species name and ranks
340 1531         1940 my %test_ranks = map {$_ => undef} ($rank, 'no rank');
  3062         6364  
341 1531         4691 SEARCH: while (my ($test_rank, undef) = each %test_ranks) {
342             # Search at the specified rank first, then with 'no rank'
343 1531         1394 while ( my ($test_iid, $test_info) = each %{$TAXON_IIDS->{names}->{$name}->{$rank}} ) {
  2056         9128  
344 1553         4880 while (my ($test_db, $test_taxid) = each %$test_info) {
345 1028 50 33     2798 if ( ($test_db eq $dbh) && not($test_taxid eq $taxid) ) {
346             # Taxa are different (same database, different taxid)
347 0         0 next;
348             }
349             # IID is acceptable since taxa are from different databases,
350             # or from the same database but have the same taxid
351 1028         1022 $iid = $test_iid;
352 1028         2111 $TAXON_IIDS->{taxids}->{$dbh}->{$taxid} = $iid;
353 1028         2801 last SEARCH;
354             }
355             }
356             }
357             }
358              
359 6873 100       9449 if (defined $iid) {
360             # Assign Bio::DB::Taxonomy IID with risky Bio::Tree::Node internal method
361 5523         9810 $taxon->_creation_id($iid);
362             } else {
363             # Register new IID in Bio::DB::Taxonomy
364 1350         2474 $iid = $taxon->internal_id;
365 1350         2756 $TAXON_IIDS->{taxids}->{$dbh}->{$taxid} = $iid;
366 1350 100       2099 if ($name) {
367 1348         4661 $TAXON_IIDS->{names}->{$name}->{$rank}->{$iid}->{$taxon->db_handle} = $taxid
368             }
369             }
370              
371 6873         11213 return $iid;
372              
373             }
374              
375              
376             1;