File Coverage

Bio/DB/Taxonomy/list.pm

Criterion	Covered	Total	%
statement	107	133	80.4
branch	42	64	65.6
condition	18	38	47.3
subroutine	9	10	90.0
pod	7	7	100.0
total	183	252	72.6

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Taxonomy::list
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Sendu Bala
7							#
8							# Copyright Sendu Bala
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14
15							=head1 NAME
16
17							Bio::DB::Taxonomy::list - An implementation of Bio::DB::Taxonomy
18							that accepts lists of words to build a database
19
20							=head1 SYNOPSIS
21
22							use Bio::DB::Taxonomy;
23
24							my $db = Bio::DB::Taxonomy->new( -source => 'list' );
25
26							my @ranks = ('superkingdom', 'class', 'genus', 'species');
27							my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
28							$db->add_lineage(-names => \@names, -ranks => \@ranks);
29
30							@names = ('Eukaryota', 'Mammalia', 'Mus', 'Mus musculus');
31							$db->add_lineage(-names => \@names, -ranks => \@ranks);
32
33							=head1 DESCRIPTION
34
35							This is an implementation which uses supplied lists of words to create a
36							database from which you can extract Bio::Taxon objects.
37
38							=head1 TODO
39
40							It is possible this module could do something like store the data it builds
41							up to disc. Would that be useful?
42							At any rate, this is why the module is called 'list' and not 'in_memory' or
43							similar.
44
45							=head1 FEEDBACK
46
47							=head2 Mailing Lists
48
49							User feedback is an integral part of the evolution of this and other
50							Bioperl modules. Send your comments and suggestions preferably to
51							the Bioperl mailing list. Your participation is much appreciated.
52
53							bioperl-l@bioperl.org - General discussion
54							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55
56							=head2 Support
57
58							Please direct usage questions or support issues to the mailing list:
59
60							I
61
62							rather than to the module maintainer directly. Many experienced and
63							reponsive experts will be able look at the problem and quickly
64							address it. Please include a thorough description of the problem
65							with code and data examples if at all possible.
66
67							=head2 Reporting Bugs
68
69							Report bugs to the Bioperl bug tracking system to help us keep track
70							of the bugs and their resolution. Bug reports can be submitted via
71							the web:
72
73							https://github.com/bioperl/bioperl-live/issues
74
75							=head1 AUTHOR - Sendu Bala
76
77							Email bix@sendu.me.uk
78
79							=head1 APPENDIX
80
81							The rest of the documentation details each of the object methods.
82							Internal methods are usually preceded with a _
83
84							=cut
85
86							# Let the code begin...
87
88
89							package Bio::DB::Taxonomy::list;
90
91	34			34		191	use strict;
	34					54
	34					912
92	34			34		689	use Bio::Taxon;
	34					56
	34					822
93
94	34			34		137	use base qw(Bio::DB::Taxonomy);
	34					61
	34					37078
95
96							our $prefix = 'list';
97
98
99							=head2 new
100
101							Title : new
102							Usage : my $obj = Bio::DB::Taxonomy::list->new();
103							Function: Builds a new Bio::DB::Taxonomy::list object
104							Returns : an instance of Bio::DB::Taxonomy::list
105							Args : optional, as per the add_lineage() method.
106
107							=cut
108
109							sub new {
110	254			254	1	927	my ($class, @args) = @_;
111	254					1114	my $self = $class->SUPER::new(@args);
112	254					767	my %args = @args;
113	254					629	delete $args{'-source'};
114
115	254	100				1454	$self->add_lineage(%args) if %args;
116
117	254					1244	return $self;
118							}
119
120
121							=head2 add_lineage
122
123							Title : add_lineage
124							Usage : my @names = ('Eukaryota', 'Mammalia', 'Homo', 'Homo sapiens');
125							my @ranks = ('superkingdom', 'class', 'genus', 'species');
126							$db->add_lineage( -names => \@names, -ranks => \@ranks );
127							Function: Add a lineage to the database, where the lineage is described by
128							a list of scientific names in the order root->leaf. The rank of each
129							name can optionally be described by supplying an additional list
130							of rank names in the same order (eg. superkingdom->species).
131							Returns : 1 for success
132							Args : -names => [] : array ref of scientific names, REQUIRED
133							-ranks => [] : array ref of rank names, same order as above, OPTIONAL
134
135							=cut
136
137							sub add_lineage {
138	333			333	1	889	my ($self, @args) = @_;
139	333					1377	my ($names, $ranks) = $self->_rearrange([qw (NAMES RANKS)], @args);
140	333	50	33			1943	$self->throw("-names must be supplied and its value must be an array reference")
141							unless $names && ref($names) eq 'ARRAY';
142
143	333					785	my $names_idx = scalar @$names - 1;
144
145	333	100				863	if ($ranks) {
146	39	50				54	$self->throw("-ranks must be an array reference")
147							unless ref($ranks) eq 'ARRAY';
148	39	50				55	$self->throw("The -names and -ranks lists must be of equal length")
149							unless $names_idx == scalar @$ranks - 1;
150							}
151
152							# This is non-trivial because names are not guaranteed unique in a taxonomy,
153							# and neither are name&rank combinations. Furthermore, different name&rank
154							# combinations can actually refer to the same taxon, eg. when one time
155							# 'Homo'&'genus' is supplied, while another time 'Homo'&'no rank'.
156							#
157							# name&rank&ancestor could well be unique (or good enough 99.9999% of the
158							# time), but we have the added complication that lineages could sometimes be
159							# supplied with differing numbers of taxa. Ideally we want to realise that
160							# the first of these two lineages shares all its nodes with the second:
161							# ('Mammalia', 'Homo', 'Homo sapiens')
162							# ('Mammalia', 'Hominidae', 'Homo', 'Homo sapiens')
163							#
164							# Clearly with limited information we can't do a perfect job, but we can try
165							# and do a reasonable one. So, let's just do the trivial implementation now
166							# and see how bad it is! (assumes ranks are unique except for 'no rank')
167
168	333					661	my $ancestors = $self->{ancestors};
169	333					534	my $node_data = $self->{node_data};
170	333					526	my $name_to_id = $self->{name_to_id};
171	333					532	my $children = $self->{children};
172
173	333					593	my $my_ancestor_id = '';
174	333					509	my @node_ids;
175	333					1036	for my $i (0 .. $names_idx) {
176	3023					3592	my $name = $names->[$i];
177	3023					3089	my $rank = $ranks->[$i]; # if undef, this node has 'no rank'
178
179							# This is a new node with a new id if we haven't seen this name before.
180							# It's also always a new node if this is the first lineage going into
181							# the db.
182							#
183							# We need to handle, however, situations in the future where we try to
184							# merge in a new lineage but we have non-unique names in the lineage
185							# and possible missing classes in some lineages, e.g.
186							# '... Anophelinae, Anopheles, Anopheles, Angusticorn, Anopheles...'
187							# merged with
188							# '... Anophelinae, Anopheles, Angusticorn, Anopheles...'),
189							# but still need the 'tree' to be correct
190
191							# Look for a node that is consistent with this lineage
192	3023					2681	my $node_id;
193	3023					2876	SAME_NAMED: for my $same_id (@{$name_to_id->{$name}}) {
	3023					6502
194
195							# Taxa are the same if it they have the same ancestor or none
196	557		100			792	my $this_ancestor_id = $ancestors->{$same_id} \|\| '';
197	557	100				639	if ($my_ancestor_id eq $this_ancestor_id) {
198	282					244	$node_id = $same_id;
199	282					271	last SAME_NAMED;
200							}
201
202							# Compare children
203	275	100				302	next if $i >= $names_idx; # this taxon has no child
204	223					212	my $my_child_name = $names->[$i + 1];
205							#while ( my ($this_child_id, undef) = each %{$children->{$same_id}} ) {
206	223					183	for my $this_child_id (keys %{$children->{$same_id}}) {
	223					311
207	234	100				372	if ($my_child_name eq $node_data->{$this_child_id}->[0]) { # both children have same name
208	5	50				8	if ($my_ancestor_id) {
209	5					5	my @s_ancestors;
210	5					11	while ($this_ancestor_id = $ancestors->{$this_ancestor_id}) {
211	5	50				7	if ($my_ancestor_id eq $this_ancestor_id) {
212	0					0	$my_ancestor_id = $ancestors->{$same_id};
213	0					0	push @node_ids, @s_ancestors, $my_ancestor_id;
214	0					0	$node_id = $same_id;
215	0					0	last SAME_NAMED;
216							}
217	5					15	unshift @s_ancestors, $this_ancestor_id;
218							}
219							} else {
220							# This new lineage (@$names) doesn't start at the
221							# same root as the existing lineages. Assuming
222							# '$name' corresponds to node $same_id");
223	0					0	$node_id = $same_id;
224	0					0	last SAME_NAMED;
225							}
226							}
227							}
228							}
229
230	3023	100				4677	if (not defined $node_id) {
231							# This is a new node. Add it to the database, using the prefix 'list'
232							# for its ID to distinguish it from the IDs from other taxonomies.
233	2741					3575	my $next_num = ++$self->{node_ids};
234	2741					3507	$node_id = $prefix.$next_num;
235	2741					2642	push @{$self->{name_to_id}->{$name}}, $node_id;
	2741					5853
236	2741					7388	$self->{node_data}->{$node_id}->[0] = $name;
237							}
238
239	3023	100	100			5037	if ( (defined $rank) && (not defined $node_data->{$node_id}->[1]) ) {
240							# Save rank if node in database has no rank but the current node has one
241	114					127	$self->{node_data}->{$node_id}->[1] = $rank;
242							}
243
244	3023	100				4106	if ($my_ancestor_id) {
245	2690	50	66			4847	if ($self->{ancestors}->{$node_id} && $self->{ancestors}->{$node_id} ne $my_ancestor_id) {
246	0					0	$self->throw("The lineage '".join(', ', @$names)."' and a ".
247							"previously stored lineage share a node name but have ".
248							"different ancestries for that node. Can't cope!");
249							}
250	2690					5103	$self->{ancestors}->{$node_id} = $my_ancestor_id;
251							}
252
253	3023					3101	$my_ancestor_id = $node_id;
254	3023					4688	push @node_ids, $node_id;
255							}
256
257							# Go through the lineage in reverse so we can remember the children
258	333					1106	for (my $i = $names_idx - 1; $i >= 0; $i--) {
259	2690					6757	$self->{children}->{$node_ids[$i]}->{$node_ids[$i+1]} = undef;
260							}
261	333					1527	return 1;
262							}
263
264
265							=head2 Bio::DB::Taxonomy Interface implementation
266
267							=head2 get_num_taxa
268
269							Title : get_num_taxa
270							Usage : my $num = $db->get_num_taxa();
271							Function: Get the number of taxa stored in the database.
272							Returns : A number
273							Args : None
274
275							=cut
276
277							sub get_num_taxa {
278	0			0	1	0	my ($self) = @_;
279	0		0			0	return $self->{node_ids} \|\| 0;
280							}
281
282
283							=head2 get_taxon
284
285							Title : get_taxon
286							Usage : my $taxon = $db->get_taxon(-taxonid => $taxonid)
287							Function: Get a Bio::Taxon object from the database.
288							Returns : Bio::Taxon object
289							Args : A single value which is the ID of the taxon to retrieve
290							OR named args, as follows:
291							-taxonid => Taxonomy ID (NB: these are not NCBI taxonomy ids but
292							'list' pre-fixed ids unique to the list database).
293							OR
294							-name => String (to query by a taxonomy name). A given taxon name
295							can match different taxonomy objects. When that is the
296							case, a warning is displayed and the first matching taxon
297							is reported. See get_taxonids() to get all matching taxon
298							IDs.
299							OR
300							-names => Array ref of lineage names, like in add_lineage(). To
301							overcome the limitations of -name, you can use -names to
302							provide the full lineage of the taxon you want and get a
303							unique, unambiguous taxon object.
304
305							=cut
306
307							sub get_taxon {
308	6914			6914	1	13900	my ($self, @args) = @_;
309
310	6914					7934	my $taxonid;
311	6914	100				11093	if (scalar @args == 1) {
312							# Argument is a taxon ID
313	2156					2837	$taxonid = $args[0];
314							} else {
315							# Got named arguments
316	4758					6025	my ($name, $names);
317	4758					14156	($taxonid, $name, $names) = $self->_rearrange([qw(TAXONID NAME NAMES)], @args);
318	4758	100				10764	if ($name) {
319	2					6	$names = [$name];
320							}
321	4758	100				7789	if ($names) {
322	2					3	$name = $names->[-1];
323
324	2					9	my @taxonids = $self->get_taxonids($name);
325	2					4	$taxonid = $taxonids[0];
326
327							# Use provided lineage to find correct ID amongst several matching IDs
328	2	50	33			6	if ( (scalar @taxonids > 1) && (scalar @$names > 1) ) {
329	0					0	for my $query_taxonid (@taxonids) {
330	0					0	my $matched = 1;
331	0					0	my $db_ancestor = $self->get_taxon($query_taxonid);
332	0					0	for (my $i = $#$names-1; $i >= 0; $i--) {
333	0					0	my $query_ancestor_name = $names->[$i];
334	0					0	$db_ancestor = $db_ancestor->ancestor;
335	0					0	my $db_ancestor_name = '';
336	0	0				0	if ($db_ancestor) {
337	0					0	$db_ancestor_name = $db_ancestor->node_name;
338							}
339	0	0				0	if (not ($query_ancestor_name eq $db_ancestor_name) ) {
340	0					0	$matched = 0;
341	0					0	last; # done testing this taxonid
342							}
343							}
344	0	0				0	if ($matched == 1) {
345	0					0	@taxonids = [$query_taxonid];
346	0					0	$taxonid = $query_taxonid;
347	0					0	last; # done testing all taxonids
348							}
349							}
350							}
351
352							# Warn if several taxon IDs matched
353	2	50				9	if (scalar @taxonids > 1) {
354	0	0				0	$self->warn("There were multiple ids (@taxonids) matching '$name',".
355							" using '$taxonid'") if scalar @taxonids > 1;
356							}
357
358							}
359							}
360
361							# Now that we have the taxon ID, retrieve the corresponding Taxon object
362	6914					7504	my $taxon;
363	6914					10702	my $node = $self->{node_data}->{$taxonid};
364	6914	100				11086	if ($node) {
365	6910					11054	my ($sci_name, $rank) = @$node;
366	6910					19204	$taxon = Bio::Taxon->new(
367							-name => $sci_name,
368							-object_id => $taxonid, # not an ncbi taxid, simply an object id
369							);
370
371	6910	100				12583	if ($rank) {
372	53					100	$taxon->rank($rank);
373							}
374
375							# we can't use -dbh or the db_handle() method ourselves or we'll go
376							# infinite on the merge attempt
377	6910					9792	$taxon->{'db_handle'} = $self;
378
379	6910					15257	$self->_handle_internal_id($taxon, 1);
380							}
381
382	6914					17557	return $taxon;
383							}
384
385							*get_Taxonomy_Node = \&get_taxon;
386
387
388							=head2 get_taxonids
389
390							Title : get_taxonids
391							Usage : my @taxonids = $db->get_taxonids('Homo sapiens');
392							Function: Searches for a taxonid (generated by the list module) based on a
393							query string. Note that multiple taxonids can match to the same
394							supplied name.
395							Returns : array of integer ids in list context, one of these in scalar context
396							Args : string representing taxon's name
397
398							=cut
399
400							sub get_taxonids {
401	259			259	1	1857	my ($self, $name) = @_;
402	259	100				741	return wantarray() ? @{$self->{name_to_id}->{$name} \|\| []} : $self->{name_to_id}->{$name}->[0];
	259	50				1218
403							}
404
405							*get_taxonid = \&get_taxonids;
406
407
408							=head2 ancestor
409
410							Title : ancestor
411							Usage : my $ancestor_taxon = $db->ancestor($taxon)
412							Function: Retrieve the full ancestor taxon of a supplied Taxon from the
413							database.
414							Returns : Bio::Taxon
415							Args : Bio::Taxon (that was retrieved from this database)
416
417							=cut
418
419							sub ancestor {
420	4500			4500	1	6578	my ($self, $taxon) = @_;
421	4500	50				7419	$taxon \|\| return; # for bug 2092, or something similar to it at least: shouldn't need this!
422	4500	50	33			17189	$self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
423	4500	50	33			8067	$self->throw("The supplied Taxon must belong to this database")
424							unless $taxon->db_handle && $taxon->db_handle eq $self;
425	4500		33			8313	my $id = $taxon->id \|\| $self->throw("The supplied Taxon is missing its id!");
426
427	4500		100			15262	my $ancestor_id = $self->{ancestors}->{$id} \|\| return;
428	2131					3646	return $self->get_taxon($ancestor_id);
429							}
430
431
432							=head2 each_Descendent
433
434							Title : each_Descendent
435							Usage : my @taxa = $db->each_Descendent($taxon);
436							Function: Get all the descendents of the supplied Taxon (but not their
437							descendents, ie. not a recursive fetchall).
438							Returns : Array of Bio::Taxon objects
439							Args : Bio::Taxon (that was retrieved from this database)
440
441							=cut
442
443							sub each_Descendent {
444	18			18	1	37	my ($self, $taxon) = @_;
445	18	50	33			83	$self->throw("Must supply a Bio::Taxon") unless ref($taxon) && $taxon->isa('Bio::Taxon');
446	18	50	33			35	$self->throw("The supplied Taxon must belong to this database")
447							unless $taxon->db_handle && $taxon->db_handle eq $self;
448	18		33			34	my $id = $taxon->id \|\| $self->throw("The supplied Taxon is missing its id!");
449
450	18					23	my @children;
451	18					21	while ( my ($child_id, undef) = each %{$self->{children}->{$id}} ) {
	38					121
452	20		50			41	push @children, ($self->get_taxon($child_id) \|\| next);
453							}
454
455	18					54	return @children;
456							}
457
458
459							1;