File Coverage

Bio/DB/Taxonomy/greengenes.pm

Criterion	Covered	Total	%
statement	32	32	100.0
branch	7	8	87.5
condition	2	2	100.0
subroutine	4	4	100.0
pod	1	1	100.0
total	46	47	97.8

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Taxonomy::greengenes
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Florent Angly
7							#
8							# Copyright Florent Angly
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::DB::Taxonomy::greengenes - Use the Greengenes taxonomy
17
18							=head1 SYNOPSIS
19
20							use Bio::DB::Taxonomy;
21
22							my $db = Bio::DB::Taxonomy->new(
23							-source => 'greengenes',
24							-taxofile => 'taxonomy_16S_candiv_gg_2011_1.txt'
25							);
26
27							=head1 DESCRIPTION
28
29							I
30
31							Bio::DB::Taxonomy::greengenes is an implementation of Bio::DB::Taxonomy which
32							stores and accesses the Greengenes taxonomy of Bacteria and Archaea. Internally,
33							it keeps the taxonomy into memory by using Bio::DB::Taxonomy::list. As a
34							consequence, note that the IDs assigned to the taxonomy nodes, e.g. gg123, are
35							arbitrary, contrary to the pre-defined IDs that NCBI assigns to taxons.
36
37							The latest release of the Greengene taxonomy (2011) contains about 4,600 taxa
38							and occupies about 4MB of memory once parsed into a Bio::DB::Taxonomy::greengenes
39							object. The taxonomy files taxonomy_16S_all_gg_2011_1.txt and
40							taxonomy_16S_candiv_gg_2011_1.txt that this module can use are available from
41							L.
42
43							=head1 FEEDBACK
44
45							=head2 Mailing Lists
46
47							User feedback is an integral part of the evolution of this and other
48							Bioperl modules. Send your comments and suggestions preferably to
49							the Bioperl mailing list. Your participation is much appreciated.
50
51							bioperl-l@bioperl.org - General discussion
52							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53
54							=head2 Support
55
56							Please direct usage questions or support issues to the mailing list:
57
58							I
59
60							rather than to the module maintainer directly. Many experienced and
61							reponsive experts will be able look at the problem and quickly
62							address it. Please include a thorough description of the problem
63							with code and data examples if at all possible.
64
65							=head2 Reporting Bugs
66
67							Report bugs to the Bioperl bug tracking system to help us keep track
68							of the bugs and their resolution. Bug reports can be submitted via
69							the web:
70
71							https://github.com/bioperl/bioperl-live/issues
72
73							=head1 AUTHOR - Florent Angly
74
75							florent.angly@gmail.com
76
77							=head1 APPENDIX
78
79							The rest of the documentation details each of the object methods.
80							Internal methods are usually preceded with a _
81
82							=cut
83
84							# Let the code begin...
85
86							package Bio::DB::Taxonomy::greengenes;
87
88	1			1		3	use strict;
	1					1
	1					26
89	1			1		3	use base qw(Bio::DB::Taxonomy Bio::DB::Taxonomy::list);
	1					1
	1					335
90
91							$Bio::DB::Taxonomy::list::prefix = 'gg';
92
93
94							=head2 new
95
96							Title : new
97							Usage : my $obj = Bio::DB::Taxonomy::greengenes->new();
98							Function: Builds a new Bio::DB::Taxonomy::greengenes object
99							Returns : an instance of Bio::DB::Taxonomy::greengenes
100							Args : -taxofile => name of the file containing the taxonomic information,
101							typically 'taxonomy_16S_candiv_gg_2011_1.txt' (mandatory)
102
103							=cut
104
105							sub new {
106							# Override Bio::DB::Taxonomy
107	2			2	1	4	my($class, @args) = @_;
108	2					7	my $self = $class->SUPER::new(@args);
109	2					8	my ($taxofile) = $self->_rearrange([qw(TAXOFILE)], @args);
110
111	2	100				5	if ( $taxofile ) {
112	1					2	$self = $self->_build_taxonomy($taxofile);
113							}
114
115	2					11	return $self;
116							}
117
118
119							sub _build_taxonomy {
120	1			1		1	my ($self, $taxofile) = @_;
121
122	1					2	my $all_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'];
123
124	1					2	my $taxonomy = Bio::DB::Taxonomy::list->new();
125
126	1	50				36	open my $fh, '<', $taxofile or $self->throw("Could not read file '$taxofile': $!");
127
128							# Will skip header line: prokMSA_id taxonomy
129	1					1	my $prev_taxo_string = 'taxonomy';
130
131	1					1	my $line;
132
133							# Parse taxonomy lines. Example:
134							# 348902 k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__Bacteroides plebeius
135	1					20	while ($line = <$fh>) {
136	51					39	chomp $line;
137	51					98	my ($prokmsa_id, $taxo_string) = split "\t", $line;
138
139							# Skip taxonomy string already seen on previous line (much faster!)
140	51	100				84	next if $taxo_string eq $prev_taxo_string;
141	40					33	$prev_taxo_string = $taxo_string;
142
143							# Remove ambiguous taxons, i.e. go from:
144							# k__Archaea; p__pMC2A384; c__; o__; f__; g__; s__
145							# to:
146							# k__Archaea; p__pMC2A384
147	40					194	my $names = [split /;\s*/, $taxo_string];
148	40		100			117	while ( ($names->[-1] \|\| '') =~ m/__$/) {
149	50					101	pop @$names;
150							}
151
152	40					30	my $nof_ranks = scalar @$names;
153	40	100				51	next if $nof_ranks < 1;
154
155							$taxonomy->add_lineage(
156	39					56	-ranks => [ @{$all_ranks}[0..$nof_ranks-1] ],
	39					118
157							-names => $names,
158							);
159
160							}
161
162	1					28	close $fh;
163
164	1					5	return $taxonomy;
165							}
166
167
168							1;