File Coverage

blib/lib/Bio/LITE/Taxonomy/NCBI/Gi2taxid.pm
Criterion Covered Total %
statement 92 100 92.0
branch 24 38 63.1
condition 8 16 50.0
subroutine 13 13 100.0
pod 3 3 100.0
total 140 170 82.3


line stmt bran cond sub pod time code
1             package Bio::LITE::Taxonomy::NCBI::Gi2taxid;
2              
3             =head1 NAME
4              
5             Bio::LITE::Taxonomy::NCBI::Gi2taxid - Mappings of NCBI GI's to Taxids fast and with very low memory footprint.
6              
7             =head1 SYNOPSIS
8              
9             Creation of a new Taxid to GI dictionary (binary mapping file):
10              
11             use Bio::LITE::Taxonomy::NCBI::Gi2taxid qw/new_dict/;
12              
13             new_dict (in => "gi_taxid_prot.dmp",
14             out => "gi_taxid_prot.bin");
15              
16             Usage of the dictionary:
17              
18             use Bio::LITE::Taxonomy::NCBI::Gi2taxid;
19              
20             my $dict = Bio::LITE::Taxonomy::NCBI::Gi2taxid->new(dict=>"gi_taxid_prot.bin");
21             my $taxid = $dict->get_taxid(12553);
22              
23             =head1 DESCRIPTION
24              
25             The NCBI site offers a file to map gene and protein sequences (GIs) with their corresponding taxon of origin (Taxids). If you want to use this information inside a Perl script you will find that (given the high amount of sequences available) it is fairly inefficient to store this information in, for example, a regular hash. Only for creating such a hash you will need more than 10 GBs of system memory.
26              
27             This is a very simple module that has been designed to efficiently map NCBI GIs to Taxids with speed as the primary goal. It is designed to retrieve taxids from GIs very fast and with low memory usage. It is even faster than using a SQL database to retrieve the mappings or using a local DBHash.
28              
29             To achieve this, it uses a binary index that can be created with the function C. This index has to be created one time for each mapping file.
30              
31             The original mapping files can be downloaded from the NCBI site at the following address: L.
32              
33             =head1 FUNCTIONS
34              
35             =head2 C
36              
37             This function creates a new binary dictionary from the NCBI mapping file. The file should be uncompressed before being passed to the script. The function accepts the following parameters:
38              
39             *NOTE* From version 0.05, the lib uses a more compacted memory file. This means that binary files created with earlier versions will not work with this one and vice-versa. You need to create the new binary db with this version.
40              
41             =over 4
42              
43             =item in
44              
45             This is the uncompressed mapping file from the NCBI. The function accepts a filename or a filehandle
46              
47             =item out
48              
49             Optional. Where the binary dictionary is going to be printed. The function accepts a filename or a filehandle (that should be opened with writing permissions). If absent STDOUT will be assumed.
50              
51             =item chunk_size
52              
53             Optional. While bin conversion, the lib stores chunks of data in memory to speed up the conversion. This number specifies the size of the chunks. By default 30Mb is used. The whole chunk is stored in a Perl scalar so be careful not to overflow the scalar capacity.
54              
55             =back
56              
57             =head1 CONSTRUCTOR
58              
59             =head2 C
60              
61             Once the binary dictionary is created it can be used as an object using this constructor. It accepts the following parameters
62              
63             =over 4
64              
65             =item dict
66              
67             This is the binary dictionary obtained with the C function. The name of the file or a filehandle is accepted.
68              
69             =item save_mem
70              
71             Optional. Use this option to avoid to load the binary dictionary into memory. This will save almost 1GB of system memory but looking up for Taxids will be ~20% slower. This option is I by default.
72              
73             =back
74              
75             =head1 METHODS
76              
77             =head2 C
78              
79             This method receives a GI and returns the corresponding Taxid.
80              
81             =head1 SEE ALSO
82              
83             L
84             L
85              
86             =head1 AUTHOR
87              
88             Miguel Pignatelli
89              
90             Any comments should be addressed to emepyc@gmail.com
91              
92             =head1 LICENSE
93              
94             Copyright 2013 Miguel Pignatelli, all rights reserved.
95              
96             This library is free software; you may redistribute it and/or modify it under the same terms as Perl itself.
97              
98              
99             =cut
100              
101 3     3   48797 use strict;
  3         6  
  3         126  
102 3     3   16 use warnings;
  3         5  
  3         158  
103 3     3   17 use Carp qw/croak/;
  3         9  
  3         205  
104 3     3   1700 use File::Tail;
  3         71411  
  3         168  
105 3     3   1999 use POSIX;
  3         15960  
  3         18  
106              
107 3     3   7616 use Exporter;
  3         6  
  3         138  
108 3     3   15 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
  3         4  
  3         1155  
109             @ISA = qw(Exporter);
110              
111             @EXPORT = (); # Only qualified exports are allowed
112             @EXPORT_OK = qw(new_dict);
113             $VERSION = '0.13';
114              
115             sub new
116             {
117 3     3 1 1093 my ($class, %args) = @_;
118 3         3 my %opts;
119              
120 3 50       8 $args{'dict'} or croak "Need the dictionary file";
121 3 50       41 croak "\nERROR\n$args{dict}: File not found\n" unless -e $args{dict};
122              
123 3   50     12 my $save_mem = $args{'save_mem'} || 0;
124 3         4 my $dictfh;
125 3 50 33     22 if ((UNIVERSAL::isa($args{dict}, 'GLOB')) or (ref \$args{dict} eq 'GLOB')) {
126 0         0 $dictfh = $args{dict}; # TO DO: Test flags, the filehandle must be readable
127             } else {
128 3 100       276 croak "\nERROR\nThe file containing the gi <-> taxid correspondences must be converted to binary format. See the documentation of this module for more details." unless (-B $args{dict});
129 2 50       45 open $dictfh, "<:raw", $args{dict} or croak "$!: $args{dict}";
130             }
131              
132 2         5 $opts{fh} = $dictfh;
133 2         5 $opts{save_mem} = $save_mem;
134 2         3 my $self = bless \%opts;
135              
136 2         6 $self -> _build_dict();
137 2         6 return $self;
138             }
139              
140             sub _build_dict
141             {
142 2     2   3 my ($self) = @_;
143 2         2 my $data;
144 2 50       11 $self->{save_mem} && return;
145 2         17 my $n = sysread( $self->{fh}, $data, -s $self->{fh} );
146 2 50       5 croak "Can't read input file" unless ($n);
147 2         5 $self->{dict} = \$data;
148             }
149              
150             sub get_taxid
151             {
152 4     4 1 638 my ($self, $gi) = @_;
153 4         8 return $self->_direct_lookup ($gi);
154             }
155              
156             sub _direct_lookup {
157 4     4   4 my ($self,$gi) = @_;
158 4 50       9 if ($self->{save_mem}){
159 0         0 my $taxidBytes;
160 0         0 sysseek ($self->{fh},$gi*3,0);
161 0         0 sysread($self->{fh},$taxidBytes,3);
162 0         0 my ($taxid1, $taxid2, $taxid3) = unpack "CCC", $taxidBytes;
163 0         0 return $taxid3 | $taxid2 << 8 | $taxid1 << 16 | 0 << 24 ;
164             } else {
165 3     3   15 no warnings qw/substr/;
  3         3  
  3         1550  
166 4         4 my $v = substr(${$self->{dict}}, $gi*3, 3);
  4         14  
167 4 50       11 return 0 unless ($v);
168 4         18 my ($taxid1, $taxid2, $taxid3) = unpack "CCC", $v;
169 4         13 my $taxid = $taxid3 | $taxid2 << 8 | $taxid1 << 16 | 0 << 24 ;
170 4         15 return $taxid;
171             }
172             }
173              
174             sub new_dict {
175 3     3 1 1989 my (%args) = @_;
176              
177             # TO DO -> Multiple inputs should be allowed
178 3 100       154 defined $args{in} or croak "No input dictionary file provided";
179              
180 2         4 my $outfh;
181 2 50 66     26 if (! defined $args{out}) {
    100          
182 0         0 $outfh = \*STDOUT
183             } elsif ((UNIVERSAL::isa($args{out}, 'GLOB')) or (ref \$args{out} eq 'GLOB')) {
184 1         4 $outfh = $args{out}; ## TO DO: Test flags, the filehandle must be writeable
185             } else {
186 1 50       76 open $outfh, ">", $args{out} or croak $!;
187             }
188              
189 2         3 my $infh;
190 2 50 33     29 if ((UNIVERSAL::isa($args{in}, 'GLOB')) or (ref \$args{in} eq 'GLOB')) {
191 0         0 $infh = $args{in}; ## TO DO: Test flags, the filehandle must be readable
192             } else {
193 2 50       74 open $infh, "<", $args{in} or croak $!;
194             }
195              
196 2 50       20 my $ftail = File::Tail->new(name=>$args{in},tail=>1) or croak "$!: $args{in}";
197 2         879 my $last_line = $ftail->read();
198 2 50       48 croak "$args{in} is empty" unless (defined $last_line);
199 2         9 my ($last_val) = split /\t/, $last_line;
200              
201 2   50     6 my $chunk_size = $args{chunk_size} || 30000000; # 30Mb -- 10 million records per chunk
202 2         16 $chunk_size = floor($chunk_size / 3);
203              
204 2         7 my $bin_chunk = "\0" x (3 * ($chunk_size));
205 2         3 my $n_chunk = 0;
206 2         3 my $chunk_pos = $chunk_size * $n_chunk;
207 2         3 my $chunk_limit = $chunk_size * ($n_chunk + 1);
208 2         23 while (<$infh>) {
209 24         28 chomp;
210 24         55 my ($key,$val) = split /\t/;
211 24         85 my ($taxid1, $taxid2, $taxid3, $taxid4) = unpack("CCCC", pack("N", $val));
212 24 100       53 if ($key >= $chunk_limit) {
213 8         8 print {$outfh} $bin_chunk;
  8         27  
214 8         13 $bin_chunk = "\0" x (3 * ($chunk_size));
215 8         31 $n_chunk++;
216 8         4 $chunk_pos = $chunk_size * $n_chunk;
217 8         8 $chunk_limit = $chunk_size * ($n_chunk + 1);
218             }
219 24         49 substr($bin_chunk, ($key-$chunk_pos)*3, 1, pack("C", $taxid2));
220 24         40 substr($bin_chunk, ($key-$chunk_pos)*3+1, 1, pack("C", $taxid3));
221 24         99 substr($bin_chunk, ($key-$chunk_pos)*3+2, 1, pack("C", $taxid4));
222             }
223 2         2 print {$outfh} $bin_chunk;
  2         3  
224              
225 2         12 close ($infh);
226 2 100 66     21 if (defined $args{out} && ref \$args{out} eq "SCALAR") {
227 1         27 close($outfh)}
228             }
229              
230             1;