File Coverage

Bio/DB/Fasta.pm

Criterion	Covered	Total	%
statement	132	139	94.9
branch	29	40	72.5
condition	14	21	66.6
subroutine	22	24	91.6
pod	1	2	50.0
total	198	226	87.6

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Fasta
3							#
4							# You may distribute this module under the same terms as perl itself
5							#
6
7
8							=head1 NAME
9
10							Bio::DB::Fasta - Fast indexed access to fasta files
11
12							=head1 SYNOPSIS
13
14							use Bio::DB::Fasta;
15
16							# Create database from a directory of Fasta files
17							my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
18							my @ids = $db->get_all_primary_ids;
19
20							# Simple access
21							my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
22							my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
23							my $length = $db->length('CHROMOSOME_I');
24							my $header = $db->header('CHROMOSOME_I');
25							my $alphabet = $db->alphabet('CHROMOSOME_I');
26
27							# Access to sequence objects. See Bio::PrimarySeqI.
28							my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
29							my $seqstr = $seq->seq;
30							my $subseq = $seq->subseq(4_000_000 => 4_100_000);
31							my $trunc = $seq->trunc(4_000_000 => 4_100_000);
32							my $length = $seq->length;
33
34							# Loop through sequence objects
35							my $stream = $db->get_PrimarySeq_stream;
36							while (my $seq = $stream->next_seq) {
37							# Bio::PrimarySeqI stuff
38							}
39
40							# Filehandle access
41							my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
42							while (my $seq = <$fh>) {
43							# Bio::PrimarySeqI stuff
44							}
45
46							# Tied hash access
47							tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
48							print $sequences{'CHROMOSOME_I:1,20000'};
49
50							=head1 DESCRIPTION
51
52							Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
53							or a directory of files. It provides persistent random access to each sequence
54							entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
55							subsequences within each entry, allowing you to retrieve portions of very large
56							sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
57							based on Bio::DB::IndexedBase. See this module's documentation for details.
58
59							The Fasta files may contain any combination of nucleotide and protein sequences;
60							during indexing the module guesses the molecular type. Entries may have any line
61							length up to 65,536 characters, and different line lengths are allowed in the
62							same file. However, within a sequence entry, all lines must be the same length
63							except for the last. An error will be thrown if this is not the case.
64
65							The module uses /^E(\S+)/ to extract the primary ID of each sequence
66							from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
67							routine to reversibly modify this primary ID, e.g. if you wish to extract a
68							specific portion of the gi\|gb\|abc\|xyz GenBank IDs.
69
70							=head1 DATABASE CREATION AND INDEXING
71
72							The object-oriented constructor is new(), the filehandle constructor is newFh()
73							and the tied hash constructor is tie(). They all allow to index a single Fasta
74							file, several files, or a directory of files. See Bio::DB::IndexedBase.
75
76							=head1 SEE ALSO
77
78							L
79
80							L
81
82							L
83
84							=head1 AUTHOR
85
86							Lincoln Stein Elstein@cshl.orgE.
87
88							Copyright (c) 2001 Cold Spring Harbor Laboratory.
89
90							This library is free software; you can redistribute it and/or modify
91							it under the same terms as Perl itself. See DISCLAIMER.txt for
92							disclaimers of warranty.
93
94							=head1 APPENDIX
95
96							The rest of the documentation details each of the object
97							methods. Internal methods are usually preceded with a _
98
99							For BioPerl-style access, the following methods are provided:
100
101							=head2 get_Seq_by_id
102
103							Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
104							Usage : my $seq = $db->get_Seq_by_id($id);
105							Function: Given an ID, fetch the corresponding sequence from the database.
106							Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
107							Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
108							only load the sequence string into memory when requested using seq().
109							See L for methods provided by the sequence objects
110							returned from get_Seq_by_id() and get_PrimarySeq_stream().
111							Args : ID
112
113							=head2 get_PrimarySeq_stream
114
115							Title : get_PrimarySeq_stream
116							Usage : my $stream = $db->get_PrimarySeq_stream();
117							Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
118							single method, next_seq(). Each call to next_seq() returns a new
119							Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
120							Returns : A Bio::DB::Indexed::Stream object
121							Args : None
122
123							=head1
124
125							For simple access, the following methods are provided:
126
127							=cut
128
129
130							package Bio::DB::Fasta;
131
132	1			1		3	use strict;
	1					1
	1					23
133	1			1		394	use IO::File;
	1					767
	1					88
134	1			1		5	use File::Spec;
	1					1
	1					15
135	1			1		331	use Bio::PrimarySeqI;
	1					2
	1					29
136
137	1			1		5	use base qw(Bio::DB::IndexedBase);
	1					2
	1					663
138
139							our $obj_class = 'Bio::PrimarySeq::Fasta';
140							our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
141
142
143							=head2 new
144
145							Title : new
146							Usage : my $db = Bio::DB::Fasta->new( $path, %options);
147							Function: Initialize a new database object. When indexing a directory, files
148							ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
149							Returns : A new Bio::DB::Fasta object.
150							Args : A single file, or path to dir, or arrayref of files
151							Optional arguments: see Bio::DB::IndexedBase
152
153							=cut
154
155
156							sub _calculate_offsets {
157							# Bio::DB::IndexedBase calls this to calculate offsets
158	53			53		125	my ($self, $fileno, $file, $offsets) = @_;
159
160	53	50				541	my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
161	53					5620	binmode $fh;
162	53	50				203	warn "Indexing $file\n" if $self->{debug};
163	53					70	my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
164							$last_line, %offsets);
165	53					190	my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
166
167	53					136	my $termination_length = $self->{termination_length};
168	53					895	while (my $line = <$fh>) {
169							# Account for crlf-terminated Windows files
170	36648	100				88521	if (index($line, '>') == 0) {
		100
171	3009	100				8618	if ($line =~ /^>(\S+)/) {
172							print STDERR "Indexed $count sequences...\n"
173	3008	50	33			6240	if $self->{debug} && (++$count%1000) == 0;
174
175							# please, do not enforce arbitrary line length requirements.
176							# It's good practice but not enforced.
177
178							#$self->_check_linelength($linelen);
179	3008					3851	my $pos = tell($fh);
180	3008	100				5369	if (@ids) {
181	2956					3243	my $strlen = $pos - $offset - length($line);
182	2956					2687	$strlen -= $termination_length * $seq_lines;
183	2956					2336	my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
	2956					7710
184							$linelen, $headerlen, $alphabet, $fileno);
185	2956					3162	$alphabet = Bio::DB::IndexedBase::NA;
186	2956					3695	for my $id (@ids) {
187	2962					50274	$offsets->{$id} = $ppos;
188							}
189							}
190	3008					7141	@ids = $self->_makeid($line);
191	3008					3907	($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
192	3008					3563	($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
193							} else {
194							# Catch bad header lines, bug 3172
195	1					28	$self->throw("FASTA header doesn't match '>(\\S+)': $line");
196							}
197							} elsif ($line !~ /\S/) {
198							# Skip blank line
199	89					74	$blank_lines++;
200	89					333	next;
201							} else {
202							# Need to check every line :(
203	33550					24154	$l3_len = $l2_len;
204	33550					20584	$l2_len = $l_len;
205	33550					20312	$l_len = length $line;
206	33550					21671	if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
207	33550	50	66			118855	if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
			66
208	0					0	my $fap = substr($line, 0, 20)."..";
209	0					0	$self->throw("Each line of the fasta entry must be the same ".
210							"length except the last. Line above #$. '$fap' is $l2_len".
211							" != $l3_len chars.");
212							}
213	33550	100				40920	if ($blank_lines) {
214							# Blank lines not allowed in entry
215	1					9	$self->throw("Blank lines can only precede header lines, ".
216							"found preceding line #$.");
217							}
218							}
219	33549		66			44425	$linelen \|\|= length $line;
220	33549		66			39936	$alphabet \|\|= $self->_guess_alphabet($line);
221	33549					22163	$seq_lines++;
222							}
223	36557					84267	$last_line = $line;
224							}
225
226							# Process last entry
227	51					290	$self->_check_linelength($linelen);
228	51					106	my $pos = tell $fh;
229	51	50				138	if (@ids) {
230	51					82	my $strlen = $pos - $offset;
231	51	100				114	if ($linelen == 0) { # yet another pesky empty chr_random.fa file
232	11					20	$strlen = 0;
233							} else {
234	40	50				240	if ($last_line !~ /\s$/) {
235	0					0	$seq_lines--;
236							}
237	40					82	$strlen -= $termination_length * $seq_lines;
238							}
239	51					70	my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
	51					225
240							$headerlen, $alphabet, $fileno);
241	51					130	for my $id (@ids) {
242	52					1020	$offsets->{$id} = $ppos;
243							}
244							}
245
246	51					2395	return \%offsets;
247							}
248
249							=head2 seq
250
251							Title : seq, sequence, subseq
252							Usage : # Entire sequence string
253							my $seqstr = $db->seq($id);
254							# Subsequence
255							my $subseqstr = $db->seq($id, $start, $stop, $strand);
256							# or...
257							my $subseqstr = $db->seq($compound_id);
258							Function: Get a subseq of a sequence from the database. For your convenience,
259							the sequence to extract can be specified with any of the following
260							compound IDs:
261							$db->seq("$id:$start,$stop")
262							$db->seq("$id:$start..$stop")
263							$db->seq("$id:$start-$stop")
264							$db->seq("$id:$start,$stop/$strand")
265							$db->seq("$id:$start..$stop/$strand")
266							$db->seq("$id:$start-$stop/$strand")
267							$db->seq("$id/$strand")
268							In the case of DNA or RNA sequence, if $stop is less than $start,
269							then the reverse complement of the sequence is returned. Avoid using
270							it if possible since this goes against Bio::Seq conventions.
271							Returns : A string
272							Args : ID of sequence to retrieve
273							or
274							Compound ID of subsequence to fetch
275							or
276							ID, optional start (defaults to 1), optional end (defaults to length
277							of sequence) and optional strand (defaults to 1).
278
279							=cut
280
281							sub subseq {
282	29			29	0	841	my ($self, $id, $start, $stop, $strand) = @_;
283	29	50				69	$self->throw('Need to provide a sequence ID') if not defined $id;
284	29					85	($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
285
286	29					44	my $data;
287
288	29	100				81	my $fh = $self->_fh($id) or return;
289	28					99	my $filestart = $self->_calc_offset($id, $start);
290	28					63	my $filestop = $self->_calc_offset($id, $stop );
291
292	28					106	seek($fh, $filestart,0);
293	28					280	read($fh, $data, $filestop-$filestart+1);
294
295	28					86	$data = Bio::DB::IndexedBase::_strip_crnl($data);
296
297	28	100				63	if ($strand == -1) {
298							# Reverse-complement the sequence
299	7					24	$data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
300							}
301	28					130	return $data;
302							}
303
304							seq = sequence = \&subseq;
305
306
307							=head2 length
308
309							Title : length
310							Usage : my $length = $qualdb->length($id);
311							Function: Get the number of residues in the indicated sequence.
312							Returns : Number
313							Args : ID of entry
314
315							=head2 header
316
317							Title : header
318							Usage : my $header = $db->header($id);
319							Function: Get the header line (ID and description fields) of the specified
320							sequence.
321							Returns : String
322							Args : ID of sequence
323
324							=cut
325
326							sub header {
327	2			2	1	6	my ($self, $id) = @_;
328	2	50				8	$self->throw('Need to provide a sequence ID') if not defined $id;
329	2					10	my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
	2					8
330	2					6	$offset -= $headerlen;
331	2					2	my $data;
332	2	50				8	my $fh = $self->_fh($id) or return;
333	2					9	seek($fh, $offset, 0);
334	2					16	read($fh, $data, $headerlen);
335							# On Windows chomp remove '\n' but leaves '\r'
336							# when reading '\r\n' in binary mode
337	2					6	$data = Bio::DB::IndexedBase::_strip_crnl($data);
338	2					7	substr($data, 0, 1) = '';
339	2					8	return $data;
340							}
341
342
343							=head2 alphabet
344
345							Title : alphabet
346							Usage : my $alphabet = $db->alphabet($id);
347							Function: Get the molecular type of the indicated sequence: dna, rna or protein
348							Returns : String
349							Args : ID of sequence
350
351							=cut
352
353
354							#-------------------------------------------------------------
355							# Bio::PrimarySeqI compatibility
356							#
357							package Bio::PrimarySeq::Fasta;
358	1			1		16	use overload '""' => 'display_id';
	1					2
	1					19
359
360	1			1		120	use base qw(Bio::Root::Root Bio::PrimarySeqI);
	1					2
	1					1016
361
362							sub new {
363	18			18		38	my ($class, @args) = @_;
364	18					64	my $self = $class->SUPER::new(@args);
365	18					97	my ($db, $id, $start, $stop) = $self->_rearrange(
366							[qw(DATABASE ID START STOP)],
367							@args);
368	18					36	$self->{db} = $db;
369	18					27	$self->{id} = $id;
370	18		100			56	$self->{stop} = $stop \|\| $db->length($id);
371	18		66			87	$self->{start} = $start \|\| ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
372	18					72	return $self;
373							}
374
375							sub fetch_sequence {
376	0			0		0	return shift->seq(@_);
377							}
378
379							sub seq {
380	4			4		7	my $self = shift;
381	4					20	return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
382							}
383
384							sub subseq {
385	1			1		3	my $self = shift;
386	1					5	return $self->trunc(@_)->seq();
387							}
388
389							sub trunc {
390							# Override Bio::PrimarySeqI trunc() method. This way, we create an object
391							# that does not store the sequence in memory.
392	2			2		5	my ($self, $start, $stop) = @_;
393	2	50				7	$self->throw("Stop cannot be smaller than start") if $stop < $start;
394	2	50				13	if ($self->{start} <= $self->{stop}) {
395	2					5	$start = $self->{start}+$start-1;
396	2					4	$stop = $self->{start}+$stop-1;
397							} else {
398	0					0	$start = $self->{start}-($start-1);
399	0					0	$stop = $self->{start}-($stop-1);
400							}
401	2					8	return $self->new( $self->{db}, $self->{id}, $start, $stop );
402							}
403
404							sub is_circular {
405	1			1		3	my $self = shift;
406	1					3	return $self->{is_circular};
407							}
408
409							sub display_id {
410	9			9		798	my $self = shift;
411	9					30	return $self->{id};
412							}
413
414							sub accession_number {
415	1			1		3	my $self = shift;
416	1					5	return 'unknown';
417							}
418
419							sub primary_id {
420							# Following Bio::PrimarySeqI, since this sequence has no accession number,
421							# its primary_id should be a stringified memory location.
422	1			1		3	my $self = shift;
423	1					11	return overload::StrVal($self);
424							}
425
426							sub can_call_new {
427	0			0		0	return 0;
428							}
429
430							sub alphabet {
431	1			1		577	my $self = shift;
432	1					7	return $self->{db}->alphabet($self->{id});
433							}
434
435							sub revcom {
436							# Override Bio::PrimarySeqI revcom() with optimized method.
437	1			1		3	my $self = shift;
438	1					2	return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
	1					3
439							}
440
441							sub length {
442							# Get length from sequence location, not the sequence string (too expensive)
443	2			2		3	my $self = shift;
444							return $self->{start} < $self->{stop} ?
445							$self->{stop} - $self->{start} + 1 :
446	2	100				15	$self->{start} - $self->{stop} + 1 ;
447							}
448
449							sub description {
450	1			1		3	my $self = shift;
451	1					8	my $header = $self->{'db'}->header($self->{id});
452							# Remove the ID from the header
453	1					14	return (split(/\s+/, $header, 2))[1];
454							}
455							*desc = \&description;
456
457							1;