File Coverage

Bio/DB/Qual.pm

Criterion	Covered	Total	%
statement	139	150	92.6
branch	29	50	58.0
condition	10	24	41.6
subroutine	21	21	100.0
pod	1	4	25.0
total	200	249	80.3

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Qual
3							#
4							# You may distribute this module under the same terms as perl itself
5							#
6
7							=head1 NAME
8
9							Bio::DB::Qual - Fast indexed access to quality files
10
11							=head1 SYNOPSIS
12
13							use Bio::DB::Qual;
14
15							# create database from directory of qual files
16							my $db = Bio::DB::Qual->new('/path/to/qual/files/');
17							my @ids = $db->get_all_primary_ids;
18
19							# Simple access
20							my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)};
21							my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)};
22							my $length = $db->length('CHROMOSOME_I');
23							my $header = $db->header('CHROMOSOME_I');
24
25							# Access to sequence objects. See Bio::PrimarySeqI.
26							my $obj = $db->get_Qual_by_id('CHROMOSOME_I');
27							my @qual = @{$obj->qual};
28							my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)};
29							my $length = $obj->length;
30
31							# Loop through sequence objects
32							my $stream = $db->get_PrimarySeq_stream;
33							while (my $qual = $stream->next_seq) {
34							# Bio::Seq::PrimaryQual operations
35							}
36
37							# Filehandle access
38							my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/');
39							while (my $qual = <$fh>) {
40							# Bio::Seq::PrimaryQual operations
41							}
42
43							# Tied hash access
44							tie %qualities,'Bio::DB::Qual','/path/to/qual/files/';
45							print $qualities{'CHROMOSOME_I:1,20000'};
46
47							=head1 DESCRIPTION
48
49							Bio::DB::Qual provides indexed access to a single Fasta file, several files,
50							or a directory of files. It provides random access to each quality score entry
51							without having to read the file from the beginning. Access to subqualities
52							(portions of a quality score) is provided, although contrary to Bio::DB::Fasta,
53							the full quality score has to be brought in memory. Bio::DB::Qual is based on
54							Bio::DB::IndexedBase. See this module's documentation for details.
55
56							The qual files should contain decimal quality scores. Entries may have any line
57							length up to 65,536 characters, and different line lengths are allowed in the
58							same file. However, within a quality score entry, all lines must be the same
59							length except for the last. An error will be thrown if this is not the case.
60
61							The module uses /^E(\S+)/ to extract the primary ID of each quality score
62							from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback
63							routine to reversibly modify this primary ID, e.g. if you wish to extract a
64							specific portion of the gi\|gb\|abc\|xyz GenBank IDs.
65
66							=head1 DATABASE CREATION AND INDEXING
67
68							The object-oriented constructor is new(), the filehandle constructor is newFh()
69							and the tied hash constructor is tie(). They all allow to index a single Fasta
70							file, several files, or a directory of files. See Bio::DB::IndexedBase.
71
72							=head1 SEE ALSO
73
74							L
75
76							L
77
78							L
79
80							=head1 LIMITATIONS
81
82							When a quality score is deleted from one of the qual files, this deletion is not
83							detected by the module and removed from the index. As a result, a "ghost" entry
84							will remain in the index and will return garbage results if accessed. Currently,
85							the only way to accommodate deletions is to rebuild the entire index, either by
86							deleting it manually, or by passing -reindex=E1 to new() when
87							initializing the module.
88
89							All quality score lines for a given quality score must have the same length
90							except for the last (not sure why there is this limitation). This is not
91							problematic for sequences but could be annoying for quality scores. A workaround
92							is to make sure that your quality scores fit on no more than 2 lines. Another
93							solution could be to padd them with blank spaces so that each line has the same
94							number of characters (maybe this padding should be implemented in
95							Bio::SeqIO::qual?).
96
97							=head1 AUTHOR
98
99							Florent E Angly Eflorent . angly @ gmail-dot-comE.
100
101							Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein.
102
103							Copyright (c) 2007 Florent E Angly.
104
105							This library is free software; you can redistribute it and/or modify
106							it under the same terms as Perl itself.
107
108							=head1 APPENDIX
109
110							The rest of the documentation details each of the object
111							methods. Internal methods are usually preceded with a _
112
113							For BioPerl-style access, the following methods are provided:
114
115							=head2 get_Seq_by_id
116
117							Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id,
118							get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id,
119							Usage : my $seq = $db->get_Seq_by_id($id);
120							Function: Given an ID, fetch the corresponding sequence from the database.
121							Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
122							Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
123							only load the sequence string into memory when requested using seq().
124							See L for methods provided by the sequence objects
125							returned from get_Seq_by_id() and get_PrimarySeq_stream().
126							Args : ID
127
128							=head2 get_PrimarySeq_stream
129
130							Title : get_Seq_stream, get_PrimarySeq_stream
131							Usage : my $stream = $db->get_Seq_stream();
132							Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
133							single method, next_seq(). Each call to next_seq() returns a new
134							Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
135							Returns : A Bio::DB::Indexed::Stream object
136							Args : None
137
138							=head1
139
140							For simple access, the following methods are provided:
141
142							=cut
143
144
145							package Bio::DB::Qual;
146
147	1			1		3	use strict;
	1					1
	1					22
148	1			1		373	use IO::File;
	1					709
	1					90
149	1			1		5	use File::Spec;
	1					1
	1					16
150
151	1			1		3	use base qw(Bio::DB::IndexedBase);
	1					1
	1					398
152
153							our $obj_class = 'Bio::Seq::PrimaryQual::Qual';
154							our $file_glob = '*.{qual,QUAL,qa,QA}';
155
156
157							=head2 new
158
159							Title : new
160							Usage : my $db = Bio::DB::Qual->new( $path, %options);
161							Function: Initialize a new database object. When indexing a directory, files
162							ending in .qual,qa are indexed by default.
163							Returns : A new Bio::DB::Qual object
164							Args : A single file, or path to dir, or arrayref of files
165							Optional arguments: see Bio::DB::IndexedBase
166
167							=cut
168
169
170							sub _calculate_offsets {
171							# Bio::DB::IndexedBase calls this to calculate offsets
172	9			9		10	my ($self, $fileno, $file, $offsets) = @_;
173
174	9	50				29	my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!");
175	9					395	binmode $fh;
176	9	50				19	warn "Indexing $file\n" if $self->{debug};
177	9					7	my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line,
178							$numres, %offsets);
179	9					10	my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
180
181	9					9	my $termination_length = $self->{termination_length};
182	9					64	while (my $line = <$fh>) {
183							# Account for crlf-terminated Windows files
184	90	100				208	if (index($line, '>') == 0) {
		50
185	45	50				101	if ($line =~ /^>(\S+)/) {
186							print STDERR "Indexed $count quality scores...\n"
187	45	50	33			82	if $self->{debug} && (++$count%1000) == 0;
188	45					80	$self->_check_linelength($linelen);
189	45					45	my $pos = tell($fh);
190	45	100				55	if (@ids) {
191	36					32	my $strlen = $pos - $offset - length($line);
192	36					27	$strlen -= $termination_length * $qual_lines;
193	36					32	my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres,
	36					55
194							$linelen, $headerlen, Bio::DB::IndexedBase::NA, $fileno);
195	36					44	for my $id (@ids) {
196	36					395	$offsets->{$id} = $ppos;
197							}
198	36					40	$numres = 0;
199							}
200	45					82	@ids = $self->_makeid($line);
201	45					47	($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0);
202	45					45	($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
203							} else {
204							# Catch bad header lines, bug 3172
205	0					0	$self->throw("FASTA header doesn't match '>(\\S+)': $line");
206							}
207							} elsif ($line !~ /\S/) {
208							# Skip blank line
209	0					0	$blank_lines++;
210	0					0	next;
211							} else {
212							# Need to check every line :(
213	45					35	$l3_len = $l2_len;
214	45					30	$l2_len = $l_len;
215	45					24	$l_len = length $line;
216	45					28	if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
217	45	0	33			68	if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
			33
218	0					0	my $fap = substr($line, 0, 20)."..";
219	0					0	$self->throw("Each line of the qual entry must be the same ".
220							"length except the last. Line above #$. '$fap' is $l2_len".
221							" != $l3_len chars.");
222							}
223	45	50				58	if ($blank_lines) {
224							# Blank lines not allowed in entry
225	0					0	$self->throw("Blank lines can only precede header lines, ".
226							"found preceding line #$.");
227							}
228							}
229	45		33			111	$linelen \|\|= length $line;
230	45					29	$qual_lines++;
231	45					694	$numres += scalar(split /\s+/, $line);
232							}
233	90					244	$last_line = $line;
234							}
235
236							# Process last entry
237	9					16	$self->_check_linelength($linelen);
238	9					9	my $pos = tell($fh);
239	9	50				15	if (@ids) {
240	9					8	my $strlen = $pos - $offset;
241	9	50				12	if ($linelen == 0) {
242	0					0	$strlen = 0;
243							} else {
244	9	50				24	if ($last_line !~ /\s$/) {
245	0					0	$qual_lines--;
246							}
247	9					9	$strlen -= $termination_length * $qual_lines;
248							}
249	9					9	my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, $linelen,
	9					16
250							$headerlen, Bio::DB::IndexedBase::NA, $fileno);
251	9					13	for my $id (@ids) {
252	9					83	$offsets->{$id} = $ppos;
253							}
254							}
255
256	9					74	return \%offsets;
257							}
258
259
260							# for backward compatibility
261							sub get_PrimaryQual_stream {
262	1			1	0	4	my $self = shift;
263	1					8	return $self->get_PrimarySeq_stream;
264							}
265
266
267							# for backward compatibility
268							sub get_Qual_by_id {
269	2			2	0	4	my ($self, $id) = @_;
270	2					8	return $self->get_Seq_by_id($id);
271							}
272
273							get_qual_by_version = get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id;
274
275
276							=head2 qual
277
278							Title : qual, quality, subqual
279							Usage : # All quality scores
280							my @qualarr = @{$qualdb->subqual($id)};
281							# Subset of the quality scores
282							my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)};
283							# or...
284							my @subqualarr = @{$qualdb->subqual($compound_id)};
285							Function: Get a subqual of an entry in the database. For your convenience,
286							the sequence to extract can be specified with any of the following
287							compound IDs:
288							$db->qual("$id:$start,$stop")
289							$db->qual("$id:$start..$stop")
290							$db->qual("$id:$start-$stop")
291							$db->qual("$id:$start,$stop/$strand")
292							$db->qual("$id:$start..$stop/$strand")
293							$db->qual("$id:$start-$stop/$strand")
294							$db->qual("$id/$strand")
295							If $stop is less than $start, then the reverse complement of the
296							sequence is returned. Avoid using it if possible since this goes
297							against Bio::Seq conventions.
298							Returns : Reference to an array of quality scores
299							Args : Compound ID of entry to retrieve
300							or
301							ID, optional start (defaults to 1), optional end (defaults to the
302							number of quality scores for this sequence), and strand (defaults to
303							1).
304
305							=cut
306
307							sub subqual {
308	16			16	0	24	my ($self, $id, $start, $stop, $strand) = @_;
309
310							# Quality values in a quality score can have 1 or 2 digits and are separated
311							# by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there
312							# is no easy way match the position of a quality value to its position in
313							# the quality string.
314							# As a consequence, if a subqual of the quality is requested, we still need
315							# to grab the full quality string first - performance penalty for big
316							# quality scores :(
317							# I think there is no way around starting at the begining of the quality
318							# score but maybe there is a resource-efficient way of starting at the
319							# begining of the quality score and stopping when the the position of the
320							# last quality value requested is reached??
321
322	16	50				30	$self->throw('Need to provide a sequence ID') if not defined $id;
323	16					33	($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
324
325							# Position in quality string
326	16					20	my $string_start = 1;
327	16					28	my $string_stop = $self->strlen($id);
328
329							# Fetch full quality string
330	16	50				30	my $fh = $self->_fh($id) or return;
331	16					36	my $filestart = $self->_calc_offset($id, $string_start);
332	16					23	my $filestop = $self->_calc_offset($id, $string_stop );
333	16					45	seek($fh, $filestart,0);
334	16					12	my $data;
335	16					79	read($fh, $data, $filestop-$filestart+1);
336
337							# Process quality score
338	16					28	Bio::DB::IndexedBase::_strip_crnl($data);
339	16					12	my $subqual = 0;
340	16	50	33			33	$subqual = 1 if ( $start \|\| $stop );
341	16					13	my @data;
342	16	50	33			31	if ( $subqual \|\| ($strand == -1) ) {
343	16					80	@data = split / /, $data, $stop+1;
344	16					16	my $length = scalar(@data);
345	16	50				24	$start = 1 if $start < 1;
346	16	50				20	$stop = $length if $stop > $length;
347	16	100				27	pop @data if ($stop != $length);
348	16					22	splice @data, 0, $start-1;
349	16	100				26	@data = reverse(@data) if $strand == -1;
350	16					32	$data = join ' ', @data;
351							} else {
352	0					0	@data = split / /, $data;
353							}
354
355	16					57	return \@data;
356							}
357
358							qual = quality = \&subqual;
359
360
361							=head2 header
362
363							Title : header
364							Usage : my $header = $db->header($id);
365							Function: Get the header line (ID and description fields) of the specified entry.
366							Returns : String
367							Args : ID of entry
368
369							=cut
370
371							sub header {
372	2			2	1	4	my ($self, $id) = @_;
373	2	50				5	$self->throw('Need to provide a sequence ID') if not defined $id;
374	2					7	my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
	2					4
375	2					5	$offset -= $headerlen;
376	2					2	my $data;
377	2	50				4	my $fh = $self->_fh($id) or return;
378	2					8	seek($fh, $offset, 0);
379	2					11	read($fh, $data, $headerlen);
380							# On Windows chomp remove '\n' but leaves '\r'
381							# when reading '\r\n' in binary mode,
382							# _strip_crnl removes both
383	2					5	$data = Bio::DB::IndexedBase::_strip_crnl($data);
384	2					4	substr($data, 0, 1) = '';
385	2					5	return $data;
386							}
387
388
389							#-------------------------------------------------------------
390							# Tied hash overrides
391							#
392
393							sub FETCH {
394	3			3		11	return shift->subqual(@_);
395							}
396
397
398							#-------------------------------------------------------------
399							# Bio::Seq::PrimaryQual compatibility
400							#
401							# Usage is the same as in Bio::Seq::PrimaryQual
402
403							package Bio::Seq::PrimaryQual::Qual;
404	1			1		6	use overload '""' => 'display_id';
	1					2
	1					7
405
406	1			1		70	use base qw(Bio::Root::Root Bio::Seq::PrimaryQual);
	1					2
	1					371
407
408							sub new {
409	7			7		14	my ($class, @args) = @_;
410	7					19	my $self = $class->SUPER::new(@args);
411	7					25	my ($db, $id, $start, $stop) = $self->_rearrange(
412							[qw(DATABASE ID START STOP)],
413							@args);
414	7					12	$self->{db} = $db;
415	7					9	$self->{id} = $id;
416	7		66			15	$self->{stop} = $stop \|\| $db->length($id);
417	7		66			19	$self->{start} = $start \|\| ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
418	7					16	return $self;
419							}
420
421
422							sub qual {
423	6			6		452	my $self = shift;
424	6					14	my $qual = $self->{db}->qual($self->{id}, $self->{start}, $self->{stop});
425	6					28	return $qual;
426							}
427
428
429							sub subqual {
430	2			2		3	my ($self, $start, $stop) = @_;
431	2					5	return $self->trunc($start, $stop)->qual;
432							}
433
434
435							sub trunc {
436							# Override Bio::Seq::QualI trunc() method. This way, we create an object
437							# that does not store the quality array in memory.
438	3			3		5	my ($self, $start, $stop) = @_;
439	3	50				7	$self->throw(
440							"$stop is smaller than $stop. If you want to truncate and reverse ".
441							"complement, you must call trunc followed by revcom."
442							) if $start > $stop;
443	3	50				7	if ($self->{start} <= $self->{stop}) {
444	3					5	$start = $self->{start}+$start-1;
445	3					3	$stop = $self->{start}+$stop-1;
446							} else {
447	0					0	$start = $self->{start}-($start-1);
448	0					0	$stop = $self->{start}-($stop-1);
449							}
450							my $obj = $self->new( -database => $self->{db},
451							-id => $self->{id},
452	3					14	-start => $start,
453							-stop => $stop
454							);
455	3					8	return $obj;
456							}
457
458
459							sub display_id {
460	8			8		222	my $self = shift;
461	8					20	return $self->{id};
462							}
463
464
465							sub primary_id {
466	1			1		2	my $self = shift;
467	1					4	return overload::StrVal($self);
468							}
469
470							sub revcom {
471							# Override Bio::QualI revcom() with optimized method.
472	1			1		2	my $self = shift;
473	1					2	return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
	1					3
474							}
475
476							sub length {
477							# Get length from quality location, not the quality array (too expensive)
478	3			3		4	my $self = shift;
479							return $self->{start} < $self->{stop} ?
480							$self->{stop} - $self->{start} + 1 :
481	3	100				18	$self->{start} - $self->{stop} + 1 ;
482							}
483
484
485							sub description {
486	1			1		2	my $self = shift;
487	1					3	my $header = $self->{'db'}->header($self->{id});
488							# remove the id from the header
489	1					5	$header = (split(/\s+/, $header, 2))[2];
490	1					3	return $header;
491							}
492							*desc = \&description;
493
494
495							1;