line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::DB::Qual |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Bio::DB::Qual - Fast indexed access to quality files |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use Bio::DB::Qual; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# create database from directory of qual files |
16
|
|
|
|
|
|
|
my $db = Bio::DB::Qual->new('/path/to/qual/files/'); |
17
|
|
|
|
|
|
|
my @ids = $db->get_all_primary_ids; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Simple access |
20
|
|
|
|
|
|
|
my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)}; |
21
|
|
|
|
|
|
|
my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)}; |
22
|
|
|
|
|
|
|
my $length = $db->length('CHROMOSOME_I'); |
23
|
|
|
|
|
|
|
my $header = $db->header('CHROMOSOME_I'); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Access to sequence objects. See Bio::PrimarySeqI. |
26
|
|
|
|
|
|
|
my $obj = $db->get_Qual_by_id('CHROMOSOME_I'); |
27
|
|
|
|
|
|
|
my @qual = @{$obj->qual}; |
28
|
|
|
|
|
|
|
my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)}; |
29
|
|
|
|
|
|
|
my $length = $obj->length; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# Loop through sequence objects |
32
|
|
|
|
|
|
|
my $stream = $db->get_PrimarySeq_stream; |
33
|
|
|
|
|
|
|
while (my $qual = $stream->next_seq) { |
34
|
|
|
|
|
|
|
# Bio::Seq::PrimaryQual operations |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Filehandle access |
38
|
|
|
|
|
|
|
my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/'); |
39
|
|
|
|
|
|
|
while (my $qual = <$fh>) { |
40
|
|
|
|
|
|
|
# Bio::Seq::PrimaryQual operations |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Tied hash access |
44
|
|
|
|
|
|
|
tie %qualities,'Bio::DB::Qual','/path/to/qual/files/'; |
45
|
|
|
|
|
|
|
print $qualities{'CHROMOSOME_I:1,20000'}; |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 DESCRIPTION |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
Bio::DB::Qual provides indexed access to a single Fasta file, several files, |
50
|
|
|
|
|
|
|
or a directory of files. It provides random access to each quality score entry |
51
|
|
|
|
|
|
|
without having to read the file from the beginning. Access to subqualities |
52
|
|
|
|
|
|
|
(portions of a quality score) is provided, although contrary to Bio::DB::Fasta, |
53
|
|
|
|
|
|
|
the full quality score has to be brought in memory. Bio::DB::Qual is based on |
54
|
|
|
|
|
|
|
Bio::DB::IndexedBase. See this module's documentation for details. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
The qual files should contain decimal quality scores. Entries may have any line |
57
|
|
|
|
|
|
|
length up to 65,536 characters, and different line lengths are allowed in the |
58
|
|
|
|
|
|
|
same file. However, within a quality score entry, all lines must be the same |
59
|
|
|
|
|
|
|
length except for the last. An error will be thrown if this is not the case. |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
The module uses /^E(\S+)/ to extract the primary ID of each quality score |
62
|
|
|
|
|
|
|
from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback |
63
|
|
|
|
|
|
|
routine to reversibly modify this primary ID, e.g. if you wish to extract a |
64
|
|
|
|
|
|
|
specific portion of the gi|gb|abc|xyz GenBank IDs. |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 DATABASE CREATION AND INDEXING |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
The object-oriented constructor is new(), the filehandle constructor is newFh() |
69
|
|
|
|
|
|
|
and the tied hash constructor is tie(). They all allow to index a single Fasta |
70
|
|
|
|
|
|
|
file, several files, or a directory of files. See Bio::DB::IndexedBase. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head1 SEE ALSO |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
L |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
L |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
L |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 LIMITATIONS |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
When a quality score is deleted from one of the qual files, this deletion is not |
83
|
|
|
|
|
|
|
detected by the module and removed from the index. As a result, a "ghost" entry |
84
|
|
|
|
|
|
|
will remain in the index and will return garbage results if accessed. Currently, |
85
|
|
|
|
|
|
|
the only way to accommodate deletions is to rebuild the entire index, either by |
86
|
|
|
|
|
|
|
deleting it manually, or by passing -reindex=E1 to new() when |
87
|
|
|
|
|
|
|
initializing the module. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
All quality score lines for a given quality score must have the same length |
90
|
|
|
|
|
|
|
except for the last (not sure why there is this limitation). This is not |
91
|
|
|
|
|
|
|
problematic for sequences but could be annoying for quality scores. A workaround |
92
|
|
|
|
|
|
|
is to make sure that your quality scores fit on no more than 2 lines. Another |
93
|
|
|
|
|
|
|
solution could be to padd them with blank spaces so that each line has the same |
94
|
|
|
|
|
|
|
number of characters (maybe this padding should be implemented in |
95
|
|
|
|
|
|
|
Bio::SeqIO::qual?). |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head1 AUTHOR |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Florent E Angly Eflorent . angly @ gmail-dot-comE. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein. |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
Copyright (c) 2007 Florent E Angly. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
106
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head1 APPENDIX |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
111
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
For BioPerl-style access, the following methods are provided: |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head2 get_Seq_by_id |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id, |
118
|
|
|
|
|
|
|
get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id, |
119
|
|
|
|
|
|
|
Usage : my $seq = $db->get_Seq_by_id($id); |
120
|
|
|
|
|
|
|
Function: Given an ID, fetch the corresponding sequence from the database. |
121
|
|
|
|
|
|
|
Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant) |
122
|
|
|
|
|
|
|
Note that to save resource, Bio::PrimarySeq::Fasta sequence objects |
123
|
|
|
|
|
|
|
only load the sequence string into memory when requested using seq(). |
124
|
|
|
|
|
|
|
See L for methods provided by the sequence objects |
125
|
|
|
|
|
|
|
returned from get_Seq_by_id() and get_PrimarySeq_stream(). |
126
|
|
|
|
|
|
|
Args : ID |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head2 get_PrimarySeq_stream |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Title : get_Seq_stream, get_PrimarySeq_stream |
131
|
|
|
|
|
|
|
Usage : my $stream = $db->get_Seq_stream(); |
132
|
|
|
|
|
|
|
Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a |
133
|
|
|
|
|
|
|
single method, next_seq(). Each call to next_seq() returns a new |
134
|
|
|
|
|
|
|
Bio::PrimarySeq::Fasta sequence object, until no more sequences remain. |
135
|
|
|
|
|
|
|
Returns : A Bio::DB::Indexed::Stream object |
136
|
|
|
|
|
|
|
Args : None |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
For simple access, the following methods are provided: |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=cut |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
package Bio::DB::Qual; |
146
|
|
|
|
|
|
|
|
147
|
1
|
|
|
1
|
|
3
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
21
|
|
148
|
1
|
|
|
1
|
|
359
|
use IO::File; |
|
1
|
|
|
|
|
637
|
|
|
1
|
|
|
|
|
83
|
|
149
|
1
|
|
|
1
|
|
5
|
use File::Spec; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
15
|
|
150
|
|
|
|
|
|
|
|
151
|
1
|
|
|
1
|
|
2
|
use base qw(Bio::DB::IndexedBase); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
410
|
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
our $obj_class = 'Bio::Seq::PrimaryQual::Qual'; |
154
|
|
|
|
|
|
|
our $file_glob = '*.{qual,QUAL,qa,QA}'; |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 new |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Title : new |
160
|
|
|
|
|
|
|
Usage : my $db = Bio::DB::Qual->new( $path, %options); |
161
|
|
|
|
|
|
|
Function: Initialize a new database object. When indexing a directory, files |
162
|
|
|
|
|
|
|
ending in .qual,qa are indexed by default. |
163
|
|
|
|
|
|
|
Returns : A new Bio::DB::Qual object |
164
|
|
|
|
|
|
|
Args : A single file, or path to dir, or arrayref of files |
165
|
|
|
|
|
|
|
Optional arguments: see Bio::DB::IndexedBase |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=cut |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub _calculate_offsets { |
171
|
|
|
|
|
|
|
# Bio::DB::IndexedBase calls this to calculate offsets |
172
|
9
|
|
|
9
|
|
10
|
my ($self, $fileno, $file, $offsets) = @_; |
173
|
|
|
|
|
|
|
|
174
|
9
|
50
|
|
|
|
31
|
my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!"); |
175
|
9
|
|
|
|
|
390
|
binmode $fh; |
176
|
9
|
50
|
|
|
|
14
|
warn "Indexing $file\n" if $self->{debug}; |
177
|
9
|
|
|
|
|
8
|
my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line, |
178
|
|
|
|
|
|
|
$numres, %offsets); |
179
|
9
|
|
|
|
|
11
|
my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0); |
180
|
|
|
|
|
|
|
|
181
|
9
|
|
|
|
|
9
|
my $termination_length = $self->{termination_length}; |
182
|
9
|
|
|
|
|
59
|
while (my $line = <$fh>) { |
183
|
|
|
|
|
|
|
# Account for crlf-terminated Windows files |
184
|
90
|
100
|
|
|
|
192
|
if (index($line, '>') == 0) { |
|
|
50
|
|
|
|
|
|
185
|
45
|
50
|
|
|
|
108
|
if ($line =~ /^>(\S+)/) { |
186
|
|
|
|
|
|
|
print STDERR "Indexed $count quality scores...\n" |
187
|
45
|
50
|
33
|
|
|
73
|
if $self->{debug} && (++$count%1000) == 0; |
188
|
45
|
|
|
|
|
79
|
$self->_check_linelength($linelen); |
189
|
45
|
|
|
|
|
42
|
my $pos = tell($fh); |
190
|
45
|
100
|
|
|
|
55
|
if (@ids) { |
191
|
36
|
|
|
|
|
37
|
my $strlen = $pos - $offset - length($line); |
192
|
36
|
|
|
|
|
28
|
$strlen -= $termination_length * $qual_lines; |
193
|
36
|
|
|
|
|
27
|
my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, |
|
36
|
|
|
|
|
53
|
|
194
|
|
|
|
|
|
|
$linelen, $headerlen, Bio::DB::IndexedBase::NA, $fileno); |
195
|
36
|
|
|
|
|
37
|
for my $id (@ids) { |
196
|
36
|
|
|
|
|
371
|
$offsets->{$id} = $ppos; |
197
|
|
|
|
|
|
|
} |
198
|
36
|
|
|
|
|
41
|
$numres = 0; |
199
|
|
|
|
|
|
|
} |
200
|
45
|
|
|
|
|
69
|
@ids = $self->_makeid($line); |
201
|
45
|
|
|
|
|
50
|
($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0); |
202
|
45
|
|
|
|
|
44
|
($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0); |
203
|
|
|
|
|
|
|
} else { |
204
|
|
|
|
|
|
|
# Catch bad header lines, bug 3172 |
205
|
0
|
|
|
|
|
0
|
$self->throw("FASTA header doesn't match '>(\\S+)': $line"); |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
} elsif ($line !~ /\S/) { |
208
|
|
|
|
|
|
|
# Skip blank line |
209
|
0
|
|
|
|
|
0
|
$blank_lines++; |
210
|
0
|
|
|
|
|
0
|
next; |
211
|
|
|
|
|
|
|
} else { |
212
|
|
|
|
|
|
|
# Need to check every line :( |
213
|
45
|
|
|
|
|
26
|
$l3_len = $l2_len; |
214
|
45
|
|
|
|
|
32
|
$l2_len = $l_len; |
215
|
45
|
|
|
|
|
28
|
$l_len = length $line; |
216
|
45
|
|
|
|
|
30
|
if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) { |
217
|
45
|
0
|
33
|
|
|
61
|
if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) { |
|
|
|
33
|
|
|
|
|
218
|
0
|
|
|
|
|
0
|
my $fap = substr($line, 0, 20).".."; |
219
|
0
|
|
|
|
|
0
|
$self->throw("Each line of the qual entry must be the same ". |
220
|
|
|
|
|
|
|
"length except the last. Line above #$. '$fap' is $l2_len". |
221
|
|
|
|
|
|
|
" != $l3_len chars."); |
222
|
|
|
|
|
|
|
} |
223
|
45
|
50
|
|
|
|
59
|
if ($blank_lines) { |
224
|
|
|
|
|
|
|
# Blank lines not allowed in entry |
225
|
0
|
|
|
|
|
0
|
$self->throw("Blank lines can only precede header lines, ". |
226
|
|
|
|
|
|
|
"found preceding line #$."); |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
} |
229
|
45
|
|
33
|
|
|
108
|
$linelen ||= length $line; |
230
|
45
|
|
|
|
|
25
|
$qual_lines++; |
231
|
45
|
|
|
|
|
655
|
$numres += scalar(split /\s+/, $line); |
232
|
|
|
|
|
|
|
} |
233
|
90
|
|
|
|
|
236
|
$last_line = $line; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# Process last entry |
237
|
9
|
|
|
|
|
18
|
$self->_check_linelength($linelen); |
238
|
9
|
|
|
|
|
9
|
my $pos = tell($fh); |
239
|
9
|
50
|
|
|
|
17
|
if (@ids) { |
240
|
9
|
|
|
|
|
10
|
my $strlen = $pos - $offset; |
241
|
9
|
50
|
|
|
|
10
|
if ($linelen == 0) { |
242
|
0
|
|
|
|
|
0
|
$strlen = 0; |
243
|
|
|
|
|
|
|
} else { |
244
|
9
|
50
|
|
|
|
22
|
if ($last_line !~ /\s$/) { |
245
|
0
|
|
|
|
|
0
|
$qual_lines--; |
246
|
|
|
|
|
|
|
} |
247
|
9
|
|
|
|
|
12
|
$strlen -= $termination_length * $qual_lines; |
248
|
|
|
|
|
|
|
} |
249
|
9
|
|
|
|
|
10
|
my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, $linelen, |
|
9
|
|
|
|
|
16
|
|
250
|
|
|
|
|
|
|
$headerlen, Bio::DB::IndexedBase::NA, $fileno); |
251
|
9
|
|
|
|
|
13
|
for my $id (@ids) { |
252
|
9
|
|
|
|
|
81
|
$offsets->{$id} = $ppos; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
9
|
|
|
|
|
73
|
return \%offsets; |
257
|
|
|
|
|
|
|
} |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# for backward compatibility |
261
|
|
|
|
|
|
|
sub get_PrimaryQual_stream { |
262
|
1
|
|
|
1
|
0
|
4
|
my $self = shift; |
263
|
1
|
|
|
|
|
7
|
return $self->get_PrimarySeq_stream; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# for backward compatibility |
268
|
|
|
|
|
|
|
sub get_Qual_by_id { |
269
|
2
|
|
|
2
|
0
|
2
|
my ($self, $id) = @_; |
270
|
2
|
|
|
|
|
9
|
return $self->get_Seq_by_id($id); |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
*get_qual_by_version = *get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id; |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=head2 qual |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Title : qual, quality, subqual |
279
|
|
|
|
|
|
|
Usage : # All quality scores |
280
|
|
|
|
|
|
|
my @qualarr = @{$qualdb->subqual($id)}; |
281
|
|
|
|
|
|
|
# Subset of the quality scores |
282
|
|
|
|
|
|
|
my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)}; |
283
|
|
|
|
|
|
|
# or... |
284
|
|
|
|
|
|
|
my @subqualarr = @{$qualdb->subqual($compound_id)}; |
285
|
|
|
|
|
|
|
Function: Get a subqual of an entry in the database. For your convenience, |
286
|
|
|
|
|
|
|
the sequence to extract can be specified with any of the following |
287
|
|
|
|
|
|
|
compound IDs: |
288
|
|
|
|
|
|
|
$db->qual("$id:$start,$stop") |
289
|
|
|
|
|
|
|
$db->qual("$id:$start..$stop") |
290
|
|
|
|
|
|
|
$db->qual("$id:$start-$stop") |
291
|
|
|
|
|
|
|
$db->qual("$id:$start,$stop/$strand") |
292
|
|
|
|
|
|
|
$db->qual("$id:$start..$stop/$strand") |
293
|
|
|
|
|
|
|
$db->qual("$id:$start-$stop/$strand") |
294
|
|
|
|
|
|
|
$db->qual("$id/$strand") |
295
|
|
|
|
|
|
|
If $stop is less than $start, then the reverse complement of the |
296
|
|
|
|
|
|
|
sequence is returned. Avoid using it if possible since this goes |
297
|
|
|
|
|
|
|
against Bio::Seq conventions. |
298
|
|
|
|
|
|
|
Returns : Reference to an array of quality scores |
299
|
|
|
|
|
|
|
Args : Compound ID of entry to retrieve |
300
|
|
|
|
|
|
|
or |
301
|
|
|
|
|
|
|
ID, optional start (defaults to 1), optional end (defaults to the |
302
|
|
|
|
|
|
|
number of quality scores for this sequence), and strand (defaults to |
303
|
|
|
|
|
|
|
1). |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=cut |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub subqual { |
308
|
16
|
|
|
16
|
0
|
23
|
my ($self, $id, $start, $stop, $strand) = @_; |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# Quality values in a quality score can have 1 or 2 digits and are separated |
311
|
|
|
|
|
|
|
# by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there |
312
|
|
|
|
|
|
|
# is no easy way match the position of a quality value to its position in |
313
|
|
|
|
|
|
|
# the quality string. |
314
|
|
|
|
|
|
|
# As a consequence, if a subqual of the quality is requested, we still need |
315
|
|
|
|
|
|
|
# to grab the full quality string first - performance penalty for big |
316
|
|
|
|
|
|
|
# quality scores :( |
317
|
|
|
|
|
|
|
# I think there is no way around starting at the begining of the quality |
318
|
|
|
|
|
|
|
# score but maybe there is a resource-efficient way of starting at the |
319
|
|
|
|
|
|
|
# begining of the quality score and stopping when the the position of the |
320
|
|
|
|
|
|
|
# last quality value requested is reached?? |
321
|
|
|
|
|
|
|
|
322
|
16
|
50
|
|
|
|
28
|
$self->throw('Need to provide a sequence ID') if not defined $id; |
323
|
16
|
|
|
|
|
34
|
($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand); |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
# Position in quality string |
326
|
16
|
|
|
|
|
17
|
my $string_start = 1; |
327
|
16
|
|
|
|
|
31
|
my $string_stop = $self->strlen($id); |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Fetch full quality string |
330
|
16
|
50
|
|
|
|
30
|
my $fh = $self->_fh($id) or return; |
331
|
16
|
|
|
|
|
36
|
my $filestart = $self->_calc_offset($id, $string_start); |
332
|
16
|
|
|
|
|
22
|
my $filestop = $self->_calc_offset($id, $string_stop ); |
333
|
16
|
|
|
|
|
44
|
seek($fh, $filestart,0); |
334
|
16
|
|
|
|
|
12
|
my $data; |
335
|
16
|
|
|
|
|
84
|
read($fh, $data, $filestop-$filestart+1); |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# Process quality score |
338
|
16
|
|
|
|
|
31
|
Bio::DB::IndexedBase::_strip_crnl($data); |
339
|
16
|
|
|
|
|
10
|
my $subqual = 0; |
340
|
16
|
50
|
33
|
|
|
28
|
$subqual = 1 if ( $start || $stop ); |
341
|
16
|
|
|
|
|
15
|
my @data; |
342
|
16
|
50
|
33
|
|
|
30
|
if ( $subqual || ($strand == -1) ) { |
343
|
16
|
|
|
|
|
76
|
@data = split / /, $data, $stop+1; |
344
|
16
|
|
|
|
|
15
|
my $length = scalar(@data); |
345
|
16
|
50
|
|
|
|
23
|
$start = 1 if $start < 1; |
346
|
16
|
50
|
|
|
|
19
|
$stop = $length if $stop > $length; |
347
|
16
|
100
|
|
|
|
23
|
pop @data if ($stop != $length); |
348
|
16
|
|
|
|
|
19
|
splice @data, 0, $start-1; |
349
|
16
|
100
|
|
|
|
31
|
@data = reverse(@data) if $strand == -1; |
350
|
16
|
|
|
|
|
30
|
$data = join ' ', @data; |
351
|
|
|
|
|
|
|
} else { |
352
|
0
|
|
|
|
|
0
|
@data = split / /, $data; |
353
|
|
|
|
|
|
|
} |
354
|
|
|
|
|
|
|
|
355
|
16
|
|
|
|
|
51
|
return \@data; |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
*qual = *quality = \&subqual; |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head2 header |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Title : header |
364
|
|
|
|
|
|
|
Usage : my $header = $db->header($id); |
365
|
|
|
|
|
|
|
Function: Get the header line (ID and description fields) of the specified entry. |
366
|
|
|
|
|
|
|
Returns : String |
367
|
|
|
|
|
|
|
Args : ID of entry |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=cut |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub header { |
372
|
2
|
|
|
2
|
1
|
3
|
my ($self, $id) = @_; |
373
|
2
|
50
|
|
|
|
5
|
$self->throw('Need to provide a sequence ID') if not defined $id; |
374
|
2
|
|
|
|
|
5
|
my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4]; |
|
2
|
|
|
|
|
5
|
|
375
|
2
|
|
|
|
|
4
|
$offset -= $headerlen; |
376
|
2
|
|
|
|
|
2
|
my $data; |
377
|
2
|
50
|
|
|
|
4
|
my $fh = $self->_fh($id) or return; |
378
|
2
|
|
|
|
|
30
|
seek($fh, $offset, 0); |
379
|
2
|
|
|
|
|
10
|
read($fh, $data, $headerlen); |
380
|
|
|
|
|
|
|
# On Windows chomp remove '\n' but leaves '\r' |
381
|
|
|
|
|
|
|
# when reading '\r\n' in binary mode, |
382
|
|
|
|
|
|
|
# _strip_crnl removes both |
383
|
2
|
|
|
|
|
5
|
$data = Bio::DB::IndexedBase::_strip_crnl($data); |
384
|
2
|
|
|
|
|
4
|
substr($data, 0, 1) = ''; |
385
|
2
|
|
|
|
|
5
|
return $data; |
386
|
|
|
|
|
|
|
} |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
#------------------------------------------------------------- |
390
|
|
|
|
|
|
|
# Tied hash overrides |
391
|
|
|
|
|
|
|
# |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
sub FETCH { |
394
|
3
|
|
|
3
|
|
9
|
return shift->subqual(@_); |
395
|
|
|
|
|
|
|
} |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
#------------------------------------------------------------- |
399
|
|
|
|
|
|
|
# Bio::Seq::PrimaryQual compatibility |
400
|
|
|
|
|
|
|
# |
401
|
|
|
|
|
|
|
# Usage is the same as in Bio::Seq::PrimaryQual |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
package Bio::Seq::PrimaryQual::Qual; |
404
|
1
|
|
|
1
|
|
4
|
use overload '""' => 'display_id'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
7
|
|
405
|
|
|
|
|
|
|
|
406
|
1
|
|
|
1
|
|
53
|
use base qw(Bio::Root::Root Bio::Seq::PrimaryQual); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
339
|
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
sub new { |
409
|
7
|
|
|
7
|
|
12
|
my ($class, @args) = @_; |
410
|
7
|
|
|
|
|
21
|
my $self = $class->SUPER::new(@args); |
411
|
7
|
|
|
|
|
23
|
my ($db, $id, $start, $stop) = $self->_rearrange( |
412
|
|
|
|
|
|
|
[qw(DATABASE ID START STOP)], |
413
|
|
|
|
|
|
|
@args); |
414
|
7
|
|
|
|
|
15
|
$self->{db} = $db; |
415
|
7
|
|
|
|
|
7
|
$self->{id} = $id; |
416
|
7
|
|
66
|
|
|
14
|
$self->{stop} = $stop || $db->length($id); |
417
|
7
|
|
66
|
|
|
18
|
$self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs |
418
|
7
|
|
|
|
|
14
|
return $self; |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
sub qual { |
423
|
6
|
|
|
6
|
|
457
|
my $self = shift; |
424
|
6
|
|
|
|
|
17
|
my $qual = $self->{db}->qual($self->{id}, $self->{start}, $self->{stop}); |
425
|
6
|
|
|
|
|
28
|
return $qual; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
sub subqual { |
430
|
2
|
|
|
2
|
|
3
|
my ($self, $start, $stop) = @_; |
431
|
2
|
|
|
|
|
4
|
return $self->trunc($start, $stop)->qual; |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
sub trunc { |
436
|
|
|
|
|
|
|
# Override Bio::Seq::QualI trunc() method. This way, we create an object |
437
|
|
|
|
|
|
|
# that does not store the quality array in memory. |
438
|
3
|
|
|
3
|
|
3
|
my ($self, $start, $stop) = @_; |
439
|
3
|
50
|
|
|
|
7
|
$self->throw( |
440
|
|
|
|
|
|
|
"$stop is smaller than $stop. If you want to truncate and reverse ". |
441
|
|
|
|
|
|
|
"complement, you must call trunc followed by revcom." |
442
|
|
|
|
|
|
|
) if $start > $stop; |
443
|
3
|
50
|
|
|
|
7
|
if ($self->{start} <= $self->{stop}) { |
444
|
3
|
|
|
|
|
4
|
$start = $self->{start}+$start-1; |
445
|
3
|
|
|
|
|
4
|
$stop = $self->{start}+$stop-1; |
446
|
|
|
|
|
|
|
} else { |
447
|
0
|
|
|
|
|
0
|
$start = $self->{start}-($start-1); |
448
|
0
|
|
|
|
|
0
|
$stop = $self->{start}-($stop-1); |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
my $obj = $self->new( -database => $self->{db}, |
451
|
|
|
|
|
|
|
-id => $self->{id}, |
452
|
3
|
|
|
|
|
12
|
-start => $start, |
453
|
|
|
|
|
|
|
-stop => $stop |
454
|
|
|
|
|
|
|
); |
455
|
3
|
|
|
|
|
8
|
return $obj; |
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub display_id { |
460
|
8
|
|
|
8
|
|
221
|
my $self = shift; |
461
|
8
|
|
|
|
|
19
|
return $self->{id}; |
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
sub primary_id { |
466
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
467
|
1
|
|
|
|
|
4
|
return overload::StrVal($self); |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
sub revcom { |
471
|
|
|
|
|
|
|
# Override Bio::QualI revcom() with optimized method. |
472
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
473
|
1
|
|
|
|
|
2
|
return $self->new(@{$self}{'db', 'id', 'stop', 'start'}); |
|
1
|
|
|
|
|
3
|
|
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub length { |
477
|
|
|
|
|
|
|
# Get length from quality location, not the quality array (too expensive) |
478
|
3
|
|
|
3
|
|
6
|
my $self = shift; |
479
|
|
|
|
|
|
|
return $self->{start} < $self->{stop} ? |
480
|
|
|
|
|
|
|
$self->{stop} - $self->{start} + 1 : |
481
|
3
|
100
|
|
|
|
14
|
$self->{start} - $self->{stop} + 1 ; |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
sub description { |
486
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
487
|
1
|
|
|
|
|
3
|
my $header = $self->{'db'}->header($self->{id}); |
488
|
|
|
|
|
|
|
# remove the id from the header |
489
|
1
|
|
|
|
|
4
|
$header = (split(/\s+/, $header, 2))[2]; |
490
|
1
|
|
|
|
|
3
|
return $header; |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
*desc = \&description; |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
1; |