| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# BioPerl module for Bio::DB::Qual |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Bio::DB::Qual - Fast indexed access to quality files |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use Bio::DB::Qual; |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# create database from directory of qual files |
|
16
|
|
|
|
|
|
|
my $db = Bio::DB::Qual->new('/path/to/qual/files/'); |
|
17
|
|
|
|
|
|
|
my @ids = $db->get_all_primary_ids; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Simple access |
|
20
|
|
|
|
|
|
|
my @qualarr = @{$db->qual('CHROMOSOME_I',4_000_000 => 4_100_000)}; |
|
21
|
|
|
|
|
|
|
my @revqual = @{$db->qual('CHROMOSOME_I',4_100_000 => 4_000_000)}; |
|
22
|
|
|
|
|
|
|
my $length = $db->length('CHROMOSOME_I'); |
|
23
|
|
|
|
|
|
|
my $header = $db->header('CHROMOSOME_I'); |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Access to sequence objects. See Bio::PrimarySeqI. |
|
26
|
|
|
|
|
|
|
my $obj = $db->get_Qual_by_id('CHROMOSOME_I'); |
|
27
|
|
|
|
|
|
|
my @qual = @{$obj->qual}; |
|
28
|
|
|
|
|
|
|
my @subqual = @{$obj->subqual(4_000_000 => 4_100_000)}; |
|
29
|
|
|
|
|
|
|
my $length = $obj->length; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# Loop through sequence objects |
|
32
|
|
|
|
|
|
|
my $stream = $db->get_PrimarySeq_stream; |
|
33
|
|
|
|
|
|
|
while (my $qual = $stream->next_seq) { |
|
34
|
|
|
|
|
|
|
# Bio::Seq::PrimaryQual operations |
|
35
|
|
|
|
|
|
|
} |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Filehandle access |
|
38
|
|
|
|
|
|
|
my $fh = Bio::DB::Qual->newFh('/path/to/qual/files/'); |
|
39
|
|
|
|
|
|
|
while (my $qual = <$fh>) { |
|
40
|
|
|
|
|
|
|
# Bio::Seq::PrimaryQual operations |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Tied hash access |
|
44
|
|
|
|
|
|
|
tie %qualities,'Bio::DB::Qual','/path/to/qual/files/'; |
|
45
|
|
|
|
|
|
|
print $qualities{'CHROMOSOME_I:1,20000'}; |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
Bio::DB::Qual provides indexed access to a single Fasta file, several files, |
|
50
|
|
|
|
|
|
|
or a directory of files. It provides random access to each quality score entry |
|
51
|
|
|
|
|
|
|
without having to read the file from the beginning. Access to subqualities |
|
52
|
|
|
|
|
|
|
(portions of a quality score) is provided, although contrary to Bio::DB::Fasta, |
|
53
|
|
|
|
|
|
|
the full quality score has to be brought in memory. Bio::DB::Qual is based on |
|
54
|
|
|
|
|
|
|
Bio::DB::IndexedBase. See this module's documentation for details. |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
The qual files should contain decimal quality scores. Entries may have any line |
|
57
|
|
|
|
|
|
|
length up to 65,536 characters, and different line lengths are allowed in the |
|
58
|
|
|
|
|
|
|
same file. However, within a quality score entry, all lines must be the same |
|
59
|
|
|
|
|
|
|
length except for the last. An error will be thrown if this is not the case. |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
The module uses /^E(\S+)/ to extract the primary ID of each quality score |
|
62
|
|
|
|
|
|
|
from the qual header. See -makeid in Bio::DB::IndexedBase to pass a callback |
|
63
|
|
|
|
|
|
|
routine to reversibly modify this primary ID, e.g. if you wish to extract a |
|
64
|
|
|
|
|
|
|
specific portion of the gi|gb|abc|xyz GenBank IDs. |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head1 DATABASE CREATION AND INDEXING |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
The object-oriented constructor is new(), the filehandle constructor is newFh() |
|
69
|
|
|
|
|
|
|
and the tied hash constructor is tie(). They all allow to index a single Fasta |
|
70
|
|
|
|
|
|
|
file, several files, or a directory of files. See Bio::DB::IndexedBase. |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
L |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
L |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
L |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 LIMITATIONS |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
When a quality score is deleted from one of the qual files, this deletion is not |
|
83
|
|
|
|
|
|
|
detected by the module and removed from the index. As a result, a "ghost" entry |
|
84
|
|
|
|
|
|
|
will remain in the index and will return garbage results if accessed. Currently, |
|
85
|
|
|
|
|
|
|
the only way to accommodate deletions is to rebuild the entire index, either by |
|
86
|
|
|
|
|
|
|
deleting it manually, or by passing -reindex=E1 to new() when |
|
87
|
|
|
|
|
|
|
initializing the module. |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
All quality score lines for a given quality score must have the same length |
|
90
|
|
|
|
|
|
|
except for the last (not sure why there is this limitation). This is not |
|
91
|
|
|
|
|
|
|
problematic for sequences but could be annoying for quality scores. A workaround |
|
92
|
|
|
|
|
|
|
is to make sure that your quality scores fit on no more than 2 lines. Another |
|
93
|
|
|
|
|
|
|
solution could be to padd them with blank spaces so that each line has the same |
|
94
|
|
|
|
|
|
|
number of characters (maybe this padding should be implemented in |
|
95
|
|
|
|
|
|
|
Bio::SeqIO::qual?). |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head1 AUTHOR |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Florent E Angly Eflorent . angly @ gmail-dot-comE. |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Module largely based on and adapted from Bio::DB::Fasta by Lincoln Stein. |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
Copyright (c) 2007 Florent E Angly. |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
|
106
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head1 APPENDIX |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
|
111
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
For BioPerl-style access, the following methods are provided: |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head2 get_Seq_by_id |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id, |
|
118
|
|
|
|
|
|
|
get_Qual_by_id, get_qual_by_acc, get_qual_by_version, get_qual_by_primary_id, |
|
119
|
|
|
|
|
|
|
Usage : my $seq = $db->get_Seq_by_id($id); |
|
120
|
|
|
|
|
|
|
Function: Given an ID, fetch the corresponding sequence from the database. |
|
121
|
|
|
|
|
|
|
Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant) |
|
122
|
|
|
|
|
|
|
Note that to save resource, Bio::PrimarySeq::Fasta sequence objects |
|
123
|
|
|
|
|
|
|
only load the sequence string into memory when requested using seq(). |
|
124
|
|
|
|
|
|
|
See L for methods provided by the sequence objects |
|
125
|
|
|
|
|
|
|
returned from get_Seq_by_id() and get_PrimarySeq_stream(). |
|
126
|
|
|
|
|
|
|
Args : ID |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head2 get_PrimarySeq_stream |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Title : get_Seq_stream, get_PrimarySeq_stream |
|
131
|
|
|
|
|
|
|
Usage : my $stream = $db->get_Seq_stream(); |
|
132
|
|
|
|
|
|
|
Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a |
|
133
|
|
|
|
|
|
|
single method, next_seq(). Each call to next_seq() returns a new |
|
134
|
|
|
|
|
|
|
Bio::PrimarySeq::Fasta sequence object, until no more sequences remain. |
|
135
|
|
|
|
|
|
|
Returns : A Bio::DB::Indexed::Stream object |
|
136
|
|
|
|
|
|
|
Args : None |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head1 |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
For simple access, the following methods are provided: |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=cut |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
package Bio::DB::Qual; |
|
146
|
|
|
|
|
|
|
|
|
147
|
1
|
|
|
1
|
|
3
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
22
|
|
|
148
|
1
|
|
|
1
|
|
373
|
use IO::File; |
|
|
1
|
|
|
|
|
709
|
|
|
|
1
|
|
|
|
|
90
|
|
|
149
|
1
|
|
|
1
|
|
5
|
use File::Spec; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
16
|
|
|
150
|
|
|
|
|
|
|
|
|
151
|
1
|
|
|
1
|
|
3
|
use base qw(Bio::DB::IndexedBase); |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
398
|
|
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
our $obj_class = 'Bio::Seq::PrimaryQual::Qual'; |
|
154
|
|
|
|
|
|
|
our $file_glob = '*.{qual,QUAL,qa,QA}'; |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head2 new |
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Title : new |
|
160
|
|
|
|
|
|
|
Usage : my $db = Bio::DB::Qual->new( $path, %options); |
|
161
|
|
|
|
|
|
|
Function: Initialize a new database object. When indexing a directory, files |
|
162
|
|
|
|
|
|
|
ending in .qual,qa are indexed by default. |
|
163
|
|
|
|
|
|
|
Returns : A new Bio::DB::Qual object |
|
164
|
|
|
|
|
|
|
Args : A single file, or path to dir, or arrayref of files |
|
165
|
|
|
|
|
|
|
Optional arguments: see Bio::DB::IndexedBase |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=cut |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
sub _calculate_offsets { |
|
171
|
|
|
|
|
|
|
# Bio::DB::IndexedBase calls this to calculate offsets |
|
172
|
9
|
|
|
9
|
|
10
|
my ($self, $fileno, $file, $offsets) = @_; |
|
173
|
|
|
|
|
|
|
|
|
174
|
9
|
50
|
|
|
|
29
|
my $fh = IO::File->new($file) or $self->throw("Could not open $file: $!"); |
|
175
|
9
|
|
|
|
|
395
|
binmode $fh; |
|
176
|
9
|
50
|
|
|
|
19
|
warn "Indexing $file\n" if $self->{debug}; |
|
177
|
9
|
|
|
|
|
7
|
my ($offset, @ids, $linelen, $headerlen, $count, $qual_lines, $last_line, |
|
178
|
|
|
|
|
|
|
$numres, %offsets); |
|
179
|
9
|
|
|
|
|
10
|
my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0); |
|
180
|
|
|
|
|
|
|
|
|
181
|
9
|
|
|
|
|
9
|
my $termination_length = $self->{termination_length}; |
|
182
|
9
|
|
|
|
|
64
|
while (my $line = <$fh>) { |
|
183
|
|
|
|
|
|
|
# Account for crlf-terminated Windows files |
|
184
|
90
|
100
|
|
|
|
208
|
if (index($line, '>') == 0) { |
|
|
|
50
|
|
|
|
|
|
|
185
|
45
|
50
|
|
|
|
101
|
if ($line =~ /^>(\S+)/) { |
|
186
|
|
|
|
|
|
|
print STDERR "Indexed $count quality scores...\n" |
|
187
|
45
|
50
|
33
|
|
|
82
|
if $self->{debug} && (++$count%1000) == 0; |
|
188
|
45
|
|
|
|
|
80
|
$self->_check_linelength($linelen); |
|
189
|
45
|
|
|
|
|
45
|
my $pos = tell($fh); |
|
190
|
45
|
100
|
|
|
|
55
|
if (@ids) { |
|
191
|
36
|
|
|
|
|
32
|
my $strlen = $pos - $offset - length($line); |
|
192
|
36
|
|
|
|
|
27
|
$strlen -= $termination_length * $qual_lines; |
|
193
|
36
|
|
|
|
|
32
|
my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, |
|
|
36
|
|
|
|
|
55
|
|
|
194
|
|
|
|
|
|
|
$linelen, $headerlen, Bio::DB::IndexedBase::NA, $fileno); |
|
195
|
36
|
|
|
|
|
44
|
for my $id (@ids) { |
|
196
|
36
|
|
|
|
|
395
|
$offsets->{$id} = $ppos; |
|
197
|
|
|
|
|
|
|
} |
|
198
|
36
|
|
|
|
|
40
|
$numres = 0; |
|
199
|
|
|
|
|
|
|
} |
|
200
|
45
|
|
|
|
|
82
|
@ids = $self->_makeid($line); |
|
201
|
45
|
|
|
|
|
47
|
($offset, $headerlen, $linelen, $qual_lines) = ($pos, length $line, 0, 0); |
|
202
|
45
|
|
|
|
|
45
|
($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0); |
|
203
|
|
|
|
|
|
|
} else { |
|
204
|
|
|
|
|
|
|
# Catch bad header lines, bug 3172 |
|
205
|
0
|
|
|
|
|
0
|
$self->throw("FASTA header doesn't match '>(\\S+)': $line"); |
|
206
|
|
|
|
|
|
|
} |
|
207
|
|
|
|
|
|
|
} elsif ($line !~ /\S/) { |
|
208
|
|
|
|
|
|
|
# Skip blank line |
|
209
|
0
|
|
|
|
|
0
|
$blank_lines++; |
|
210
|
0
|
|
|
|
|
0
|
next; |
|
211
|
|
|
|
|
|
|
} else { |
|
212
|
|
|
|
|
|
|
# Need to check every line :( |
|
213
|
45
|
|
|
|
|
35
|
$l3_len = $l2_len; |
|
214
|
45
|
|
|
|
|
30
|
$l2_len = $l_len; |
|
215
|
45
|
|
|
|
|
24
|
$l_len = length $line; |
|
216
|
45
|
|
|
|
|
28
|
if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) { |
|
217
|
45
|
0
|
33
|
|
|
68
|
if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) { |
|
|
|
|
33
|
|
|
|
|
|
218
|
0
|
|
|
|
|
0
|
my $fap = substr($line, 0, 20).".."; |
|
219
|
0
|
|
|
|
|
0
|
$self->throw("Each line of the qual entry must be the same ". |
|
220
|
|
|
|
|
|
|
"length except the last. Line above #$. '$fap' is $l2_len". |
|
221
|
|
|
|
|
|
|
" != $l3_len chars."); |
|
222
|
|
|
|
|
|
|
} |
|
223
|
45
|
50
|
|
|
|
58
|
if ($blank_lines) { |
|
224
|
|
|
|
|
|
|
# Blank lines not allowed in entry |
|
225
|
0
|
|
|
|
|
0
|
$self->throw("Blank lines can only precede header lines, ". |
|
226
|
|
|
|
|
|
|
"found preceding line #$."); |
|
227
|
|
|
|
|
|
|
} |
|
228
|
|
|
|
|
|
|
} |
|
229
|
45
|
|
33
|
|
|
111
|
$linelen ||= length $line; |
|
230
|
45
|
|
|
|
|
29
|
$qual_lines++; |
|
231
|
45
|
|
|
|
|
694
|
$numres += scalar(split /\s+/, $line); |
|
232
|
|
|
|
|
|
|
} |
|
233
|
90
|
|
|
|
|
244
|
$last_line = $line; |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# Process last entry |
|
237
|
9
|
|
|
|
|
16
|
$self->_check_linelength($linelen); |
|
238
|
9
|
|
|
|
|
9
|
my $pos = tell($fh); |
|
239
|
9
|
50
|
|
|
|
15
|
if (@ids) { |
|
240
|
9
|
|
|
|
|
8
|
my $strlen = $pos - $offset; |
|
241
|
9
|
50
|
|
|
|
12
|
if ($linelen == 0) { |
|
242
|
0
|
|
|
|
|
0
|
$strlen = 0; |
|
243
|
|
|
|
|
|
|
} else { |
|
244
|
9
|
50
|
|
|
|
24
|
if ($last_line !~ /\s$/) { |
|
245
|
0
|
|
|
|
|
0
|
$qual_lines--; |
|
246
|
|
|
|
|
|
|
} |
|
247
|
9
|
|
|
|
|
9
|
$strlen -= $termination_length * $qual_lines; |
|
248
|
|
|
|
|
|
|
} |
|
249
|
9
|
|
|
|
|
9
|
my $ppos = &{$self->{packmeth}}($offset, $strlen, $numres, $linelen, |
|
|
9
|
|
|
|
|
16
|
|
|
250
|
|
|
|
|
|
|
$headerlen, Bio::DB::IndexedBase::NA, $fileno); |
|
251
|
9
|
|
|
|
|
13
|
for my $id (@ids) { |
|
252
|
9
|
|
|
|
|
83
|
$offsets->{$id} = $ppos; |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
|
|
256
|
9
|
|
|
|
|
74
|
return \%offsets; |
|
257
|
|
|
|
|
|
|
} |
|
258
|
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# for backward compatibility |
|
261
|
|
|
|
|
|
|
sub get_PrimaryQual_stream { |
|
262
|
1
|
|
|
1
|
0
|
4
|
my $self = shift; |
|
263
|
1
|
|
|
|
|
8
|
return $self->get_PrimarySeq_stream; |
|
264
|
|
|
|
|
|
|
} |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# for backward compatibility |
|
268
|
|
|
|
|
|
|
sub get_Qual_by_id { |
|
269
|
2
|
|
|
2
|
0
|
4
|
my ($self, $id) = @_; |
|
270
|
2
|
|
|
|
|
8
|
return $self->get_Seq_by_id($id); |
|
271
|
|
|
|
|
|
|
} |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
*get_qual_by_version = *get_qual_by_primary_id = *get_qual_by_acc = \&get_Qual_by_id; |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=head2 qual |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Title : qual, quality, subqual |
|
279
|
|
|
|
|
|
|
Usage : # All quality scores |
|
280
|
|
|
|
|
|
|
my @qualarr = @{$qualdb->subqual($id)}; |
|
281
|
|
|
|
|
|
|
# Subset of the quality scores |
|
282
|
|
|
|
|
|
|
my @subqualarr = @{$qualdb->subqual($id, $start, $stop, $strand)}; |
|
283
|
|
|
|
|
|
|
# or... |
|
284
|
|
|
|
|
|
|
my @subqualarr = @{$qualdb->subqual($compound_id)}; |
|
285
|
|
|
|
|
|
|
Function: Get a subqual of an entry in the database. For your convenience, |
|
286
|
|
|
|
|
|
|
the sequence to extract can be specified with any of the following |
|
287
|
|
|
|
|
|
|
compound IDs: |
|
288
|
|
|
|
|
|
|
$db->qual("$id:$start,$stop") |
|
289
|
|
|
|
|
|
|
$db->qual("$id:$start..$stop") |
|
290
|
|
|
|
|
|
|
$db->qual("$id:$start-$stop") |
|
291
|
|
|
|
|
|
|
$db->qual("$id:$start,$stop/$strand") |
|
292
|
|
|
|
|
|
|
$db->qual("$id:$start..$stop/$strand") |
|
293
|
|
|
|
|
|
|
$db->qual("$id:$start-$stop/$strand") |
|
294
|
|
|
|
|
|
|
$db->qual("$id/$strand") |
|
295
|
|
|
|
|
|
|
If $stop is less than $start, then the reverse complement of the |
|
296
|
|
|
|
|
|
|
sequence is returned. Avoid using it if possible since this goes |
|
297
|
|
|
|
|
|
|
against Bio::Seq conventions. |
|
298
|
|
|
|
|
|
|
Returns : Reference to an array of quality scores |
|
299
|
|
|
|
|
|
|
Args : Compound ID of entry to retrieve |
|
300
|
|
|
|
|
|
|
or |
|
301
|
|
|
|
|
|
|
ID, optional start (defaults to 1), optional end (defaults to the |
|
302
|
|
|
|
|
|
|
number of quality scores for this sequence), and strand (defaults to |
|
303
|
|
|
|
|
|
|
1). |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=cut |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
sub subqual { |
|
308
|
16
|
|
|
16
|
0
|
24
|
my ($self, $id, $start, $stop, $strand) = @_; |
|
309
|
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# Quality values in a quality score can have 1 or 2 digits and are separated |
|
311
|
|
|
|
|
|
|
# by one (or several?) spaces. Thus contrary to Bio::DB::Fasta, here there |
|
312
|
|
|
|
|
|
|
# is no easy way match the position of a quality value to its position in |
|
313
|
|
|
|
|
|
|
# the quality string. |
|
314
|
|
|
|
|
|
|
# As a consequence, if a subqual of the quality is requested, we still need |
|
315
|
|
|
|
|
|
|
# to grab the full quality string first - performance penalty for big |
|
316
|
|
|
|
|
|
|
# quality scores :( |
|
317
|
|
|
|
|
|
|
# I think there is no way around starting at the begining of the quality |
|
318
|
|
|
|
|
|
|
# score but maybe there is a resource-efficient way of starting at the |
|
319
|
|
|
|
|
|
|
# begining of the quality score and stopping when the the position of the |
|
320
|
|
|
|
|
|
|
# last quality value requested is reached?? |
|
321
|
|
|
|
|
|
|
|
|
322
|
16
|
50
|
|
|
|
30
|
$self->throw('Need to provide a sequence ID') if not defined $id; |
|
323
|
16
|
|
|
|
|
33
|
($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand); |
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
# Position in quality string |
|
326
|
16
|
|
|
|
|
20
|
my $string_start = 1; |
|
327
|
16
|
|
|
|
|
28
|
my $string_stop = $self->strlen($id); |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Fetch full quality string |
|
330
|
16
|
50
|
|
|
|
30
|
my $fh = $self->_fh($id) or return; |
|
331
|
16
|
|
|
|
|
36
|
my $filestart = $self->_calc_offset($id, $string_start); |
|
332
|
16
|
|
|
|
|
23
|
my $filestop = $self->_calc_offset($id, $string_stop ); |
|
333
|
16
|
|
|
|
|
45
|
seek($fh, $filestart,0); |
|
334
|
16
|
|
|
|
|
12
|
my $data; |
|
335
|
16
|
|
|
|
|
79
|
read($fh, $data, $filestop-$filestart+1); |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# Process quality score |
|
338
|
16
|
|
|
|
|
28
|
Bio::DB::IndexedBase::_strip_crnl($data); |
|
339
|
16
|
|
|
|
|
12
|
my $subqual = 0; |
|
340
|
16
|
50
|
33
|
|
|
33
|
$subqual = 1 if ( $start || $stop ); |
|
341
|
16
|
|
|
|
|
13
|
my @data; |
|
342
|
16
|
50
|
33
|
|
|
31
|
if ( $subqual || ($strand == -1) ) { |
|
343
|
16
|
|
|
|
|
80
|
@data = split / /, $data, $stop+1; |
|
344
|
16
|
|
|
|
|
16
|
my $length = scalar(@data); |
|
345
|
16
|
50
|
|
|
|
24
|
$start = 1 if $start < 1; |
|
346
|
16
|
50
|
|
|
|
20
|
$stop = $length if $stop > $length; |
|
347
|
16
|
100
|
|
|
|
27
|
pop @data if ($stop != $length); |
|
348
|
16
|
|
|
|
|
22
|
splice @data, 0, $start-1; |
|
349
|
16
|
100
|
|
|
|
26
|
@data = reverse(@data) if $strand == -1; |
|
350
|
16
|
|
|
|
|
32
|
$data = join ' ', @data; |
|
351
|
|
|
|
|
|
|
} else { |
|
352
|
0
|
|
|
|
|
0
|
@data = split / /, $data; |
|
353
|
|
|
|
|
|
|
} |
|
354
|
|
|
|
|
|
|
|
|
355
|
16
|
|
|
|
|
57
|
return \@data; |
|
356
|
|
|
|
|
|
|
} |
|
357
|
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
*qual = *quality = \&subqual; |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head2 header |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Title : header |
|
364
|
|
|
|
|
|
|
Usage : my $header = $db->header($id); |
|
365
|
|
|
|
|
|
|
Function: Get the header line (ID and description fields) of the specified entry. |
|
366
|
|
|
|
|
|
|
Returns : String |
|
367
|
|
|
|
|
|
|
Args : ID of entry |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=cut |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
sub header { |
|
372
|
2
|
|
|
2
|
1
|
4
|
my ($self, $id) = @_; |
|
373
|
2
|
50
|
|
|
|
5
|
$self->throw('Need to provide a sequence ID') if not defined $id; |
|
374
|
2
|
|
|
|
|
7
|
my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4]; |
|
|
2
|
|
|
|
|
4
|
|
|
375
|
2
|
|
|
|
|
5
|
$offset -= $headerlen; |
|
376
|
2
|
|
|
|
|
2
|
my $data; |
|
377
|
2
|
50
|
|
|
|
4
|
my $fh = $self->_fh($id) or return; |
|
378
|
2
|
|
|
|
|
8
|
seek($fh, $offset, 0); |
|
379
|
2
|
|
|
|
|
11
|
read($fh, $data, $headerlen); |
|
380
|
|
|
|
|
|
|
# On Windows chomp remove '\n' but leaves '\r' |
|
381
|
|
|
|
|
|
|
# when reading '\r\n' in binary mode, |
|
382
|
|
|
|
|
|
|
# _strip_crnl removes both |
|
383
|
2
|
|
|
|
|
5
|
$data = Bio::DB::IndexedBase::_strip_crnl($data); |
|
384
|
2
|
|
|
|
|
4
|
substr($data, 0, 1) = ''; |
|
385
|
2
|
|
|
|
|
5
|
return $data; |
|
386
|
|
|
|
|
|
|
} |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
390
|
|
|
|
|
|
|
# Tied hash overrides |
|
391
|
|
|
|
|
|
|
# |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
sub FETCH { |
|
394
|
3
|
|
|
3
|
|
11
|
return shift->subqual(@_); |
|
395
|
|
|
|
|
|
|
} |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
#------------------------------------------------------------- |
|
399
|
|
|
|
|
|
|
# Bio::Seq::PrimaryQual compatibility |
|
400
|
|
|
|
|
|
|
# |
|
401
|
|
|
|
|
|
|
# Usage is the same as in Bio::Seq::PrimaryQual |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
package Bio::Seq::PrimaryQual::Qual; |
|
404
|
1
|
|
|
1
|
|
6
|
use overload '""' => 'display_id'; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
7
|
|
|
405
|
|
|
|
|
|
|
|
|
406
|
1
|
|
|
1
|
|
70
|
use base qw(Bio::Root::Root Bio::Seq::PrimaryQual); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
371
|
|
|
407
|
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
sub new { |
|
409
|
7
|
|
|
7
|
|
14
|
my ($class, @args) = @_; |
|
410
|
7
|
|
|
|
|
19
|
my $self = $class->SUPER::new(@args); |
|
411
|
7
|
|
|
|
|
25
|
my ($db, $id, $start, $stop) = $self->_rearrange( |
|
412
|
|
|
|
|
|
|
[qw(DATABASE ID START STOP)], |
|
413
|
|
|
|
|
|
|
@args); |
|
414
|
7
|
|
|
|
|
12
|
$self->{db} = $db; |
|
415
|
7
|
|
|
|
|
9
|
$self->{id} = $id; |
|
416
|
7
|
|
66
|
|
|
15
|
$self->{stop} = $stop || $db->length($id); |
|
417
|
7
|
|
66
|
|
|
19
|
$self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs |
|
418
|
7
|
|
|
|
|
16
|
return $self; |
|
419
|
|
|
|
|
|
|
} |
|
420
|
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
sub qual { |
|
423
|
6
|
|
|
6
|
|
452
|
my $self = shift; |
|
424
|
6
|
|
|
|
|
14
|
my $qual = $self->{db}->qual($self->{id}, $self->{start}, $self->{stop}); |
|
425
|
6
|
|
|
|
|
28
|
return $qual; |
|
426
|
|
|
|
|
|
|
} |
|
427
|
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
sub subqual { |
|
430
|
2
|
|
|
2
|
|
3
|
my ($self, $start, $stop) = @_; |
|
431
|
2
|
|
|
|
|
5
|
return $self->trunc($start, $stop)->qual; |
|
432
|
|
|
|
|
|
|
} |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
sub trunc { |
|
436
|
|
|
|
|
|
|
# Override Bio::Seq::QualI trunc() method. This way, we create an object |
|
437
|
|
|
|
|
|
|
# that does not store the quality array in memory. |
|
438
|
3
|
|
|
3
|
|
5
|
my ($self, $start, $stop) = @_; |
|
439
|
3
|
50
|
|
|
|
7
|
$self->throw( |
|
440
|
|
|
|
|
|
|
"$stop is smaller than $stop. If you want to truncate and reverse ". |
|
441
|
|
|
|
|
|
|
"complement, you must call trunc followed by revcom." |
|
442
|
|
|
|
|
|
|
) if $start > $stop; |
|
443
|
3
|
50
|
|
|
|
7
|
if ($self->{start} <= $self->{stop}) { |
|
444
|
3
|
|
|
|
|
5
|
$start = $self->{start}+$start-1; |
|
445
|
3
|
|
|
|
|
3
|
$stop = $self->{start}+$stop-1; |
|
446
|
|
|
|
|
|
|
} else { |
|
447
|
0
|
|
|
|
|
0
|
$start = $self->{start}-($start-1); |
|
448
|
0
|
|
|
|
|
0
|
$stop = $self->{start}-($stop-1); |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
my $obj = $self->new( -database => $self->{db}, |
|
451
|
|
|
|
|
|
|
-id => $self->{id}, |
|
452
|
3
|
|
|
|
|
14
|
-start => $start, |
|
453
|
|
|
|
|
|
|
-stop => $stop |
|
454
|
|
|
|
|
|
|
); |
|
455
|
3
|
|
|
|
|
8
|
return $obj; |
|
456
|
|
|
|
|
|
|
} |
|
457
|
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub display_id { |
|
460
|
8
|
|
|
8
|
|
222
|
my $self = shift; |
|
461
|
8
|
|
|
|
|
20
|
return $self->{id}; |
|
462
|
|
|
|
|
|
|
} |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
sub primary_id { |
|
466
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
|
467
|
1
|
|
|
|
|
4
|
return overload::StrVal($self); |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
sub revcom { |
|
471
|
|
|
|
|
|
|
# Override Bio::QualI revcom() with optimized method. |
|
472
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
|
473
|
1
|
|
|
|
|
2
|
return $self->new(@{$self}{'db', 'id', 'stop', 'start'}); |
|
|
1
|
|
|
|
|
3
|
|
|
474
|
|
|
|
|
|
|
} |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub length { |
|
477
|
|
|
|
|
|
|
# Get length from quality location, not the quality array (too expensive) |
|
478
|
3
|
|
|
3
|
|
4
|
my $self = shift; |
|
479
|
|
|
|
|
|
|
return $self->{start} < $self->{stop} ? |
|
480
|
|
|
|
|
|
|
$self->{stop} - $self->{start} + 1 : |
|
481
|
3
|
100
|
|
|
|
18
|
$self->{start} - $self->{stop} + 1 ; |
|
482
|
|
|
|
|
|
|
} |
|
483
|
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
sub description { |
|
486
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
|
487
|
1
|
|
|
|
|
3
|
my $header = $self->{'db'}->header($self->{id}); |
|
488
|
|
|
|
|
|
|
# remove the id from the header |
|
489
|
1
|
|
|
|
|
5
|
$header = (split(/\s+/, $header, 2))[2]; |
|
490
|
1
|
|
|
|
|
3
|
return $header; |
|
491
|
|
|
|
|
|
|
} |
|
492
|
|
|
|
|
|
|
*desc = \&description; |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
1; |