line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Compress::BGZF::Reader; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
36163
|
use strict; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
45
|
|
4
|
2
|
|
|
2
|
|
4
|
use warnings; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
32
|
|
5
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
6
|
use Carp; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
71
|
|
7
|
2
|
|
|
2
|
|
1067
|
use Compress::Zlib; |
|
2
|
|
|
|
|
89125
|
|
|
2
|
|
|
|
|
387
|
|
8
|
2
|
|
|
2
|
|
11
|
use List::Util qw/sum/; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
146
|
|
9
|
|
|
|
|
|
|
|
10
|
2
|
|
|
2
|
|
8
|
use constant BGZF_MAGIC => pack "H*", '1f8b0804'; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
82
|
|
11
|
2
|
|
|
2
|
|
6
|
use constant HEAD_BYTES => 12; # not including extra fields |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
73
|
|
12
|
2
|
|
|
2
|
|
6
|
use constant FOOT_BYTES => 8; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
3536
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
## no critic |
15
|
|
|
|
|
|
|
# allow for filehandle tie'ing |
16
|
3
|
|
|
3
|
|
7
|
sub TIEHANDLE { Compress::BGZF::Reader::new(@_) } |
17
|
39
|
|
|
39
|
|
813
|
sub READ { Compress::BGZF::Reader::_read(@_) } |
18
|
1013
|
|
|
1013
|
|
3449
|
sub READLINE { Compress::BGZF::Reader::getline(@_) } |
19
|
42
|
|
|
42
|
|
7876
|
sub SEEK { Compress::BGZF::Reader::_seek(@_) } |
20
|
3
|
|
|
3
|
|
1535
|
sub CLOSE { close $_[0]->{fh} } |
21
|
3
|
|
|
3
|
|
714
|
sub TELL { return $_[0]->{u_offset} } |
22
|
0
|
|
|
0
|
|
0
|
sub EOF { return $_[0]->{buffer_len} == -1 } |
23
|
3
|
|
|
3
|
|
42
|
sub FILENO { return fileno $_[0]->{fh} } |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# accessors |
26
|
3
|
|
|
3
|
1
|
886
|
sub usize { return $_[0]->{u_file_size} }; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
## use critic |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub new_filehandle { |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
33
|
|
|
|
|
|
|
# ARG 0 : BGZF input filename |
34
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
35
|
|
|
|
|
|
|
# RET 0 : Filehandle GLOB (internally an IO::File object tied to |
36
|
|
|
|
|
|
|
# Compress::BGZF::Reader) |
37
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
38
|
|
|
|
|
|
|
|
39
|
3
|
|
|
3
|
1
|
30
|
my ($class, $fn_in) = @_; |
40
|
3
|
50
|
|
|
|
8
|
croak "input filename required" if (! defined $fn_in); |
41
|
3
|
50
|
|
|
|
11
|
tie *FH, $class, $fn_in or croak "failed to tie filehandle"; |
42
|
|
|
|
|
|
|
|
43
|
3
|
|
|
|
|
6
|
return \*FH; |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub new { |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
50
|
|
|
|
|
|
|
# ARG 0 : BGZF input filename |
51
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
52
|
|
|
|
|
|
|
# RET 0 : Compress::BGZF::Reader object |
53
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
54
|
|
|
|
|
|
|
|
55
|
6
|
|
|
6
|
1
|
2128
|
my ($class, $fn_in) = @_; |
56
|
6
|
|
|
|
|
9
|
my $self = bless {}, $class; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# initialize |
59
|
6
|
50
|
|
|
|
28
|
$self->{fn_in} = $fn_in |
60
|
|
|
|
|
|
|
or croak "Input name required"; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# open compressed file in binary mode |
63
|
6
|
50
|
|
|
|
149
|
open my $fh, '<:raw', $fn_in or croak "Failed to open input file"; ## no critic |
64
|
6
|
|
|
|
|
9
|
$self->{fh} = $fh; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# these member variables allow for data extraction |
67
|
6
|
|
|
|
|
8
|
$self->{buffer} = ''; # contents of currently uncompressed block |
68
|
6
|
|
|
|
|
8
|
$self->{buffer_len} = 0; # save time on frequent length() calls |
69
|
6
|
|
|
|
|
6
|
$self->{block_offset} = 0; # offset of current block in compressed data |
70
|
6
|
|
|
|
|
5
|
$self->{buffer_offset} = 0; # offset of current pos in uncompressed block |
71
|
6
|
|
|
|
|
4
|
$self->{block_size} = 0; # compressed size of current block |
72
|
6
|
|
|
|
|
44
|
$self->{file_size} = -s $fn_in; # size of compressed file |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# these variables are tracked to allow for full seek implementation |
75
|
6
|
|
|
|
|
6
|
$self->{u_offset} = 0; # calculated current uncompressed offset |
76
|
6
|
|
|
|
|
7
|
$self->{u_file_size} = 0; # size of uncompressed file (filled during |
77
|
|
|
|
|
|
|
# indexing |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# load offset index |
80
|
6
|
100
|
|
|
|
139
|
if (-e "$fn_in.gzi") { |
81
|
3
|
|
|
|
|
8
|
$self->_load_index( "$fn_in.gzi" ); # from disk |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
else { |
84
|
3
|
|
|
|
|
8
|
$self->_generate_index(); # on-the-fly |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# load initial block |
88
|
6
|
|
|
|
|
11
|
$self->_load_block(); |
89
|
|
|
|
|
|
|
|
90
|
6
|
|
|
|
|
29
|
return $self; |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
sub _load_block { |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
97
|
|
|
|
|
|
|
# ARG 0 : offset of block start in compressed file |
98
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
99
|
|
|
|
|
|
|
# no returns |
100
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
101
|
|
|
|
|
|
|
|
102
|
57
|
|
|
57
|
|
71
|
my ($self, $block_offset) = @_; |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
# loading a block should always reset buffer offset |
105
|
57
|
|
|
|
|
53
|
$self->{buffer_offset} = 0; |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
# avoid reload of current block |
108
|
|
|
|
|
|
|
return if (defined $block_offset |
109
|
57
|
100
|
100
|
|
|
142
|
&& $block_offset == $self->{block_offset}); |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# if no offset given (e.g. sequential reads), move to next block |
112
|
44
|
100
|
|
|
|
59
|
if (! defined $block_offset) { |
113
|
25
|
|
|
|
|
24
|
$block_offset = $self->{block_offset} + $self->{block_size}; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
44
|
|
|
|
|
38
|
$self->{block_offset} = $block_offset; |
117
|
|
|
|
|
|
|
# deal with EOF |
118
|
|
|
|
|
|
|
croak "Read past file end (perhaps corrupted/truncated input?)" |
119
|
44
|
50
|
|
|
|
57
|
if ($self->{block_offset} > $self->{file_size}); |
120
|
44
|
100
|
|
|
|
63
|
if ($self->{block_offset} == $self->{file_size}) { |
121
|
11
|
|
|
|
|
8
|
$self->{buffer} = ''; |
122
|
11
|
|
|
|
|
12
|
$self->{buffer_len} = -1; |
123
|
11
|
|
|
|
|
17
|
return; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# never assume we're already there |
127
|
33
|
|
|
|
|
70
|
sysseek $self->{fh}, $self->{block_offset}, 0; |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# parse block according to GZIP spec, including content inflation |
130
|
33
|
|
|
|
|
47
|
my ($block_size, $uncompressed_size, $content) |
131
|
|
|
|
|
|
|
= $self->_unpack_block(1); |
132
|
33
|
|
|
|
|
34
|
$self->{block_size} = $block_size; |
133
|
33
|
|
|
|
|
32
|
$self->{buffer_len} = $uncompressed_size; |
134
|
33
|
|
|
|
|
28
|
$self->{buffer} = $content; |
135
|
|
|
|
|
|
|
|
136
|
33
|
|
|
|
|
47
|
return; |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub _unpack_block { |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
143
|
|
|
|
|
|
|
# ARG 0 : bool indicating whether to inflate (and return) actual payload |
144
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
145
|
|
|
|
|
|
|
# RET 0 : compressed block size |
146
|
|
|
|
|
|
|
# RET 1 : uncompressed content size |
147
|
|
|
|
|
|
|
# RET 2 : content (if ARG 0) |
148
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
149
|
|
|
|
|
|
|
|
150
|
44
|
|
|
44
|
|
38
|
my ($self, $do_unpack) = @_; |
151
|
|
|
|
|
|
|
|
152
|
44
|
|
|
|
|
33
|
my @return_values; |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv', |
156
|
44
|
|
|
|
|
64
|
_safe_sysread($self->{fh}, HEAD_BYTES); |
157
|
44
|
|
|
|
|
64
|
my $t = sysseek $self->{fh}, 0, 1; |
158
|
44
|
50
|
|
|
|
67
|
croak "invalid header at $t (corrupt file or not BGZF?)" |
159
|
|
|
|
|
|
|
if ($magic ne BGZF_MAGIC); |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# extract stated block size according to BGZF spec |
162
|
44
|
|
|
|
|
31
|
my $block_size; |
163
|
44
|
|
|
|
|
33
|
my $l = 0; |
164
|
44
|
|
|
|
|
63
|
while ($l < $len_extra) { |
165
|
|
|
|
|
|
|
my ($field_id, $field_len) = unpack 'A2v', |
166
|
44
|
|
|
|
|
52
|
_safe_sysread($self->{fh}, 4); |
167
|
44
|
50
|
|
|
|
80
|
if ($field_id eq 'BC') { |
168
|
44
|
50
|
|
|
|
55
|
croak "invalid BC length" if ($field_len != 2); |
169
|
44
|
50
|
|
|
|
55
|
croak "multiple BC fields" if (defined $block_size); |
170
|
|
|
|
|
|
|
$block_size = unpack 'v', |
171
|
44
|
|
|
|
|
55
|
_safe_sysread($self->{fh} => $field_len); |
172
|
44
|
|
|
|
|
48
|
$block_size += 1; # convert to 1-based |
173
|
|
|
|
|
|
|
} |
174
|
44
|
|
|
|
|
69
|
$l += 4 + $field_len; |
175
|
|
|
|
|
|
|
} |
176
|
44
|
50
|
|
|
|
57
|
croak "invalid extra field length" if ($l != $len_extra); |
177
|
44
|
50
|
|
|
|
61
|
croak "failed to read block size" if (! defined $block_size); |
178
|
|
|
|
|
|
|
|
179
|
44
|
|
|
|
|
36
|
push @return_values, $block_size; |
180
|
44
|
|
|
|
|
43
|
my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_extra; |
181
|
44
|
|
|
|
|
27
|
my $content; |
182
|
44
|
100
|
|
|
|
51
|
if ($do_unpack) { |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# decode actual content |
185
|
33
|
|
|
|
|
39
|
my $payload = _safe_sysread($self->{fh}, $payload_len); |
186
|
33
|
|
|
|
|
104
|
my ($i,$status) = inflateInit(-WindowBits => -&MAX_WBITS()); |
187
|
33
|
50
|
|
|
|
3085
|
croak "Error during inflate init\n" if ($status != Z_OK); |
188
|
33
|
|
|
|
|
121
|
($content,$status) = $i->inflate($payload); |
189
|
33
|
50
|
|
|
|
3830
|
croak "Error during inflate run\n" if ($status != Z_STREAM_END); |
190
|
|
|
|
|
|
|
#rawinflate( \$payload => \$content ) |
191
|
|
|
|
|
|
|
#or croak "Error inflating: $RawInflateError\n"; |
192
|
33
|
|
|
|
|
150
|
my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4); |
193
|
33
|
50
|
|
|
|
3974
|
croak "content CRC32 mismatch" if ( $crc_given != crc32($content) ); |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
} |
196
|
11
|
|
|
|
|
16
|
else { sysseek $self->{fh}, $payload_len + 4, 1; } |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# unpack and possibly check uncompressed payload size |
199
|
44
|
|
|
|
|
58
|
my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4); |
200
|
44
|
50
|
66
|
|
|
143
|
croak "content length mismatch" |
201
|
|
|
|
|
|
|
if ( defined $content && $size_given != length($content) ); |
202
|
44
|
|
|
|
|
40
|
push @return_values, $size_given; |
203
|
44
|
100
|
|
|
|
61
|
push @return_values, $content if (defined $content); |
204
|
|
|
|
|
|
|
|
205
|
44
|
|
|
|
|
79
|
return @return_values; |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub read_data { |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
# More OO-ish wrapper around _read(), avoids conflicts with system read() |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
214
|
|
|
|
|
|
|
# ARG 0 : number of bytes to read |
215
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
216
|
|
|
|
|
|
|
# RET 0 : data read |
217
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
218
|
|
|
|
|
|
|
|
219
|
3
|
|
|
3
|
1
|
15
|
my ($self, $bytes) = @_; |
220
|
|
|
|
|
|
|
|
221
|
3
|
|
|
|
|
5
|
my $r = $self->_read( my $buffer, $bytes ); |
222
|
|
|
|
|
|
|
carp "received fewer bytes than requested" |
223
|
3
|
50
|
33
|
|
|
12
|
if ($r < $bytes && $self->{buffer_len} > -1); |
224
|
|
|
|
|
|
|
|
225
|
3
|
50
|
|
|
|
4
|
$buffer = undef if ( $r < 1 ); |
226
|
3
|
|
|
|
|
5
|
return $buffer; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
sub _read { |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
233
|
|
|
|
|
|
|
# ARG 0 : buffer to write to |
234
|
|
|
|
|
|
|
# ARG 1 : bytes to attempt to read |
235
|
|
|
|
|
|
|
# ARG 3 : (optional) offset in buffer to start write (default: 0) |
236
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
237
|
|
|
|
|
|
|
# RET 0 : bytes read (0 at EOF, undef on error) |
238
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# we try to emulate the built-in 'read' call, so we will |
241
|
|
|
|
|
|
|
# overwrite $_[1] and return the number of bytes read. To facilitate this, |
242
|
|
|
|
|
|
|
# make $buf a reference to the buffer passed |
243
|
42
|
|
|
42
|
|
37
|
my $self = shift; |
244
|
42
|
|
|
|
|
32
|
my $buf = \shift; # must be a reference !! |
245
|
42
|
|
|
|
|
33
|
my $bytes = shift; |
246
|
42
|
|
|
|
|
25
|
my $offset = shift; |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# handle cases when $offset is passed in |
249
|
42
|
|
|
|
|
50
|
my $prefix = ''; |
250
|
42
|
100
|
100
|
|
|
120
|
if (defined $offset && $offset != 0) { |
251
|
12
|
|
|
|
|
20
|
$prefix = substr $$buf, 0, $offset; |
252
|
12
|
100
|
|
|
|
22
|
$prefix .= "\0" x ( $offset - length($$buf) ) |
253
|
|
|
|
|
|
|
if ( $offset > length($$buf) ); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
42
|
|
|
|
|
41
|
$$buf = ''; # reset (only AFTER grabbing any prefix above) |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
ITER: |
259
|
42
|
|
|
|
|
70
|
while (length($$buf) < $bytes) { |
260
|
|
|
|
|
|
|
|
261
|
57
|
|
|
|
|
46
|
my $l = length($$buf); |
262
|
57
|
|
|
|
|
44
|
my $remaining = $bytes - $l; |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
# if read is within current block |
265
|
57
|
100
|
|
|
|
85
|
if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}) { |
266
|
19
|
|
|
|
|
136
|
$$buf .= substr $self->{buffer}, $self->{buffer_offset}, $remaining; |
267
|
19
|
|
|
|
|
19
|
$self->{buffer_offset} += $remaining; |
268
|
|
|
|
|
|
|
$self->_load_block() |
269
|
19
|
50
|
|
|
|
46
|
if ($self->{buffer_offset} == $self->{buffer_len}); |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
else { |
272
|
38
|
100
|
|
|
|
63
|
last ITER if ($self->{buffer_len} < 0); #EOF |
273
|
15
|
|
|
|
|
125
|
$$buf .= substr $self->{buffer}, $self->{buffer_offset}; |
274
|
15
|
|
|
|
|
20
|
$self->_load_block(); |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
|
279
|
42
|
|
|
|
|
32
|
my $l = length($$buf); |
280
|
42
|
|
|
|
|
36
|
$self->{u_offset} += $l; |
281
|
42
|
|
|
|
|
154
|
$$buf = $prefix . $$buf; |
282
|
|
|
|
|
|
|
|
283
|
42
|
|
|
|
|
66
|
return $l; |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
sub getline { |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
291
|
|
|
|
|
|
|
# No arguments |
292
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
293
|
|
|
|
|
|
|
# RET 0 : string read (undef at EOF) |
294
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
295
|
|
|
|
|
|
|
|
296
|
1022
|
|
|
1022
|
1
|
768
|
my ($self) = @_; |
297
|
|
|
|
|
|
|
|
298
|
1022
|
|
|
|
|
560
|
my $data; |
299
|
|
|
|
|
|
|
|
300
|
1022
|
|
|
|
|
597
|
while (1) { |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# return immediately if EOF |
303
|
1026
|
100
|
|
|
|
1228
|
last if ($self->{buffer_len} < 0); |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# search current block for record separator |
306
|
|
|
|
|
|
|
# start searching from the current buffer offset |
307
|
1009
|
|
|
|
|
1027
|
pos($self->{buffer}) = $self->{buffer_offset}; |
308
|
|
|
|
|
|
|
|
309
|
1009
|
100
|
|
|
|
2133
|
if ($self->{buffer} =~ m|$/|g) { |
310
|
1005
|
|
|
|
|
625
|
my $pos = pos $self->{buffer}; |
311
|
|
|
|
|
|
|
$data .= substr $self->{buffer}, $self->{buffer_offset}, |
312
|
1005
|
|
|
|
|
1318
|
$pos - $self->{buffer_offset}; |
313
|
|
|
|
|
|
|
|
314
|
1005
|
|
|
|
|
619
|
$self->{buffer_offset} = $pos; |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# if we're at the end of the block, load next |
317
|
1005
|
50
|
|
|
|
1117
|
$self->_load_block if ($pos == $self->{buffer_len}); |
318
|
|
|
|
|
|
|
|
319
|
1005
|
|
|
|
|
626
|
$self->{u_offset} += length($data); |
320
|
|
|
|
|
|
|
|
321
|
1005
|
|
|
|
|
660
|
last; |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
} |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
# otherwise, add rest of block to data and load next block |
326
|
4
|
|
|
|
|
8
|
$data .= substr $self->{buffer}, $self->{buffer_offset}; |
327
|
4
|
|
|
|
|
9
|
$self->_load_block; |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
1022
|
100
|
|
|
|
2168
|
return length($data) > 0 ? $data : undef; |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
sub write_index { |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# index format (as defined by htslib) consists of little-endian |
338
|
|
|
|
|
|
|
# int64-coded values. The first value is the number of offsets in the |
339
|
|
|
|
|
|
|
# index. The rest of the values consist of pairs of block offsets relative |
340
|
|
|
|
|
|
|
# to the compressed and uncompressed data. The first offset (always 0,0) |
341
|
|
|
|
|
|
|
# is not included. |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
344
|
|
|
|
|
|
|
# ARG 0 : index output filename |
345
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
346
|
|
|
|
|
|
|
# No returns |
347
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
348
|
|
|
|
|
|
|
|
349
|
3
|
|
|
3
|
1
|
10
|
my ($self, $fn_out) = @_; |
350
|
|
|
|
|
|
|
|
351
|
3
|
50
|
|
|
|
6
|
croak "missing index output filename" if (! defined $fn_out); |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
|
354
|
3
|
50
|
|
|
|
6
|
$self->_generate_index() if (! defined $self->{idx}); |
355
|
3
|
|
|
|
|
3
|
my @offsets = @{ $self->{idx} }; |
|
3
|
|
|
|
|
34
|
|
356
|
3
|
|
|
|
|
4
|
shift @offsets; # don't write first |
357
|
|
|
|
|
|
|
|
358
|
3
|
|
|
|
|
114
|
open my $fh_out, '>:raw', $fn_out; |
359
|
|
|
|
|
|
|
|
360
|
3
|
|
|
|
|
4
|
print {$fh_out} pack('Q<', scalar(@offsets)); |
|
3
|
|
|
|
|
14
|
|
361
|
3
|
|
|
|
|
7
|
for (@offsets) { |
362
|
8
|
|
|
|
|
3
|
print {$fh_out} pack('Q<', $_->[0]); |
|
8
|
|
|
|
|
11
|
|
363
|
8
|
|
|
|
|
6
|
print {$fh_out} pack('Q<', $_->[1]); |
|
8
|
|
|
|
|
12
|
|
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
|
366
|
3
|
|
|
|
|
80
|
close $fh_out; |
367
|
|
|
|
|
|
|
|
368
|
3
|
|
|
|
|
10
|
return; |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
sub _load_index { |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
375
|
|
|
|
|
|
|
# ARG 0 : index input filename |
376
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
377
|
|
|
|
|
|
|
# No returns |
378
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
379
|
|
|
|
|
|
|
|
380
|
3
|
|
|
3
|
|
3
|
my ($self, $fn_in) = @_; |
381
|
3
|
50
|
|
|
|
7
|
croak "missing index input filename" if (! defined $fn_in); |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
#TODO: speed up index parsing |
384
|
|
|
|
|
|
|
|
385
|
3
|
50
|
|
|
|
45
|
open my $fh_in, '<:raw', $fn_in or croak "error opening index"; |
386
|
3
|
50
|
|
|
|
27
|
read( $fh_in, my $n_offsets, 8) |
387
|
|
|
|
|
|
|
or croak "failed to read first quad"; |
388
|
3
|
|
|
|
|
8
|
$n_offsets = unpack 'Q<', $n_offsets; |
389
|
3
|
|
|
|
|
3
|
my @idx; |
390
|
3
|
|
|
|
|
7
|
for (0..$n_offsets-1) { |
391
|
8
|
50
|
|
|
|
13
|
read( $fh_in, my $buff, 16) or croak "error reading index"; |
392
|
8
|
|
|
|
|
20
|
$idx[$_] = [ unpack 'Q
|
393
|
|
|
|
|
|
|
} |
394
|
3
|
|
|
|
|
13
|
close $fh_in; |
395
|
3
|
|
|
|
|
7
|
unshift @idx, [0,0]; # add initial offsets |
396
|
|
|
|
|
|
|
|
397
|
3
|
|
|
|
|
3
|
$self->{u_file_size} = $idx[-1]->[1]; |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
# some indices created by htslib bgzip are missing last offset pair |
400
|
|
|
|
|
|
|
# check for that here by loading last block in index and proceeding from |
401
|
|
|
|
|
|
|
# there. Also calculate uncompressed file size at the same time. |
402
|
3
|
|
|
|
|
4
|
my $c_size = $idx[-1]->[0]; |
403
|
3
|
|
|
|
|
7
|
sysseek $self->{fh}, $idx[-1]->[0], 0; |
404
|
3
|
|
|
|
|
7
|
my ($c, $u) = $self->_unpack_block(0); |
405
|
3
|
|
|
|
|
2
|
$self->{u_file_size} += $u; |
406
|
3
|
|
|
|
|
3
|
$c_size += $c; |
407
|
3
|
|
|
|
|
8
|
while ($c_size < $self->{file_size}) { |
408
|
0
|
|
|
|
|
0
|
push @idx, [$idx[-1]->[0]+$c, $idx[-1]->[1]+$u]; |
409
|
0
|
|
|
|
|
0
|
sysseek $self->{fh}, $idx[-1]->[0], 0; |
410
|
0
|
|
|
|
|
0
|
($c, $u) = $self->_unpack_block(0); |
411
|
0
|
|
|
|
|
0
|
$self->{u_file_size} += $u; |
412
|
0
|
|
|
|
|
0
|
$c_size += $c; |
413
|
|
|
|
|
|
|
} |
414
|
|
|
|
|
|
|
croak "Unexpected file size/last index mismatch ($c_size v $self->{file_size})" |
415
|
3
|
50
|
|
|
|
7
|
if ($c_size != $self->{file_size}); |
416
|
|
|
|
|
|
|
|
417
|
3
|
|
|
|
|
6
|
$self->{idx} = [@idx]; |
418
|
3
|
|
|
|
|
16
|
$self->{ridx}->{$_->[0]} = $_->[1] for (@idx); |
419
|
|
|
|
|
|
|
|
420
|
3
|
|
|
|
|
6
|
sysseek $self->{fh}, $self->{block_offset}, 0; |
421
|
|
|
|
|
|
|
|
422
|
3
|
|
|
|
|
10
|
return; |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub _generate_index { |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
429
|
|
|
|
|
|
|
# No arguments |
430
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
431
|
|
|
|
|
|
|
# No returns |
432
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
433
|
|
|
|
|
|
|
|
434
|
3
|
|
|
3
|
|
4
|
my ($self) = @_; |
435
|
|
|
|
|
|
|
|
436
|
3
|
|
|
|
|
3
|
my $uncmp_offset = 0; |
437
|
3
|
|
|
|
|
2
|
my $cmp_offset = 0; |
438
|
3
|
|
|
|
|
3
|
my $i = 0; |
439
|
3
|
|
|
|
|
4
|
$self->{u_file_size} = 0; |
440
|
3
|
|
|
|
|
4
|
$self->{idx} = []; |
441
|
3
|
|
|
|
|
4
|
$self->{ridx} = {}; |
442
|
|
|
|
|
|
|
|
443
|
3
|
|
|
|
|
7
|
sysseek $self->{fh}, 0, 0; |
444
|
|
|
|
|
|
|
|
445
|
3
|
|
|
|
|
7
|
while ($cmp_offset < $self->{file_size}) { |
446
|
|
|
|
|
|
|
|
447
|
8
|
|
|
|
|
8
|
push @{$self->{idx}}, [$cmp_offset, $uncmp_offset]; |
|
8
|
|
|
|
|
12
|
|
448
|
8
|
|
|
|
|
11
|
$self->{ridx}->{$cmp_offset} = $uncmp_offset; |
449
|
|
|
|
|
|
|
|
450
|
8
|
|
|
|
|
12
|
my ($block_size, $uncompressed_size) = $self->_unpack_block(0); |
451
|
|
|
|
|
|
|
|
452
|
8
|
|
|
|
|
6
|
$cmp_offset += $block_size; |
453
|
8
|
|
|
|
|
6
|
$uncmp_offset += $uncompressed_size; |
454
|
8
|
|
|
|
|
12
|
$self->{u_file_size} += $uncompressed_size; |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
} |
457
|
|
|
|
|
|
|
|
458
|
3
|
|
|
|
|
5
|
sysseek $self->{fh}, $self->{block_offset}, 0; |
459
|
|
|
|
|
|
|
|
460
|
3
|
|
|
|
|
5
|
return; |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
} |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
sub move_to { |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
# Wrapper around _seek(), avoids conflicts with system seek() |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
470
|
|
|
|
|
|
|
# ARG 0 : byte offset to which to seek |
471
|
|
|
|
|
|
|
# ARG 1 : relativity of offset (0: file start, 1: current, 2: file end) |
472
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
473
|
|
|
|
|
|
|
# no returns |
474
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
477
|
12
|
|
|
12
|
1
|
4280
|
my ($self, @args) = @_; |
478
|
12
|
|
|
|
|
21
|
$self->_seek( @args ); |
479
|
|
|
|
|
|
|
|
480
|
12
|
|
|
|
|
17
|
return; |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
sub _seek { |
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
488
|
|
|
|
|
|
|
# ARG 0 : byte offset to which to seek |
489
|
|
|
|
|
|
|
# ARG 1 : relativity of offset (0: file start, 1: current, 2: file end) |
490
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
491
|
|
|
|
|
|
|
# no returns |
492
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
493
|
|
|
|
|
|
|
|
494
|
54
|
|
|
54
|
|
59
|
my ($self, $pos, $whence) = @_; |
495
|
|
|
|
|
|
|
|
496
|
54
|
100
|
|
|
|
96
|
$pos += $self->{u_offset} if ($whence == 1); |
497
|
54
|
100
|
|
|
|
75
|
$pos = $self->{u_file_size} + $pos if ($whence == 2); |
498
|
|
|
|
|
|
|
|
499
|
54
|
100
|
|
|
|
73
|
return if ($pos < 0); |
500
|
47
|
100
|
|
|
|
72
|
if ($pos >= $self->{u_file_size}) { |
501
|
15
|
|
|
|
|
16
|
$self->{buffer_len} = -1; |
502
|
15
|
|
|
|
|
10
|
$self->{u_offset} = $pos; |
503
|
15
|
|
|
|
|
15
|
$self->{block_offset} = $pos; |
504
|
15
|
|
|
|
|
24
|
return 1; |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
# Do seeded search for nearest block start <= $pos |
508
|
|
|
|
|
|
|
# (although we don't know the size of each block, we can determine the |
509
|
|
|
|
|
|
|
# mean length and usually choose a starting value close to the actual - |
510
|
|
|
|
|
|
|
# benchmarks much faster than binary search) |
511
|
|
|
|
|
|
|
# TODO: benchmark whether breaking this out and Memoizing speeds things up |
512
|
|
|
|
|
|
|
|
513
|
32
|
|
|
|
|
22
|
my $s = scalar @{$self->{idx}}; |
|
32
|
|
|
|
|
39
|
|
514
|
32
|
|
|
|
|
65
|
my $idx = int($pos/($self->{u_file_size}) * $s); |
515
|
32
|
|
|
|
|
16
|
while (1) { |
516
|
35
|
100
|
|
|
|
61
|
if ($pos < $self->{idx}->[$idx]->[1]) { |
517
|
3
|
|
|
|
|
3
|
--$idx; |
518
|
3
|
|
|
|
|
3
|
next; |
519
|
|
|
|
|
|
|
} |
520
|
32
|
50
|
66
|
|
|
99
|
if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) { |
521
|
0
|
|
|
|
|
0
|
++$idx; |
522
|
0
|
|
|
|
|
0
|
next; |
523
|
|
|
|
|
|
|
} |
524
|
32
|
|
|
|
|
29
|
last; |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
|
527
|
32
|
|
|
|
|
58
|
my $block_o = $self->{idx}->[$idx]->[0]; |
528
|
32
|
|
|
|
|
28
|
my $block_o_u = $self->{idx}->[$idx]->[1]; |
529
|
32
|
|
|
|
|
27
|
my $buff_o = $pos - $block_o_u; |
530
|
|
|
|
|
|
|
|
531
|
32
|
|
|
|
|
41
|
$self->_load_block( $block_o ); |
532
|
32
|
|
|
|
|
27
|
$self->{buffer_offset} = $buff_o; |
533
|
32
|
|
|
|
|
30
|
$self->{u_offset} = $block_o_u + $buff_o; |
534
|
|
|
|
|
|
|
|
535
|
32
|
|
|
|
|
58
|
return 1; |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
} |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
sub get_vo { |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
542
|
|
|
|
|
|
|
# no arguments |
543
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
544
|
|
|
|
|
|
|
# RET 0 : virtual offset of current position |
545
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
546
|
|
|
|
|
|
|
|
547
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
548
|
0
|
|
|
|
|
0
|
return ($self->{block_offset} << 16) | $self->{buffer_offset}; |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
} |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
sub move_to_vo { |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
555
|
|
|
|
|
|
|
# ARG 0 : virtual offset (see POD for definition) |
556
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
557
|
|
|
|
|
|
|
# no returns |
558
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
559
|
|
|
|
|
|
|
|
560
|
0
|
|
|
0
|
1
|
0
|
my ($self, $vo) = @_; |
561
|
0
|
|
|
|
|
0
|
my $block_o = $vo >> 16; |
562
|
0
|
|
|
|
|
0
|
my $buff_o = $vo ^ ($block_o << 16); |
563
|
0
|
|
|
|
|
0
|
$self->_load_block( $block_o ); |
564
|
0
|
|
|
|
|
0
|
$self->{buffer_offset} = $buff_o; |
565
|
0
|
0
|
|
|
|
0
|
croak "invalid block offset" if (! defined $self->{ridx}->{$block_o}); |
566
|
0
|
|
|
|
|
0
|
$self->{u_offset} = $self->{ridx}->{$block_o} + $buff_o; |
567
|
|
|
|
|
|
|
|
568
|
0
|
|
|
|
|
0
|
return; |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
sub _safe_sysread { |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# sysread wrapper that checks return count and returns read |
575
|
|
|
|
|
|
|
# (internally we should never read off end of file - doing so indicates |
576
|
|
|
|
|
|
|
# either a software bug or a corrupt input file so we croak) |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
579
|
|
|
|
|
|
|
# ARG 0 : bytes to read |
580
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
581
|
|
|
|
|
|
|
# RET 0 : string read |
582
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
583
|
|
|
|
|
|
|
|
584
|
242
|
|
|
242
|
|
190
|
my ($fh, $len) = @_; |
585
|
242
|
|
|
|
|
160
|
my $buf = ''; |
586
|
242
|
|
|
|
|
813
|
my $r = sysread $fh, $buf, $len; |
587
|
242
|
50
|
|
|
|
308
|
croak "returned unexpected byte count" if ($r != $len); |
588
|
|
|
|
|
|
|
|
589
|
242
|
|
|
|
|
455
|
return $buf; |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
1; |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
__END__ |