line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Compress::BGZF::Reader; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
110153
|
use strict; |
|
2
|
|
|
|
|
14
|
|
|
2
|
|
|
|
|
47
|
|
4
|
2
|
|
|
2
|
|
7
|
use warnings; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
46
|
|
5
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
9
|
use Carp; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
85
|
|
7
|
2
|
|
|
2
|
|
1016
|
use Compress::Zlib; |
|
2
|
|
|
|
|
100363
|
|
|
2
|
|
|
|
|
359
|
|
8
|
2
|
|
|
2
|
|
15
|
use List::Util qw/sum/; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
154
|
|
9
|
2
|
|
|
2
|
|
827
|
use FileHandle; |
|
2
|
|
|
|
|
1472
|
|
|
2
|
|
|
|
|
9
|
|
10
|
|
|
|
|
|
|
|
11
|
2
|
|
|
2
|
|
500
|
use constant BGZF_MAGIC => pack "H*", '1f8b0804'; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
123
|
|
12
|
2
|
|
|
2
|
|
14
|
use constant HEAD_BYTES => 12; # not including extra fields |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
80
|
|
13
|
2
|
|
|
2
|
|
9
|
use constant FOOT_BYTES => 8; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
4297
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
## no critic |
16
|
|
|
|
|
|
|
# allow for filehandle tie'ing |
17
|
3
|
|
|
3
|
|
7
|
sub TIEHANDLE { Compress::BGZF::Reader::new(@_) } |
18
|
39
|
|
|
39
|
|
1275
|
sub READ { Compress::BGZF::Reader::_read(@_) } |
19
|
1013
|
|
|
1013
|
|
4093
|
sub READLINE { Compress::BGZF::Reader::getline(@_) } |
20
|
42
|
|
|
42
|
|
10428
|
sub SEEK { Compress::BGZF::Reader::_seek(@_) } |
21
|
3
|
|
|
3
|
|
1954
|
sub CLOSE { close $_[0]->{fh} } |
22
|
3
|
|
|
3
|
|
966
|
sub TELL { return $_[0]->{u_offset} } |
23
|
0
|
|
|
0
|
|
0
|
sub EOF { return $_[0]->{buffer_len} == -1 } |
24
|
3
|
|
|
3
|
|
57
|
sub FILENO { return fileno $_[0]->{fh} } |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# accessors |
27
|
3
|
|
|
3
|
1
|
809
|
sub usize { return $_[0]->{u_file_size} }; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
## use critic |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub new_filehandle { |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
34
|
|
|
|
|
|
|
# ARG 0 : BGZF input filename |
35
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
36
|
|
|
|
|
|
|
# RET 0 : Filehandle GLOB (internally an IO::File object tied to |
37
|
|
|
|
|
|
|
# Compress::BGZF::Reader) |
38
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
39
|
|
|
|
|
|
|
|
40
|
3
|
|
|
3
|
1
|
47
|
my ($class, $fn_in) = @_; |
41
|
3
|
50
|
|
|
|
7
|
croak "input filename required" if (! defined $fn_in); |
42
|
3
|
|
|
|
|
14
|
my $fh = FileHandle->new; |
43
|
3
|
50
|
|
|
|
125
|
tie *$fh, $class, $fn_in or croak "failed to tie filehandle"; |
44
|
|
|
|
|
|
|
#tie *FH, $class, $fn_in or croak "failed to tie filehandle"; |
45
|
|
|
|
|
|
|
|
46
|
3
|
|
|
|
|
9
|
return $fh; |
47
|
|
|
|
|
|
|
#return \*FH; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sub new { |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
54
|
|
|
|
|
|
|
# ARG 0 : BGZF input filename |
55
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
56
|
|
|
|
|
|
|
# RET 0 : Compress::BGZF::Reader object |
57
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
58
|
|
|
|
|
|
|
|
59
|
6
|
|
|
6
|
1
|
2166
|
my ($class, $fn_in) = @_; |
60
|
6
|
|
|
|
|
12
|
my $self = bless {}, $class; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# initialize |
63
|
6
|
50
|
|
|
|
24
|
$self->{fn_in} = $fn_in |
64
|
|
|
|
|
|
|
or croak "Input name required"; |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# open compressed file in binary mode |
67
|
6
|
50
|
|
|
|
185
|
open my $fh, '<:raw', $fn_in or croak "Failed to open input file"; ## no critic |
68
|
6
|
|
|
|
|
19
|
$self->{fh} = $fh; |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# these member variables allow for data extraction |
71
|
6
|
|
|
|
|
8
|
$self->{buffer} = ''; # contents of currently uncompressed block |
72
|
6
|
|
|
|
|
11
|
$self->{buffer_len} = 0; # save time on frequent length() calls |
73
|
6
|
|
|
|
|
11
|
$self->{block_offset} = 0; # offset of current block in compressed data |
74
|
6
|
|
|
|
|
7
|
$self->{buffer_offset} = 0; # offset of current pos in uncompressed block |
75
|
6
|
|
|
|
|
8
|
$self->{block_size} = 0; # compressed size of current block |
76
|
6
|
|
|
|
|
64
|
$self->{file_size} = -s $fn_in; # size of compressed file |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# these variables are tracked to allow for full seek implementation |
79
|
6
|
|
|
|
|
12
|
$self->{u_offset} = 0; # calculated current uncompressed offset |
80
|
6
|
|
|
|
|
10
|
$self->{u_file_size} = 0; # size of uncompressed file (filled during |
81
|
|
|
|
|
|
|
# indexing |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
# load offset index |
84
|
6
|
100
|
|
|
|
108
|
if (-e "$fn_in.gzi") { |
85
|
3
|
|
|
|
|
15
|
$self->_load_index( "$fn_in.gzi" ); # from disk |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
else { |
88
|
3
|
|
|
|
|
10
|
$self->_generate_index(); # on-the-fly |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# load initial block |
92
|
6
|
|
|
|
|
17
|
$self->_load_block(); |
93
|
|
|
|
|
|
|
|
94
|
6
|
|
|
|
|
22
|
return $self; |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub _load_block { |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
101
|
|
|
|
|
|
|
# ARG 0 : offset of block start in compressed file |
102
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
103
|
|
|
|
|
|
|
# no returns |
104
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
105
|
|
|
|
|
|
|
|
106
|
57
|
|
|
57
|
|
87
|
my ($self, $block_offset) = @_; |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# loading a block should always reset buffer offset |
109
|
57
|
|
|
|
|
74
|
$self->{buffer_offset} = 0; |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# avoid reload of current block |
112
|
|
|
|
|
|
|
return if (defined $block_offset |
113
|
57
|
100
|
100
|
|
|
150
|
&& $block_offset == $self->{block_offset}); |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
# if no offset given (e.g. sequential reads), move to next block |
116
|
44
|
100
|
|
|
|
76
|
if (! defined $block_offset) { |
117
|
25
|
|
|
|
|
35
|
$block_offset = $self->{block_offset} + $self->{block_size}; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
44
|
|
|
|
|
62
|
$self->{block_offset} = $block_offset; |
121
|
|
|
|
|
|
|
# deal with EOF |
122
|
|
|
|
|
|
|
croak "Read past file end (perhaps corrupted/truncated input?)" |
123
|
44
|
50
|
|
|
|
73
|
if ($self->{block_offset} > $self->{file_size}); |
124
|
44
|
100
|
|
|
|
76
|
if ($self->{block_offset} == $self->{file_size}) { |
125
|
11
|
|
|
|
|
16
|
$self->{buffer} = ''; |
126
|
11
|
|
|
|
|
13
|
$self->{buffer_len} = -1; |
127
|
11
|
|
|
|
|
24
|
return; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# never assume we're already there |
131
|
33
|
|
|
|
|
181
|
sysseek $self->{fh}, $self->{block_offset}, 0; |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# parse block according to GZIP spec, including content inflation |
134
|
33
|
|
|
|
|
90
|
my ($block_size, $uncompressed_size, $content) |
135
|
|
|
|
|
|
|
= $self->_unpack_block(1); |
136
|
33
|
|
|
|
|
54
|
$self->{block_size} = $block_size; |
137
|
33
|
|
|
|
|
42
|
$self->{buffer_len} = $uncompressed_size; |
138
|
33
|
|
|
|
|
57
|
$self->{buffer} = $content; |
139
|
|
|
|
|
|
|
|
140
|
33
|
|
|
|
|
57
|
return; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub _unpack_block { |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
147
|
|
|
|
|
|
|
# ARG 0 : bool indicating whether to inflate (and return) actual payload |
148
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
149
|
|
|
|
|
|
|
# RET 0 : compressed block size |
150
|
|
|
|
|
|
|
# RET 1 : uncompressed content size |
151
|
|
|
|
|
|
|
# RET 2 : content (if ARG 0) |
152
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
153
|
|
|
|
|
|
|
|
154
|
44
|
|
|
44
|
|
68
|
my ($self, $do_unpack) = @_; |
155
|
|
|
|
|
|
|
|
156
|
44
|
|
|
|
|
48
|
my @return_values; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv', |
160
|
44
|
|
|
|
|
87
|
_safe_sysread($self->{fh}, HEAD_BYTES); |
161
|
44
|
|
|
|
|
213
|
my $t = sysseek $self->{fh}, 0, 1; |
162
|
44
|
50
|
|
|
|
115
|
croak "invalid header at $t (corrupt file or not BGZF?)" |
163
|
|
|
|
|
|
|
if ($magic ne BGZF_MAGIC); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# extract stated block size according to BGZF spec |
166
|
44
|
|
|
|
|
54
|
my $block_size; |
167
|
44
|
|
|
|
|
58
|
my $l = 0; |
168
|
44
|
|
|
|
|
82
|
while ($l < $len_extra) { |
169
|
|
|
|
|
|
|
my ($field_id, $field_len) = unpack 'A2v', |
170
|
44
|
|
|
|
|
88
|
_safe_sysread($self->{fh}, 4); |
171
|
44
|
50
|
|
|
|
97
|
if ($field_id eq 'BC') { |
172
|
44
|
50
|
|
|
|
67
|
croak "invalid BC length" if ($field_len != 2); |
173
|
44
|
50
|
|
|
|
67
|
croak "multiple BC fields" if (defined $block_size); |
174
|
|
|
|
|
|
|
$block_size = unpack 'v', |
175
|
44
|
|
|
|
|
71
|
_safe_sysread($self->{fh} => $field_len); |
176
|
44
|
|
|
|
|
68
|
$block_size += 1; # convert to 1-based |
177
|
|
|
|
|
|
|
} |
178
|
44
|
|
|
|
|
81
|
$l += 4 + $field_len; |
179
|
|
|
|
|
|
|
} |
180
|
44
|
50
|
|
|
|
72
|
croak "invalid extra field length" if ($l != $len_extra); |
181
|
44
|
50
|
|
|
|
63
|
croak "failed to read block size" if (! defined $block_size); |
182
|
|
|
|
|
|
|
|
183
|
44
|
|
|
|
|
64
|
push @return_values, $block_size; |
184
|
44
|
|
|
|
|
62
|
my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_extra; |
185
|
44
|
|
|
|
|
46
|
my $content; |
186
|
44
|
100
|
|
|
|
65
|
if ($do_unpack) { |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# decode actual content |
189
|
33
|
|
|
|
|
63
|
my $payload = _safe_sysread($self->{fh}, $payload_len); |
190
|
33
|
|
|
|
|
97
|
my ($i,$status) = inflateInit(-WindowBits => -&MAX_WBITS()); |
191
|
33
|
50
|
|
|
|
4418
|
croak "Error during inflate init\n" if ($status != Z_OK); |
192
|
33
|
|
|
|
|
139
|
($content,$status) = $i->inflate($payload); |
193
|
33
|
50
|
|
|
|
4176
|
croak "Error during inflate run\n" if ($status != Z_STREAM_END); |
194
|
|
|
|
|
|
|
#rawinflate( \$payload => \$content ) |
195
|
|
|
|
|
|
|
#or croak "Error inflating: $RawInflateError\n"; |
196
|
33
|
|
|
|
|
213
|
my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4); |
197
|
33
|
50
|
|
|
|
4068
|
croak "content CRC32 mismatch" if ( $crc_given != crc32($content) ); |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
} |
200
|
11
|
|
|
|
|
47
|
else { sysseek $self->{fh}, $payload_len + 4, 1; } |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# unpack and possibly check uncompressed payload size |
203
|
44
|
|
|
|
|
102
|
my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4); |
204
|
44
|
50
|
66
|
|
|
161
|
croak "content length mismatch" |
205
|
|
|
|
|
|
|
if ( defined $content && $size_given != length($content) ); |
206
|
44
|
|
|
|
|
61
|
push @return_values, $size_given; |
207
|
44
|
100
|
|
|
|
73
|
push @return_values, $content if (defined $content); |
208
|
|
|
|
|
|
|
|
209
|
44
|
|
|
|
|
110
|
return @return_values; |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub read_data { |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# More OO-ish wrapper around _read(), avoids conflicts with system read() |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
218
|
|
|
|
|
|
|
# ARG 0 : number of bytes to read |
219
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
220
|
|
|
|
|
|
|
# RET 0 : data read |
221
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
222
|
|
|
|
|
|
|
|
223
|
3
|
|
|
3
|
1
|
31
|
my ($self, $bytes) = @_; |
224
|
|
|
|
|
|
|
|
225
|
3
|
|
|
|
|
9
|
my $r = $self->_read( my $buffer, $bytes ); |
226
|
|
|
|
|
|
|
carp "received fewer bytes than requested" |
227
|
3
|
50
|
33
|
|
|
12
|
if ($r < $bytes && $self->{buffer_len} > -1); |
228
|
|
|
|
|
|
|
|
229
|
3
|
50
|
|
|
|
5
|
$buffer = undef if ( $r < 1 ); |
230
|
3
|
|
|
|
|
8
|
return $buffer; |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
sub _read { |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
237
|
|
|
|
|
|
|
# ARG 0 : buffer to write to |
238
|
|
|
|
|
|
|
# ARG 1 : bytes to attempt to read |
239
|
|
|
|
|
|
|
# ARG 3 : (optional) offset in buffer to start write (default: 0) |
240
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
241
|
|
|
|
|
|
|
# RET 0 : bytes read (0 at EOF, undef on error) |
242
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# we try to emulate the built-in 'read' call, so we will |
245
|
|
|
|
|
|
|
# overwrite $_[1] and return the number of bytes read. To facilitate this, |
246
|
|
|
|
|
|
|
# make $buf a reference to the buffer passed |
247
|
42
|
|
|
42
|
|
64
|
my $self = shift; |
248
|
42
|
|
|
|
|
56
|
my $buf = \shift; # must be a reference !! |
249
|
42
|
|
|
|
|
52
|
my $bytes = shift; |
250
|
42
|
|
|
|
|
55
|
my $offset = shift; |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
# handle cases when $offset is passed in |
253
|
42
|
|
|
|
|
53
|
my $prefix = ''; |
254
|
42
|
100
|
100
|
|
|
123
|
if (defined $offset && $offset != 0) { |
255
|
12
|
|
|
|
|
26
|
$prefix = substr $$buf, 0, $offset; |
256
|
12
|
100
|
|
|
|
27
|
$prefix .= "\0" x ( $offset - length($$buf) ) |
257
|
|
|
|
|
|
|
if ( $offset > length($$buf) ); |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
42
|
|
|
|
|
58
|
$$buf = ''; # reset (only AFTER grabbing any prefix above) |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
ITER: |
263
|
42
|
|
|
|
|
70
|
while (length($$buf) < $bytes) { |
264
|
|
|
|
|
|
|
|
265
|
57
|
|
|
|
|
65
|
my $l = length($$buf); |
266
|
57
|
|
|
|
|
65
|
my $remaining = $bytes - $l; |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# if read is within current block |
269
|
57
|
100
|
|
|
|
106
|
if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}) { |
270
|
19
|
|
|
|
|
217
|
$$buf .= substr $self->{buffer}, $self->{buffer_offset}, $remaining; |
271
|
19
|
|
|
|
|
26
|
$self->{buffer_offset} += $remaining; |
272
|
|
|
|
|
|
|
$self->_load_block() |
273
|
19
|
50
|
|
|
|
44
|
if ($self->{buffer_offset} == $self->{buffer_len}); |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
else { |
276
|
38
|
100
|
|
|
|
77
|
last ITER if ($self->{buffer_len} < 0); #EOF |
277
|
15
|
|
|
|
|
286
|
$$buf .= substr $self->{buffer}, $self->{buffer_offset}; |
278
|
15
|
|
|
|
|
32
|
$self->_load_block(); |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
42
|
|
|
|
|
46
|
my $l = length($$buf); |
284
|
42
|
|
|
|
|
54
|
$self->{u_offset} += $l; |
285
|
42
|
|
|
|
|
231
|
$$buf = $prefix . $$buf; |
286
|
|
|
|
|
|
|
|
287
|
42
|
|
|
|
|
94
|
return $l; |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub getline { |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
295
|
|
|
|
|
|
|
# No arguments |
296
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
297
|
|
|
|
|
|
|
# RET 0 : string read (undef at EOF) |
298
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
299
|
|
|
|
|
|
|
|
300
|
1022
|
|
|
1022
|
1
|
1365
|
my ($self) = @_; |
301
|
|
|
|
|
|
|
|
302
|
1022
|
|
|
|
|
1126
|
my $data = ''; |
303
|
|
|
|
|
|
|
|
304
|
1022
|
|
|
|
|
1016
|
while (1) { |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
# return immediately if EOF |
307
|
1026
|
100
|
|
|
|
1469
|
last if ($self->{buffer_len} < 0); |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# search current block for record separator |
310
|
|
|
|
|
|
|
# start searching from the current buffer offset |
311
|
1009
|
|
|
|
|
1586
|
pos($self->{buffer}) = $self->{buffer_offset}; |
312
|
|
|
|
|
|
|
|
313
|
1009
|
100
|
|
|
|
2630
|
if ($self->{buffer} =~ m|$/|g) { |
314
|
1005
|
|
|
|
|
1320
|
my $pos = pos $self->{buffer}; |
315
|
|
|
|
|
|
|
$data .= substr $self->{buffer}, $self->{buffer_offset}, |
316
|
1005
|
|
|
|
|
1846
|
$pos - $self->{buffer_offset}; |
317
|
|
|
|
|
|
|
|
318
|
1005
|
|
|
|
|
1095
|
$self->{buffer_offset} = $pos; |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# if we're at the end of the block, load next |
321
|
1005
|
50
|
|
|
|
1402
|
$self->_load_block if ($pos == $self->{buffer_len}); |
322
|
|
|
|
|
|
|
|
323
|
1005
|
|
|
|
|
1100
|
$self->{u_offset} += length($data); |
324
|
|
|
|
|
|
|
|
325
|
1005
|
|
|
|
|
1124
|
last; |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
} |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# otherwise, add rest of block to data and load next block |
330
|
4
|
|
|
|
|
9
|
$data .= substr $self->{buffer}, $self->{buffer_offset}; |
331
|
4
|
|
|
|
|
9
|
$self->_load_block; |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
1022
|
100
|
|
|
|
2105
|
return length($data) > 0 ? $data : undef; |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
sub write_index { |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
# index format (as defined by htslib) consists of little-endian |
342
|
|
|
|
|
|
|
# int64-coded values. The first value is the number of offsets in the |
343
|
|
|
|
|
|
|
# index. The rest of the values consist of pairs of block offsets relative |
344
|
|
|
|
|
|
|
# to the compressed and uncompressed data. The first offset (always 0,0) |
345
|
|
|
|
|
|
|
# is not included. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
348
|
|
|
|
|
|
|
# ARG 0 : index output filename |
349
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
350
|
|
|
|
|
|
|
# No returns |
351
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
352
|
|
|
|
|
|
|
|
353
|
3
|
|
|
3
|
1
|
13
|
my ($self, $fn_out) = @_; |
354
|
|
|
|
|
|
|
|
355
|
3
|
50
|
|
|
|
7
|
croak "missing index output filename" if (! defined $fn_out); |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
|
358
|
3
|
50
|
|
|
|
49
|
$self->_generate_index() if (! defined $self->{idx}); |
359
|
3
|
|
|
|
|
6
|
my @offsets = @{ $self->{idx} }; |
|
3
|
|
|
|
|
8
|
|
360
|
3
|
|
|
|
|
14
|
shift @offsets; # don't write first |
361
|
|
|
|
|
|
|
|
362
|
3
|
|
|
|
|
151
|
open my $fh_out, '>:raw', $fn_out; |
363
|
|
|
|
|
|
|
|
364
|
3
|
|
|
|
|
7
|
print {$fh_out} pack('Q<', scalar(@offsets)); |
|
3
|
|
|
|
|
19
|
|
365
|
3
|
|
|
|
|
8
|
for (@offsets) { |
366
|
8
|
|
|
|
|
10
|
print {$fh_out} pack('Q<', $_->[0]); |
|
8
|
|
|
|
|
12
|
|
367
|
8
|
|
|
|
|
8
|
print {$fh_out} pack('Q<', $_->[1]); |
|
8
|
|
|
|
|
17
|
|
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
3
|
|
|
|
|
264
|
close $fh_out; |
371
|
|
|
|
|
|
|
|
372
|
3
|
|
|
|
|
21
|
return; |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
sub _load_index { |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
379
|
|
|
|
|
|
|
# ARG 0 : index input filename |
380
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
381
|
|
|
|
|
|
|
# No returns |
382
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
383
|
|
|
|
|
|
|
|
384
|
3
|
|
|
3
|
|
6
|
my ($self, $fn_in) = @_; |
385
|
3
|
50
|
|
|
|
7
|
croak "missing index input filename" if (! defined $fn_in); |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
#TODO: speed up index parsing |
388
|
|
|
|
|
|
|
|
389
|
3
|
50
|
|
|
|
76
|
open my $fh_in, '<:raw', $fn_in or croak "error opening index"; |
390
|
3
|
50
|
|
|
|
42
|
read( $fh_in, my $n_offsets, 8) |
391
|
|
|
|
|
|
|
or croak "failed to read first quad"; |
392
|
3
|
|
|
|
|
13
|
$n_offsets = unpack 'Q<', $n_offsets; |
393
|
3
|
|
|
|
|
6
|
my @idx; |
394
|
3
|
|
|
|
|
9
|
for (0..$n_offsets-1) { |
395
|
8
|
50
|
|
|
|
17
|
read( $fh_in, my $buff, 16) or croak "error reading index"; |
396
|
8
|
|
|
|
|
20
|
$idx[$_] = [ unpack 'Q
|
397
|
|
|
|
|
|
|
} |
398
|
3
|
|
|
|
|
21
|
close $fh_in; |
399
|
3
|
|
|
|
|
10
|
unshift @idx, [0,0]; # add initial offsets |
400
|
|
|
|
|
|
|
|
401
|
3
|
|
|
|
|
7
|
$self->{u_file_size} = $idx[-1]->[1]; |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# some indices created by htslib bgzip are missing last offset pair |
404
|
|
|
|
|
|
|
# check for that here by loading last block in index and proceeding from |
405
|
|
|
|
|
|
|
# there. Also calculate uncompressed file size at the same time. |
406
|
3
|
|
|
|
|
4
|
my $c_size = $idx[-1]->[0]; |
407
|
3
|
|
|
|
|
14
|
sysseek $self->{fh}, $idx[-1]->[0], 0; |
408
|
3
|
|
|
|
|
11
|
my ($c, $u) = $self->_unpack_block(0); |
409
|
3
|
|
|
|
|
6
|
$self->{u_file_size} += $u; |
410
|
3
|
|
|
|
|
3
|
$c_size += $c; |
411
|
3
|
|
|
|
|
8
|
while ($c_size < $self->{file_size}) { |
412
|
0
|
|
|
|
|
0
|
push @idx, [$idx[-1]->[0]+$c, $idx[-1]->[1]+$u]; |
413
|
0
|
|
|
|
|
0
|
sysseek $self->{fh}, $idx[-1]->[0], 0; |
414
|
0
|
|
|
|
|
0
|
($c, $u) = $self->_unpack_block(0); |
415
|
0
|
|
|
|
|
0
|
$self->{u_file_size} += $u; |
416
|
0
|
|
|
|
|
0
|
$c_size += $c; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
croak "Unexpected file size/last index mismatch ($c_size v $self->{file_size})" |
419
|
3
|
50
|
|
|
|
7
|
if ($c_size != $self->{file_size}); |
420
|
|
|
|
|
|
|
|
421
|
3
|
|
|
|
|
6
|
$self->{idx} = [@idx]; |
422
|
3
|
|
|
|
|
17
|
$self->{ridx}->{$_->[0]} = $_->[1] for (@idx); |
423
|
|
|
|
|
|
|
|
424
|
3
|
|
|
|
|
16
|
sysseek $self->{fh}, $self->{block_offset}, 0; |
425
|
|
|
|
|
|
|
|
426
|
3
|
|
|
|
|
14
|
return; |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
sub _generate_index { |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
433
|
|
|
|
|
|
|
# No arguments |
434
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
435
|
|
|
|
|
|
|
# No returns |
436
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
437
|
|
|
|
|
|
|
|
438
|
3
|
|
|
3
|
|
7
|
my ($self) = @_; |
439
|
|
|
|
|
|
|
|
440
|
3
|
|
|
|
|
4
|
my $uncmp_offset = 0; |
441
|
3
|
|
|
|
|
4
|
my $cmp_offset = 0; |
442
|
3
|
|
|
|
|
5
|
my $i = 0; |
443
|
3
|
|
|
|
|
5
|
$self->{u_file_size} = 0; |
444
|
3
|
|
|
|
|
6
|
$self->{idx} = []; |
445
|
3
|
|
|
|
|
6
|
$self->{ridx} = {}; |
446
|
|
|
|
|
|
|
|
447
|
3
|
|
|
|
|
17
|
sysseek $self->{fh}, 0, 0; |
448
|
|
|
|
|
|
|
|
449
|
3
|
|
|
|
|
11
|
while ($cmp_offset < $self->{file_size}) { |
450
|
|
|
|
|
|
|
|
451
|
8
|
|
|
|
|
14
|
push @{$self->{idx}}, [$cmp_offset, $uncmp_offset]; |
|
8
|
|
|
|
|
17
|
|
452
|
8
|
|
|
|
|
19
|
$self->{ridx}->{$cmp_offset} = $uncmp_offset; |
453
|
|
|
|
|
|
|
|
454
|
8
|
|
|
|
|
17
|
my ($block_size, $uncompressed_size) = $self->_unpack_block(0); |
455
|
|
|
|
|
|
|
|
456
|
8
|
|
|
|
|
12
|
$cmp_offset += $block_size; |
457
|
8
|
|
|
|
|
10
|
$uncmp_offset += $uncompressed_size; |
458
|
8
|
|
|
|
|
56
|
$self->{u_file_size} += $uncompressed_size; |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
} |
461
|
|
|
|
|
|
|
|
462
|
3
|
|
|
|
|
14
|
sysseek $self->{fh}, $self->{block_offset}, 0; |
463
|
|
|
|
|
|
|
|
464
|
3
|
|
|
|
|
9
|
return; |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
} |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
sub move_to { |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
# Wrapper around _seek(), avoids conflicts with system seek() |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
474
|
|
|
|
|
|
|
# ARG 0 : byte offset to which to seek |
475
|
|
|
|
|
|
|
# ARG 1 : relativity of offset (0: file start, 1: current, 2: file end) |
476
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
477
|
|
|
|
|
|
|
# no returns |
478
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
|
481
|
12
|
|
|
12
|
1
|
3677
|
my ($self, @args) = @_; |
482
|
12
|
|
|
|
|
31
|
$self->_seek( @args ); |
483
|
|
|
|
|
|
|
|
484
|
12
|
|
|
|
|
22
|
return; |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
} |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
sub _seek { |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
492
|
|
|
|
|
|
|
# ARG 0 : byte offset to which to seek |
493
|
|
|
|
|
|
|
# ARG 1 : relativity of offset (0: file start, 1: current, 2: file end) |
494
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
495
|
|
|
|
|
|
|
# no returns |
496
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
497
|
|
|
|
|
|
|
|
498
|
54
|
|
|
54
|
|
90
|
my ($self, $pos, $whence) = @_; |
499
|
|
|
|
|
|
|
|
500
|
54
|
100
|
|
|
|
110
|
$pos += $self->{u_offset} if ($whence == 1); |
501
|
54
|
100
|
|
|
|
94
|
$pos = $self->{u_file_size} + $pos if ($whence == 2); |
502
|
|
|
|
|
|
|
|
503
|
54
|
100
|
|
|
|
92
|
return if ($pos < 0); |
504
|
47
|
100
|
|
|
|
90
|
if ($pos >= $self->{u_file_size}) { |
505
|
15
|
|
|
|
|
24
|
$self->{buffer_len} = -1; |
506
|
15
|
|
|
|
|
35
|
$self->{u_offset} = $pos; |
507
|
15
|
|
|
|
|
20
|
$self->{block_offset} = $pos; |
508
|
15
|
|
|
|
|
31
|
return 1; |
509
|
|
|
|
|
|
|
} |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# Do seeded search for nearest block start <= $pos |
512
|
|
|
|
|
|
|
# (although we don't know the size of each block, we can determine the |
513
|
|
|
|
|
|
|
# mean length and usually choose a starting value close to the actual - |
514
|
|
|
|
|
|
|
# benchmarks much faster than binary search) |
515
|
|
|
|
|
|
|
# TODO: benchmark whether breaking this out and Memoizing speeds things up |
516
|
|
|
|
|
|
|
|
517
|
32
|
|
|
|
|
34
|
my $s = scalar @{$self->{idx}}; |
|
32
|
|
|
|
|
49
|
|
518
|
32
|
|
|
|
|
81
|
my $idx = int($pos/($self->{u_file_size}) * $s); |
519
|
32
|
|
|
|
|
41
|
while (1) { |
520
|
35
|
100
|
|
|
|
63
|
if ($pos < $self->{idx}->[$idx]->[1]) { |
521
|
3
|
|
|
|
|
4
|
--$idx; |
522
|
3
|
|
|
|
|
4
|
next; |
523
|
|
|
|
|
|
|
} |
524
|
32
|
50
|
66
|
|
|
100
|
if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) { |
525
|
0
|
|
|
|
|
0
|
++$idx; |
526
|
0
|
|
|
|
|
0
|
next; |
527
|
|
|
|
|
|
|
} |
528
|
32
|
|
|
|
|
44
|
last; |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
|
531
|
32
|
|
|
|
|
38
|
my $block_o = $self->{idx}->[$idx]->[0]; |
532
|
32
|
|
|
|
|
40
|
my $block_o_u = $self->{idx}->[$idx]->[1]; |
533
|
32
|
|
|
|
|
43
|
my $buff_o = $pos - $block_o_u; |
534
|
|
|
|
|
|
|
|
535
|
32
|
|
|
|
|
61
|
$self->_load_block( $block_o ); |
536
|
32
|
|
|
|
|
43
|
$self->{buffer_offset} = $buff_o; |
537
|
32
|
|
|
|
|
40
|
$self->{u_offset} = $block_o_u + $buff_o; |
538
|
|
|
|
|
|
|
|
539
|
32
|
|
|
|
|
71
|
return 1; |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
sub get_vo { |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
546
|
|
|
|
|
|
|
# no arguments |
547
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
548
|
|
|
|
|
|
|
# RET 0 : virtual offset of current position |
549
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
550
|
|
|
|
|
|
|
|
551
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
552
|
0
|
|
|
|
|
0
|
return ($self->{block_offset} << 16) | $self->{buffer_offset}; |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
sub move_to_vo { |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
559
|
|
|
|
|
|
|
# ARG 0 : virtual offset (see POD for definition) |
560
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
561
|
|
|
|
|
|
|
# no returns |
562
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
563
|
|
|
|
|
|
|
|
564
|
0
|
|
|
0
|
1
|
0
|
my ($self, $vo) = @_; |
565
|
0
|
|
|
|
|
0
|
my $block_o = $vo >> 16; |
566
|
0
|
|
|
|
|
0
|
my $buff_o = $vo ^ ($block_o << 16); |
567
|
0
|
|
|
|
|
0
|
$self->_load_block( $block_o ); |
568
|
0
|
|
|
|
|
0
|
$self->{buffer_offset} = $buff_o; |
569
|
0
|
0
|
|
|
|
0
|
croak "invalid block offset" if (! defined $self->{ridx}->{$block_o}); |
570
|
0
|
|
|
|
|
0
|
$self->{u_offset} = $self->{ridx}->{$block_o} + $buff_o; |
571
|
|
|
|
|
|
|
|
572
|
0
|
|
|
|
|
0
|
return; |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
} |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
sub _safe_sysread { |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# sysread wrapper that checks return count and returns read |
579
|
|
|
|
|
|
|
# (internally we should never read off end of file - doing so indicates |
580
|
|
|
|
|
|
|
# either a software bug or a corrupt input file so we croak) |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
583
|
|
|
|
|
|
|
# ARG 0 : bytes to read |
584
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
585
|
|
|
|
|
|
|
# RET 0 : string read |
586
|
|
|
|
|
|
|
#------------------------------------------------------------------------- |
587
|
|
|
|
|
|
|
|
588
|
242
|
|
|
242
|
|
332
|
my ($fh, $len) = @_; |
589
|
242
|
|
|
|
|
271
|
my $buf = ''; |
590
|
242
|
|
|
|
|
1571
|
my $r = sysread $fh, $buf, $len; |
591
|
242
|
50
|
|
|
|
567
|
croak "returned unexpected byte count" if ($r != $len); |
592
|
|
|
|
|
|
|
|
593
|
242
|
|
|
|
|
731
|
return $buf; |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
} |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
1; |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
__END__ |