File Coverage

blib/lib/Compress/BGZF/Reader.pm
Criterion Covered Total %
statement 262 264 99.2
branch 74 98 75.5
condition 11 15 73.3
subroutine 34 34 100.0
pod 10 10 100.0
total 391 421 92.8


line stmt bran cond sub pod time code
1             package Compress::BGZF::Reader;
2              
3 3     3   7612 use strict;
  3         11  
  3         144  
4 3     3   17 use warnings;
  3         4  
  3         213  
5              
6 3     3   18 use Carp;
  3         4  
  3         199  
7 3     3   1873 use Compress::Zlib;
  3         226707  
  3         905  
8 3     3   40 use List::Util qw/sum/;
  3         7  
  3         277  
9 3     3   2043 use FileHandle;
  3         2996  
  3         16  
10              
11 3     3   1093 use constant BGZF_MAGIC => pack "H*", '1f8b0804';
  3         4  
  3         239  
12 3     3   16 use constant HEAD_BYTES => 12; # not including extra fields
  3         5  
  3         130  
13 3     3   12 use constant FOOT_BYTES => 8;
  3         20  
  3         9071  
14              
15             ## no critic
16             # allow for filehandle tie'ing
17 4     4   35 sub TIEHANDLE { Compress::BGZF::Reader::new(@_) }
18 39     39   1318 sub READ { Compress::BGZF::Reader::_read(@_) }
19 1013     1013   3218 sub READLINE { Compress::BGZF::Reader::getline(@_) }
20 42     42   10320 sub SEEK { Compress::BGZF::Reader::_seek(@_) }
21 3     3   2061 sub CLOSE { close $_[0]->{fh} }
22 3     3   1204 sub TELL { return $_[0]->{u_offset} }
23 3     3   812 sub EOF { return $_[0]->{buffer_len} == -1 }
24 3     3   74 sub FILENO { return fileno $_[0]->{fh} }
25              
26             # accessors
27 6     6 1 2037 sub usize { return $_[0]->{u_file_size} };
28              
29             ## use critic
30              
31             sub new_filehandle {
32              
33             #-------------------------------------------------------------------------
34             # ARG 0 : BGZF input filename
35             #-------------------------------------------------------------------------
36             # RET 0 : Filehandle GLOB (internally an IO::File object tied to
37             # Compress::BGZF::Reader)
38             #-------------------------------------------------------------------------
39              
40 5     5 1 772 my ($class, $fn_in) = @_;
41 5 100       148 croak "Input filename required" if (! defined $fn_in);
42 4         32 my $fh = FileHandle->new;
43 4 50       245 tie *$fh, $class, $fn_in or croak "failed to tie filehandle";
44             #tie *FH, $class, $fn_in or croak "failed to tie filehandle";
45              
46 3         8 return $fh;
47             #return \*FH;
48              
49             }
50              
51             sub new {
52              
53             #-------------------------------------------------------------------------
54             # ARG 0 : BGZF input filename
55             #-------------------------------------------------------------------------
56             # RET 0 : Compress::BGZF::Reader object
57             #-------------------------------------------------------------------------
58            
59 14     14 1 9836 my ($class, $fn_in) = @_;
60 14         32 my $self = bless {}, $class;
61              
62             # initialize
63 14 100       145 $self->{fn_in} = $fn_in
64             or croak "Input filename required";
65              
66             # open compressed file in binary mode
67 13 100       748 open my $fh, '<:raw', $fn_in or croak "Failed to open input file"; ## no critic
68 11         34 $self->{fh} = $fh;
69              
70             # these member variables allow for data extraction
71 11         26 $self->{buffer} = ''; # contents of currently uncompressed block
72 11         21 $self->{buffer_len} = 0; # save time on frequent length() calls
73 11         19 $self->{block_offset} = 0; # offset of current block in compressed data
74 11         16 $self->{buffer_offset} = 0; # offset of current pos in uncompressed block
75 11         23 $self->{block_size} = 0; # compressed size of current block
76 11         106 $self->{file_size} = -s $fn_in; # size of compressed file
77              
78             # these variables are tracked to allow for full seek implementation
79 11         20 $self->{u_offset} = 0; # calculated current uncompressed offset
80 11         20 $self->{u_file_size} = 0; # size of uncompressed file (filled during
81             # indexing
82              
83             # load offset index
84 11 100       281 if (-e "$fn_in.gzi") {
85 6         28 $self->_load_index( "$fn_in.gzi" ); # from disk
86             }
87             else {
88 5         25 $self->_generate_index(); # on-the-fly
89             }
90              
91             # load initial block
92 9         33 $self->_load_block();
93              
94 9         40 return $self;
95              
96             }
97              
98             sub _load_block {
99            
100             #-------------------------------------------------------------------------
101             # ARG 0 : offset of block start in compressed file
102             #-------------------------------------------------------------------------
103             # no returns
104             #-------------------------------------------------------------------------
105              
106 100     100   210 my ($self, $block_offset) = @_;
107              
108             # loading a block should always reset buffer offset
109 100         135 $self->{buffer_offset} = 0;
110              
111             # avoid reload of current block
112             return if (defined $block_offset
113 100 100 100     295 && $block_offset == $self->{block_offset});
114              
115             # if no offset given (e.g. sequential reads), move to next block
116 76 100       133 if (! defined $block_offset) {
117 32         54 $block_offset = $self->{block_offset} + $self->{block_size};
118             }
119              
120 76         106 $self->{block_offset} = $block_offset;
121             # deal with EOF
122             croak "Read past file end (perhaps corrupted/truncated input?)"
123 76 50       162 if ($self->{block_offset} > $self->{file_size});
124 76 100       136 if ($self->{block_offset} == $self->{file_size}) {
125 14         27 $self->{buffer} = '';
126 14         19 $self->{buffer_len} = -1;
127 14         30 return;
128             }
129              
130             # never assume we're already there
131 62         313 sysseek $self->{fh}, $self->{block_offset}, 0;
132              
133             # parse block according to GZIP spec, including content inflation
134 62         134 my ($block_size, $uncompressed_size, $content)
135             = $self->_unpack_block(1);
136 62         129 $self->{block_size} = $block_size;
137 62         102 $self->{buffer_len} = $uncompressed_size;
138 62         126 $self->{buffer} = $content;
139              
140 62         111 return;
141              
142             }
143              
144             sub _unpack_block {
145              
146             #-------------------------------------------------------------------------
147             # ARG 0 : bool indicating whether to inflate (and return) actual payload
148             #-------------------------------------------------------------------------
149             # RET 0 : compressed block size
150             # RET 1 : uncompressed content size
151             # RET 2 : content (if ARG 0)
152             #-------------------------------------------------------------------------
153              
154 99     99   154 my ($self, $do_unpack) = @_;
155              
156 99         123 my @return_values;
157              
158              
159             my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv',
160 99         191 _safe_sysread($self->{fh}, HEAD_BYTES);
161 99         307 my $t = sysseek $self->{fh}, 0, 1;
162 99 100       301 croak "invalid header at $t (corrupt file or not BGZF?)"
163             if ($magic ne BGZF_MAGIC);
164              
165             # extract stated block size according to BGZF spec
166 98         120 my $block_size;
167 98         122 my $l = 0;
168 98         172 while ($l < $len_extra) {
169             my ($field_id, $field_len) = unpack 'A2v',
170 98         146 _safe_sysread($self->{fh}, 4);
171 98 50       234 if ($field_id eq 'BC') {
172 98 50       166 croak "invalid BC length" if ($field_len != 2);
173 98 50       153 croak "multiple BC fields" if (defined $block_size);
174             $block_size = unpack 'v',
175 98         170 _safe_sysread($self->{fh} => $field_len);
176 98         151 $block_size += 1; # convert to 1-based
177             }
178 98         182 $l += 4 + $field_len;
179             }
180 98 50       178 croak "invalid extra field length" if ($l != $len_extra);
181 98 50       151 croak "failed to read block size" if (! defined $block_size);
182              
183 98         180 push @return_values, $block_size;
184 98         137 my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_extra;
185 98         110 my $content;
186 98 100       174 if ($do_unpack) {
187            
188             # decode actual content
189 62         103 my $payload = _safe_sysread($self->{fh}, $payload_len);
190 62         228 my ($i,$status) = inflateInit(-WindowBits => -&MAX_WBITS());
191 62 50       9353 croak "Error during inflate init\n" if ($status != Z_OK);
192 62         288 ($content,$status) = $i->inflate($payload);
193 62 50       10453 croak "Error during inflate run\n" if ($status != Z_STREAM_END);
194             #rawinflate( \$payload => \$content )
195             #or croak "Error inflating: $RawInflateError\n";
196 62         352 my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4);
197 62 50       10949 croak "content CRC32 mismatch" if ( $crc_given != crc32($content) );
198              
199             }
200 36         116 else { sysseek $self->{fh}, $payload_len + 4, 1; }
201              
202             # unpack and possibly check uncompressed payload size
203 98         204 my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4);
204 97 50 66     461 croak "content length mismatch"
205             if ( defined $content && $size_given != length($content) );
206 97         132 push @return_values, $size_given;
207 97 100       190 push @return_values, $content if (defined $content);
208              
209 97         290 return @return_values;
210              
211             }
212              
213             sub read_data {
214              
215             # More OO-ish wrapper around _read(), avoids conflicts with system read()
216              
217             #-------------------------------------------------------------------------
218             # ARG 0 : number of bytes to read
219             #-------------------------------------------------------------------------
220             # RET 0 : data read
221             #-------------------------------------------------------------------------
222              
223 6     6 1 61 my ($self, $bytes) = @_;
224              
225 6         18 my $r = $self->_read( my $buffer, $bytes );
226             carp "received fewer bytes than requested"
227 6 50 33     26 if ($r < $bytes && $self->{buffer_len} > -1);
228            
229 6 50       13 $buffer = undef if ( $r < 1 );
230 6         17 return $buffer;
231            
232             }
233              
234             sub _read {
235              
236             #-------------------------------------------------------------------------
237             # ARG 0 : buffer to write to
238             # ARG 1 : bytes to attempt to read
239             # ARG 3 : (optional) offset in buffer to start write (default: 0)
240             #-------------------------------------------------------------------------
241             # RET 0 : bytes read (0 at EOF, undef on error)
242             #-------------------------------------------------------------------------
243              
244             # we try to emulate the built-in 'read' call, so we will
245             # overwrite $_[1] and return the number of bytes read. To facilitate this,
246             # make $buf a reference to the buffer passed
247 45     45   65 my $self = shift;
248 45         60 my $buf = \shift; # must be a reference !!
249 45         54 my $bytes = shift;
250 45         47 my $offset = shift;
251              
252             # handle cases when $offset is passed in
253 45         62 my $prefix = '';
254 45 100 100     160 if (defined $offset && $offset != 0) {
255 12         31 $prefix = substr $$buf, 0, $offset;
256 12 100       46 $prefix .= "\0" x ( $offset - length($$buf) )
257             if ( $offset > length($$buf) );
258             }
259              
260 45         70 $$buf = ''; # reset (only AFTER grabbing any prefix above)
261              
262             ITER:
263 45         76 while (length($$buf) < $bytes) {
264              
265 64         69 my $l = length($$buf);
266 64         76 my $remaining = $bytes - $l;
267              
268             # if read is within current block
269 64 100       139 if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}) {
270 19         86 $$buf .= substr $self->{buffer}, $self->{buffer_offset}, $remaining;
271 19         24 $self->{buffer_offset} += $remaining;
272             $self->_load_block()
273 19 50       48 if ($self->{buffer_offset} == $self->{buffer_len});
274             }
275             else {
276 45 100       97 last ITER if ($self->{buffer_len} < 0); #EOF
277 19         300 $$buf .= substr $self->{buffer}, $self->{buffer_offset};
278 19         42 $self->_load_block();
279             }
280              
281             }
282              
283 45         50 my $l = length($$buf);
284 45         60 $self->{u_offset} += $l;
285 45         110 $$buf = $prefix . $$buf;
286              
287 45         95 return $l;
288              
289             }
290              
291              
292             sub getline {
293              
294             #-------------------------------------------------------------------------
295             # No arguments
296             #-------------------------------------------------------------------------
297             # RET 0 : string read (undef at EOF)
298             #-------------------------------------------------------------------------
299              
300 1055     1055 1 1504 my ($self) = @_;
301              
302 1055         1013 my $data = '';
303              
304 1055         942 while (1) {
305              
306             # return immediately if EOF
307 1059 100       1379 last if ($self->{buffer_len} < 0);
308              
309             # search current block for record separator
310             # start searching from the current buffer offset
311 1036         1521 pos($self->{buffer}) = $self->{buffer_offset};
312              
313 1036 100       2396 if ($self->{buffer} =~ m|$/|g) {
314 1032         1046 my $pos = pos $self->{buffer};
315             $data .= substr $self->{buffer}, $self->{buffer_offset},
316 1032         1704 $pos - $self->{buffer_offset};
317              
318 1032         1089 $self->{buffer_offset} = $pos;
319              
320             # if we're at the end of the block, load next
321 1032 50       1368 $self->_load_block if ($pos == $self->{buffer_len});
322              
323 1032         1089 $self->{u_offset} += length($data);
324              
325 1032         1018 last;
326              
327             }
328              
329             # otherwise, add rest of block to data and load next block
330 4         8 $data .= substr $self->{buffer}, $self->{buffer_offset};
331 4         13 $self->_load_block;
332              
333             }
334              
335 1055 100       1801 return length($data) > 0 ? $data : undef;
336              
337             }
338              
339             sub write_index {
340              
341             # index format (as defined by htslib) consists of little-endian
342             # int64-coded values. The first value is the number of offsets in the
343             # index. The rest of the values consist of pairs of block offsets relative
344             # to the compressed and uncompressed data. The first offset (always 0,0)
345             # is not included.
346              
347             #-------------------------------------------------------------------------
348             # ARG 0 : index output filename
349             #-------------------------------------------------------------------------
350             # No returns
351             #-------------------------------------------------------------------------
352              
353 6     6 1 995 my ($self, $fn_out) = @_;
354              
355 6 50       15 croak "missing index output filename" if (! defined $fn_out);
356              
357 6 50       16 $self->_generate_index() if (! defined $self->{idx});
358 6         9 my @offsets = @{ $self->{idx} };
  6         15  
359 6         9 shift @offsets; # don't write first
360              
361 6         453 open my $fh_out, '>:raw', $fn_out;
362              
363 6         16 print {$fh_out} pack('Q<', scalar(@offsets));
  6         38  
364 6         15 for (@offsets) {
365 26         28 print {$fh_out} pack('Q<', $_->[0]);
  26         48  
366 26         30 print {$fh_out} pack('Q<', $_->[1]);
  26         53  
367             }
368              
369 6         732 close $fh_out;
370              
371 6         55 return;
372              
373             }
374              
375             sub rebuild_index {
376              
377             # force rebuilding of index
378             # (added as a public alias of _generate_index() )
379 3     3 1 945 my ($self) = @_;
380 3         10 return $self->_generate_index;
381              
382             }
383              
384             sub _load_index {
385              
386             #-------------------------------------------------------------------------
387             # ARG 0 : index input filename
388             #-------------------------------------------------------------------------
389             # No returns
390             #-------------------------------------------------------------------------
391              
392 6     6   11 my ($self, $fn_in) = @_;
393 6 50       15 croak "missing index input filename" if (! defined $fn_in);
394              
395             #TODO: speed up index parsing
396              
397 6 50       171 open my $fh_in, '<:raw', $fn_in or croak "error opening index";
398 6 50       166 read( $fh_in, my $n_offsets, 8)
399             or croak "failed to read first quad";
400 6         39 $n_offsets = unpack 'Q<', $n_offsets;
401 6         12 my @idx;
402 6         23 for (0..$n_offsets-1) {
403 26 50       54 read( $fh_in, my $buff, 16) or croak "error reading index";
404 26         65 $idx[$_] = [ unpack 'Q
405             }
406 6         60 close $fh_in;
407 6         20 unshift @idx, [0,0]; # add initial offsets
408              
409             # some indices created by htslib bgzip are missing last offset pair
410             # check for that here by loading last block in index and proceeding from
411             # there. Also calculate uncompressed file size at the same time.
412             # EDIT 2024-02-09: But htslib uses empty last block as EOF indicator,
413             # so only add extra blocks to index if they are not empty.
414 6         12 my $c_size = $idx[-1]->[0];
415 6         9 my $u_size = $idx[-1]->[1];
416              
417 6         29 sysseek $self->{fh}, $c_size, 0;
418 6         23 my ($c, $u) = $self->_unpack_block(0);
419 6         9 $c_size += $c;
420 6         10 $u_size += $u;
421 6         27 $self->{u_file_size} = $u_size; # should be correct at this point
422              
423 6         13 while ($c_size < $self->{file_size}) {
424 4         16 sysseek $self->{fh}, $c_size, 0;
425 4         10 ($c, $u) = $self->_unpack_block(0);
426 4 50       10 push @idx, [$c_size, $u_size]
427             if ($u > 0);
428 4         9 $c_size += $c;
429 4         6 $u_size += $u;
430 4         10 $self->{u_file_size} = $u_size;
431             }
432              
433             croak "Unexpected file size/last index mismatch ($c_size v $self->{file_size})"
434 6 50       13 if ($c_size != $self->{file_size});
435              
436 6         30 $self->{idx} = [@idx];
437 6         59 $self->{ridx}->{$_->[0]} = $_->[1] for (@idx);
438              
439 6         26 sysseek $self->{fh}, $self->{block_offset}, 0;
440              
441 6         32 return;
442              
443             }
444              
445             sub _generate_index {
446              
447             #-------------------------------------------------------------------------
448             # No arguments
449             #-------------------------------------------------------------------------
450             # No returns
451             #-------------------------------------------------------------------------
452              
453 8     8   19 my ($self) = @_;
454              
455 8         15 my $uncmp_offset = 0;
456 8         14 my $cmp_offset = 0;
457 8         12 my $i = 0;
458 8         16 $self->{u_file_size} = 0;
459 8         33 $self->{idx} = [];
460 8         21 $self->{ridx} = {};
461              
462 8         42 sysseek $self->{fh}, 0, 0;
463              
464 8         32 while ($cmp_offset < $self->{file_size}) {
465              
466 27         59 $self->{ridx}->{$cmp_offset} = $uncmp_offset;
467 27         54 my ($block_size, $uncompressed_size) = $self->_unpack_block(0);
468              
469             # don't index empty blocks
470             # (should only be final EOF block in practice)
471 25 100       44 push @{$self->{idx}}, [$cmp_offset, $uncmp_offset]
  24         50  
472             if ($uncompressed_size > 0);
473              
474 25         32 $cmp_offset += $block_size;
475 25         30 $uncmp_offset += $uncompressed_size;
476 25         44 $self->{u_file_size} += $uncompressed_size;
477              
478             }
479              
480 6         20 sysseek $self->{fh}, $self->{block_offset}, 0;
481              
482 6         36 return;
483              
484             }
485              
486              
487             sub move_to {
488              
489             # Wrapper around _seek(), avoids conflicts with system seek()
490              
491             #-------------------------------------------------------------------------
492             # ARG 0 : byte offset to which to seek
493             # ARG 1 : relativity of offset (0: file start, 1: current, 2: file end)
494             #-------------------------------------------------------------------------
495             # no returns
496             #-------------------------------------------------------------------------
497              
498            
499 48     48 1 17543 my ($self, @args) = @_;
500 48         142 $self->_seek( @args );
501              
502 48         148 return;
503              
504             }
505              
506              
507             sub _seek {
508              
509             #-------------------------------------------------------------------------
510             # ARG 0 : byte offset to which to seek
511             # ARG 1 : relativity of offset (0: file start, 1: current, 2: file end)
512             #-------------------------------------------------------------------------
513             # no returns
514             #-------------------------------------------------------------------------
515              
516 90     90   153 my ($self, $pos, $whence) = @_;
517              
518 90 100       187 $pos += $self->{u_offset} if ($whence == 1);
519 90 100       166 $pos = $self->{u_file_size} + $pos if ($whence == 2);
520              
521 90 100       166 return if ($pos < 0);
522 80 100       165 if ($pos >= $self->{u_file_size}) {
523 18         28 $self->{buffer_len} = -1;
524 18         31 $self->{u_offset} = $pos;
525 18         26 $self->{block_offset} = $pos;
526 18         61 return 1;
527             }
528              
529             # Do seeded search for nearest block start <= $pos
530             # (although we don't know the size of each block, we can determine the
531             # mean length and usually choose a starting value close to the actual -
532             # benchmarks much faster than binary search)
533             # TODO: benchmark whether breaking this out and Memoizing speeds things up
534              
535 62         68 my $s = scalar @{$self->{idx}};
  62         108  
536 62         155 my $idx = int($pos/($self->{u_file_size}) * $s);
537 62         77 while (1) {
538 64 100       138 if ($pos < $self->{idx}->[$idx]->[1]) {
539 2         3 --$idx;
540 2         4 next;
541             }
542 62 50 66     202 if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) {
543 0         0 ++$idx;
544 0         0 next;
545             }
546 62         87 last;
547             }
548              
549 62         105 my $block_o = $self->{idx}->[$idx]->[0];
550 62         81 my $block_o_u = $self->{idx}->[$idx]->[1];
551 62         85 my $buff_o = $pos - $block_o_u;
552              
553 62         150 $self->_load_block( $block_o );
554 62         96 $self->{buffer_offset} = $buff_o;
555 62         88 $self->{u_offset} = $block_o_u + $buff_o;
556              
557 62         162 return 1;
558              
559             }
560              
561             sub get_vo {
562              
563             #-------------------------------------------------------------------------
564             # no arguments
565             #-------------------------------------------------------------------------
566             # RET 0 : virtual offset of current position
567             #-------------------------------------------------------------------------
568              
569 6     6 1 75 my ($self) = @_;
570 6         18 return ($self->{block_offset} << 16) | $self->{buffer_offset};
571              
572             }
573              
574             sub move_to_vo {
575              
576             #-------------------------------------------------------------------------
577             # ARG 0 : virtual offset (see POD for definition)
578             #-------------------------------------------------------------------------
579             # no returns
580             #-------------------------------------------------------------------------
581              
582 12     12 1 435 my ($self, $vo) = @_;
583 12         22 my $block_o = $vo >> 16;
584             croak "Invalid block offset"
585 12 100       687 if (! defined $self->{ridx}->{$block_o});
586 6         12 my $buff_o = $vo ^ ($block_o << 16);
587 6         14 $self->_load_block( $block_o );
588 6         11 $self->{buffer_offset} = $buff_o;
589 6         15 $self->{u_offset} = $self->{ridx}->{$block_o} + $buff_o;
590              
591 6         41 return;
592              
593             }
594              
595             sub _safe_sysread {
596              
597             # sysread wrapper that checks return count and returns read
598             # (internally we should never read off end of file - doing so indicates
599             # either a software bug or a corrupt input file so we croak)
600              
601             #-------------------------------------------------------------------------
602             # ARG 0 : bytes to read
603             #-------------------------------------------------------------------------
604             # RET 0 : string read
605             #-------------------------------------------------------------------------
606              
607 517     517   723 my ($fh, $len) = @_;
608 517         627 my $buf = '';
609 517         3783 my $r = sysread $fh, $buf, $len;
610 517 100       1030 croak "Returned unexpected byte count" if ($r != $len);
611              
612 516         1389 return $buf;
613              
614             }
615              
616             1;
617              
618              
619             __END__