File Coverage

blib/lib/Compress/BGZF/Writer.pm
Criterion Covered Total %
statement 119 119 100.0
branch 27 40 67.5
condition 5 5 100.0
subroutine 23 23 100.0
pod 7 7 100.0
total 181 194 93.3


line stmt bran cond sub pod time code
1             package Compress::BGZF::Writer;
2              
3 3     3   8527 use strict;
  3         6  
  3         127  
4 3     3   14 use warnings;
  3         11  
  3         242  
5              
6 3     3   19 use Carp;
  3         4  
  3         250  
7 3     3   18 use Compress::Zlib;
  3         30  
  3         1046  
8 3     3   26 use IO::Compress::RawDeflate qw/rawdeflate $RawDeflateError/;
  3         6  
  3         600  
9              
10 3     3   23 use constant HEAD_BYTES => 18;
  3         13  
  3         327  
11 3     3   19 use constant FOOT_BYTES => 8;
  3         6  
  3         292  
12 3     3   21 use constant FLUSH_SIZE => 2**16 - HEAD_BYTES - FOOT_BYTES - 1;
  3         4  
  3         217  
13 3     3   43 use constant BGZF_HEADER => pack "H*", '1f8b08040000000000ff060042430200';
  3         3  
  3         201  
14 3         4622 use constant BGZF_EOF => pack "H*",
15 3     3   17 '1f8b08040000000000ff0600424302001b0003000000000000000000';
  3         6  
16              
17             ## no critic
18             # allow for filehandle tie'ing
19 3     3   14 sub TIEHANDLE { Compress::BGZF::Writer::new(@_) }
20 2015     2015   5753 sub PRINT { Compress::BGZF::Writer::_queue(@_) }
21 3     3   27 sub CLOSE { Compress::BGZF::Writer::finalize(@_) }
22             ## use critic
23              
24             sub new_filehandle {
25              
26             #-------------------------------------------------------------------------
27             # ARG 0 : (optional) output filename
28             #-------------------------------------------------------------------------
29             # RET 0 : filehandle GLOB
30             #-------------------------------------------------------------------------
31              
32 3     3 1 265118 my ($class, $fn_out) = @_;
33              
34 3         553 open my $fh, '<', undef;
35 3 50       35 tie *$fh, $class, $fn_out or croak "failed to tie filehandle";
36 3         53 return $fh;
37              
38             }
39              
40             sub new {
41              
42             #-------------------------------------------------------------------------
43             # ARG 0 : (optional) output filename
44             #-------------------------------------------------------------------------
45             # RET 0 : Compress::BGZF::Writer object
46             #-------------------------------------------------------------------------
47              
48 6     6 1 188389 my ($class, $fn_out) = @_;
49 6         19 my $self = bless {}, $class;
50              
51             # initialize
52 6 100       30 if (defined $fn_out) {
53 5 50       717 open $self->{fh}, '>', $fn_out
54             or croak "Error opening file for writing";
55             }
56             else {
57 1         10 $self->{fh} = \*STDOUT;
58             }
59 6         35 binmode $self->{fh};
60              
61 6         57 $self->{c_level} = Z_DEFAULT_COMPRESSION;
62 6         45 $self->{buffer} = ''; # contents waiting to be compressed/written
63              
64             # these variables are tracked to allow for virtual offset calculation
65 6         19 $self->{block_offset} = 0; # offset of current block in compressed data
66 6         17 $self->{buffer_offset} = 0; # offset of current pos in uncompressed block
67              
68             # these variables are tracked to allow for index creation
69 6         15 $self->{u_offset} = 0; #uncompressed file offset
70 6         26 $self->{idx} = [];
71 6         44 $self->{write_eof} = 0;
72              
73 6         34 return $self;
74              
75             }
76              
77             sub set_level {
78              
79             #-------------------------------------------------------------------------
80             # ARG 0 : compression level desired
81             #-------------------------------------------------------------------------
82             # no returns
83             #-------------------------------------------------------------------------
84              
85 9     9 1 3344 my ($self, $level) = @_;
86              
87 9 100       1034 croak "Invalid compression level (allowed 0-9)"
88             if ($level !~ /^\d$/);
89 3         10 $self->{c_level} = $level;
90              
91 3         6 return;
92              
93             }
94              
95             sub set_write_eof {
96              
97             # Sets whether to include htslib-style EOF empty block at end of file
98              
99             #-------------------------------------------------------------------------
100             # ARG 0 : (optional) boolean
101             #-------------------------------------------------------------------------
102             # no returns
103             #-------------------------------------------------------------------------
104              
105 6     6 1 1534 my ($self, $bool) = @_;
106              
107 6   100     29 $bool //= 1;
108 6 100       20 $self->{write_eof} = $bool ? 1 : 0;
109              
110 6         24 return;
111              
112             }
113              
114             sub add_data {
115              
116             # a wrapper around the queue() function that returns the virtual offset
117             # of the chunk added
118              
119             #-------------------------------------------------------------------------
120             # ARG 0 : data chunk to queue for compression
121             #-------------------------------------------------------------------------
122             # RET 1 : virtual offset of data written
123             #-------------------------------------------------------------------------
124              
125 5855     5855 1 21893 my ($self, $content) = @_;
126              
127 5855         10213 my $vo = ($self->{block_offset} << 16) | $self->{buffer_offset};
128 5855         12654 $self->_queue( $content );
129              
130 5855         14485 return $vo;
131              
132             }
133              
134             sub _queue {
135              
136             #-------------------------------------------------------------------------
137             # ARG 0 : data chunk to queue for compression
138             #-------------------------------------------------------------------------
139             # no returns
140             #-------------------------------------------------------------------------
141              
142 7870     7870   12030 my ($self, $content) = @_;
143              
144 7870         12442 $self->{buffer} .= $content;
145              
146             # compress/write in 64k chunks
147 7870         16849 while (length($self->{buffer}) >= FLUSH_SIZE) {
148              
149 18         543 my $chunk = substr $self->{buffer}, 0, FLUSH_SIZE, '';
150 18         89 my $unwritten = $self->_write_block($chunk);
151             $self->{buffer} = $unwritten . $self->{buffer}
152 18 100       142 if ( length($unwritten) );
153              
154             }
155 7870         12191 $self->{buffer_offset} = length $self->{buffer};
156              
157 7870         13885 return;
158            
159             }
160              
161             sub _write_block {
162              
163             #-------------------------------------------------------------------------
164             # ARG 0 : independent data block to compress
165             #-------------------------------------------------------------------------
166             # RET 0 : remaining data that wasn't written
167             #-------------------------------------------------------------------------
168              
169 24     24   110 my ($self, $chunk) = @_;
170              
171 24         52 my $chunk_len = length($chunk);
172              
173             # payload is compressed with DEFLATE
174             rawdeflate(\$chunk, \my $payload, -Level => $self->{c_level})
175 24 50       232 or croak "deflate failed: $RawDeflateError\n";
176              
177             # very rarely, a DEFLATEd string may be larger than input. This can result
178             # in a block size > 2**16, which violates the BGZF specification and
179             # causes corruption of the BC field. Fix those edge cases here (somewhat
180             # slow but shouldn't happen often) and send the rest back to the buffer
181 24         122839 my $trimmed = '';
182 24         102 while (length($payload) > FLUSH_SIZE) {
183 8         28 my $trim_len = int( $chunk_len * 0.05 );
184 8         169 $trimmed = substr($chunk, -$trim_len, $trim_len, '') . $trimmed;
185             rawdeflate(\$chunk, \$payload, -Level => $self->{c_level})
186 8 50       55 or croak "deflate failed: $RawDeflateError\n";
187 8         36903 $chunk_len = length($chunk);
188             }
189              
190 24         70 my $block_size = length($payload) + HEAD_BYTES + FOOT_BYTES;
191              
192 24 50       76 croak "Internal error: block size > 65536" if ($block_size > 2**16);
193              
194             # payload is wrapped with appropriate headers and footers
195 24 50       49 print { $self->{fh} } pack(
  24         8900  
196             "a*va*VV",
197             BGZF_HEADER,
198             $block_size - 1,
199             $payload,
200             crc32($chunk),
201             $chunk_len,
202             ) or croak "Error writing compressed block";
203              
204             # increment the current offsets
205 24         132 $self->{block_offset} += $block_size;
206 24         64 $self->{u_offset} += $chunk_len;
207 24         51 push @{ $self->{idx} }, [$self->{block_offset}, $self->{u_offset}];
  24         151  
208              
209 24         119 return $trimmed;
210              
211             }
212              
213             sub finalize {
214              
215             #-------------------------------------------------------------------------
216             # no arguments
217             #-------------------------------------------------------------------------
218             # no returns
219             #-------------------------------------------------------------------------
220              
221 18     18 1 107 my ($self) = @_;
222              
223 18         81 while (length($self->{buffer}) > 0) {
224              
225             croak "file closed but buffer not empty"
226 6 50       32 if ( ! defined fileno($self->{fh}) );
227              
228 6         60 my $chunk = substr $self->{buffer}, 0, FLUSH_SIZE, '';
229 6         23 my $unwritten = $self->_write_block($chunk);
230             $self->{buffer} = $unwritten . $self->{buffer}
231 6 50       42 if ( length($unwritten) );
232              
233             }
234             # write EOF block if requested (only first time finalize() is run)
235 18 100 100     79 if ($self->{write_eof} && defined fileno($self->{fh})) {
236 1         3 print { $self->{fh} } BGZF_EOF;
  1         4  
237             }
238 18 100       59 if (defined fileno($self->{fh}) ) {
239             close $self->{fh}
240 6 50       1942 or croak "Error closing compressed file";
241             }
242              
243 18         68 return;
244              
245             }
246              
247             sub write_index {
248              
249             #-------------------------------------------------------------------------
250             # ARG 0 : index output filename
251             #-------------------------------------------------------------------------
252             # No returns
253             #-------------------------------------------------------------------------
254              
255 6     6 1 449 my ($self, $fn_out) = @_;
256              
257 6         29 $self->finalize(); # always clear remaining buffer to fully populate index
258 6 100       599 croak "missing index output filename" if (! defined $fn_out);
259 3 50       591 open my $fh_out, '>:raw', $fn_out
260             or croak "Error opening index file for writing";
261              
262 3         11 my @offsets = @{ $self->{idx} };
  3         19  
263 3         34 pop @offsets; # last offset is EOF
264 3 50       7 print {$fh_out} pack('Q<', scalar(@offsets))
  3         31  
265             or croak "Error printing to index file";
266 3         14 for (@offsets) {
267 13 50       21 print {$fh_out} pack('Q
  13         25  
  13         48  
268             or croak "Error printing offset to index file";
269             }
270              
271 3 50       184 close $fh_out
272             or croak "Error closing index file after writing";
273 3         33 return;
274              
275             }
276              
277             sub DESTROY {
278              
279 6     6   4500 my ($self) = @_;
280              
281             # make sure we call finalize in case the caller forgot
282 6         26 $self->finalize();
283              
284 6         292 return;
285              
286             }
287              
288             1;
289              
290              
291             __END__