| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Compression::Util; |
|
2
|
|
|
|
|
|
|
|
|
3
|
47
|
|
|
47
|
|
5782912
|
use utf8; |
|
|
47
|
|
|
|
|
11958
|
|
|
|
47
|
|
|
|
|
352
|
|
|
4
|
47
|
|
|
47
|
|
2614
|
use 5.036; |
|
|
47
|
|
|
|
|
178
|
|
|
5
|
47
|
|
|
47
|
|
376
|
use List::Util qw(min uniq max sum all); |
|
|
47
|
|
|
|
|
126
|
|
|
|
47
|
|
|
|
|
5962
|
|
|
6
|
47
|
|
|
47
|
|
385
|
use Carp qw(confess); |
|
|
47
|
|
|
|
|
153
|
|
|
|
47
|
|
|
|
|
7863
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
require Exporter; |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.15'; |
|
13
|
|
|
|
|
|
|
our $VERBOSE = 0; # verbose mode |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our $LZ_MIN_LEN = 4; # minimum match length in LZ parsing |
|
16
|
|
|
|
|
|
|
our $LZ_MAX_LEN = 1 << 15; # maximum match length in LZ parsing |
|
17
|
|
|
|
|
|
|
our $LZ_MAX_DIST = ~0; # maximum allowed back-reference distance in LZ parsing |
|
18
|
|
|
|
|
|
|
our $LZ_MAX_CHAIN_LEN = 32; # how many recent positions to remember in LZ parsing |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Arithmetic Coding settings |
|
21
|
47
|
|
|
47
|
|
397
|
use constant BITS => 32; |
|
|
47
|
|
|
|
|
92
|
|
|
|
47
|
|
|
|
|
5997
|
|
|
22
|
47
|
|
|
47
|
|
360
|
use constant MAX => oct('0b' . ('1' x BITS)); |
|
|
47
|
|
|
|
|
146
|
|
|
|
47
|
|
|
|
|
3498
|
|
|
23
|
47
|
|
|
47
|
|
301
|
use constant INITIAL_FREQ => 1; |
|
|
47
|
|
|
|
|
113
|
|
|
|
47
|
|
|
|
|
1711042
|
|
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( |
|
26
|
|
|
|
|
|
|
'all' => [ |
|
27
|
|
|
|
|
|
|
qw( |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
crc32 |
|
30
|
|
|
|
|
|
|
adler32 |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
read_bit |
|
33
|
|
|
|
|
|
|
read_bit_lsb |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
read_bits |
|
36
|
|
|
|
|
|
|
read_bits_lsb |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
int2bits |
|
39
|
|
|
|
|
|
|
int2bits_lsb |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
int2bytes |
|
42
|
|
|
|
|
|
|
int2bytes_lsb |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
bits2int |
|
45
|
|
|
|
|
|
|
bits2int_lsb |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
bytes2int |
|
48
|
|
|
|
|
|
|
bytes2int_lsb |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
string2symbols |
|
51
|
|
|
|
|
|
|
symbols2string |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
read_null_terminated |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
bwt_encode |
|
56
|
|
|
|
|
|
|
bwt_decode |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
bwt_encode_symbolic |
|
59
|
|
|
|
|
|
|
bwt_decode_symbolic |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
bwt_sort |
|
62
|
|
|
|
|
|
|
bwt_sort_symbolic |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
bwt_compress |
|
65
|
|
|
|
|
|
|
bwt_decompress |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
bwt_compress_symbolic |
|
68
|
|
|
|
|
|
|
bwt_decompress_symbolic |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
bzip2_compress |
|
71
|
|
|
|
|
|
|
bzip2_decompress |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
gzip_compress |
|
74
|
|
|
|
|
|
|
gzip_decompress |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
mrl_compress |
|
77
|
|
|
|
|
|
|
mrl_decompress |
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
mrl_compress_symbolic |
|
80
|
|
|
|
|
|
|
mrl_decompress_symbolic |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
create_huffman_entry |
|
83
|
|
|
|
|
|
|
decode_huffman_entry |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
delta_encode |
|
86
|
|
|
|
|
|
|
delta_decode |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
huffman_encode |
|
89
|
|
|
|
|
|
|
huffman_decode |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
huffman_from_freq |
|
92
|
|
|
|
|
|
|
huffman_from_symbols |
|
93
|
|
|
|
|
|
|
huffman_from_code_lengths |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
mtf_encode |
|
96
|
|
|
|
|
|
|
mtf_decode |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
encode_alphabet |
|
99
|
|
|
|
|
|
|
decode_alphabet |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
encode_alphabet_256 |
|
102
|
|
|
|
|
|
|
decode_alphabet_256 |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
deltas |
|
105
|
|
|
|
|
|
|
accumulate |
|
106
|
|
|
|
|
|
|
frequencies |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
run_length |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
binary_vrl_encode |
|
111
|
|
|
|
|
|
|
binary_vrl_decode |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
rle4_encode |
|
114
|
|
|
|
|
|
|
rle4_decode |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
zrle_encode |
|
117
|
|
|
|
|
|
|
zrle_decode |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
lzss_compress |
|
120
|
|
|
|
|
|
|
lzss_decompress |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
make_deflate_tables |
|
123
|
|
|
|
|
|
|
find_deflate_index |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
deflate_encode |
|
126
|
|
|
|
|
|
|
deflate_decode |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
lzss_encode |
|
129
|
|
|
|
|
|
|
lzss_encode_fast |
|
130
|
|
|
|
|
|
|
lzss_encode_fast_symbolic |
|
131
|
|
|
|
|
|
|
lzss_decode |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
lzss_encode_symbolic |
|
134
|
|
|
|
|
|
|
lzss_decode_symbolic |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
lzss_compress_symbolic |
|
137
|
|
|
|
|
|
|
lzss_decompress_symbolic |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
lz77_encode |
|
140
|
|
|
|
|
|
|
lz77_decode |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
lz77_encode_symbolic |
|
143
|
|
|
|
|
|
|
lz77_decode_symbolic |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
lz77_compress |
|
146
|
|
|
|
|
|
|
lz77_decompress |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
lz77_compress_symbolic |
|
149
|
|
|
|
|
|
|
lz77_decompress_symbolic |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
lzb_compress |
|
152
|
|
|
|
|
|
|
lzb_decompress |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
lz4_compress |
|
155
|
|
|
|
|
|
|
lz4_decompress |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
ac_encode |
|
158
|
|
|
|
|
|
|
ac_decode |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
create_ac_entry |
|
161
|
|
|
|
|
|
|
decode_ac_entry |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
adaptive_ac_encode |
|
164
|
|
|
|
|
|
|
adaptive_ac_decode |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
create_adaptive_ac_entry |
|
167
|
|
|
|
|
|
|
decode_adaptive_ac_entry |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
abc_encode |
|
170
|
|
|
|
|
|
|
abc_decode |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
fibonacci_encode |
|
173
|
|
|
|
|
|
|
fibonacci_decode |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
elias_gamma_encode |
|
176
|
|
|
|
|
|
|
elias_gamma_decode |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
elias_omega_encode |
|
179
|
|
|
|
|
|
|
elias_omega_decode |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
obh_encode |
|
182
|
|
|
|
|
|
|
obh_decode |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
lzw_encode |
|
185
|
|
|
|
|
|
|
lzw_decode |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
lzw_compress |
|
188
|
|
|
|
|
|
|
lzw_decompress |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
zlib_compress |
|
191
|
|
|
|
|
|
|
zlib_decompress |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
deflate_create_block_type_0_header |
|
194
|
|
|
|
|
|
|
deflate_create_block_type_1 |
|
195
|
|
|
|
|
|
|
deflate_create_block_type_2 |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
deflate_extract_next_block |
|
198
|
|
|
|
|
|
|
deflate_extract_block_type_0 |
|
199
|
|
|
|
|
|
|
deflate_extract_block_type_1 |
|
200
|
|
|
|
|
|
|
deflate_extract_block_type_2 |
|
201
|
|
|
|
|
|
|
) |
|
202
|
|
|
|
|
|
|
] |
|
203
|
|
|
|
|
|
|
); |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
our @EXPORT_OK = (@{$EXPORT_TAGS{'all'}}, '$VERBOSE', '$LZ_MAX_CHAIN_LEN', '$LZ_MIN_LEN', '$LZ_MAX_LEN', '$LZ_MAX_DIST'); |
|
206
|
|
|
|
|
|
|
our @EXPORT; |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
########################## |
|
209
|
|
|
|
|
|
|
# Misc low-level functions |
|
210
|
|
|
|
|
|
|
########################## |
|
211
|
|
|
|
|
|
|
|
|
212
|
168282
|
|
|
168282
|
1
|
229268
|
sub read_bit ($fh, $bitstring) { |
|
|
168282
|
|
|
|
|
239480
|
|
|
|
168282
|
|
|
|
|
222615
|
|
|
|
168282
|
|
|
|
|
215755
|
|
|
213
|
|
|
|
|
|
|
|
|
214
|
168282
|
100
|
50
|
|
|
395946
|
if (($$bitstring // '') eq '') { |
|
215
|
22115
|
|
33
|
|
|
93291
|
$$bitstring = unpack('b*', getc($fh) // confess "can't read bit"); |
|
216
|
|
|
|
|
|
|
} |
|
217
|
|
|
|
|
|
|
|
|
218
|
168282
|
|
|
|
|
435892
|
chop($$bitstring); |
|
219
|
|
|
|
|
|
|
} |
|
220
|
|
|
|
|
|
|
|
|
221
|
89466
|
|
|
89466
|
1
|
130783
|
sub read_bit_lsb ($fh, $bitstring) { |
|
|
89466
|
|
|
|
|
135702
|
|
|
|
89466
|
|
|
|
|
123794
|
|
|
|
89466
|
|
|
|
|
123230
|
|
|
222
|
|
|
|
|
|
|
|
|
223
|
89466
|
100
|
50
|
|
|
223419
|
if (($$bitstring // '') eq '') { |
|
224
|
14388
|
|
33
|
|
|
61347
|
$$bitstring = unpack('B*', getc($fh) // confess "can't read bit"); |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
|
|
227
|
89466
|
|
|
|
|
201440
|
chop($$bitstring); |
|
228
|
|
|
|
|
|
|
} |
|
229
|
|
|
|
|
|
|
|
|
230
|
1005
|
|
|
1005
|
1
|
4708
|
sub read_bits ($fh, $bits_len) { |
|
|
1005
|
|
|
|
|
1654
|
|
|
|
1005
|
|
|
|
|
8408
|
|
|
|
1005
|
|
|
|
|
2037
|
|
|
231
|
|
|
|
|
|
|
|
|
232
|
1005
|
|
33
|
|
|
15064
|
read($fh, (my $data), $bits_len >> 3) // confess "Read error: $!"; |
|
233
|
1005
|
|
|
|
|
7188
|
$data = unpack('B*', $data); |
|
234
|
|
|
|
|
|
|
|
|
235
|
1005
|
|
|
|
|
6352
|
while (length($data) < $bits_len) { |
|
236
|
512
|
|
33
|
|
|
4175
|
$data .= unpack('B*', getc($fh) // confess "can't read bits"); |
|
237
|
|
|
|
|
|
|
} |
|
238
|
|
|
|
|
|
|
|
|
239
|
1005
|
100
|
|
|
|
3906
|
if (length($data) > $bits_len) { |
|
240
|
512
|
|
|
|
|
2194
|
$data = substr($data, 0, $bits_len); |
|
241
|
|
|
|
|
|
|
} |
|
242
|
|
|
|
|
|
|
|
|
243
|
1005
|
|
|
|
|
4742
|
return $data; |
|
244
|
|
|
|
|
|
|
} |
|
245
|
|
|
|
|
|
|
|
|
246
|
1
|
|
|
1
|
1
|
20
|
sub read_bits_lsb ($fh, $bits_len) { |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
2
|
|
|
247
|
|
|
|
|
|
|
|
|
248
|
1
|
|
33
|
|
|
8
|
read($fh, (my $data), $bits_len >> 3) // confess "Read error: $!"; |
|
249
|
1
|
|
|
|
|
4
|
$data = unpack('b*', $data); |
|
250
|
|
|
|
|
|
|
|
|
251
|
1
|
|
|
|
|
7
|
while (length($data) < $bits_len) { |
|
252
|
0
|
|
0
|
|
|
0
|
$data .= unpack('b*', getc($fh) // confess "can't read bits"); |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
|
|
255
|
1
|
50
|
|
|
|
5
|
if (length($data) > $bits_len) { |
|
256
|
0
|
|
|
|
|
0
|
$data = substr($data, 0, $bits_len); |
|
257
|
|
|
|
|
|
|
} |
|
258
|
|
|
|
|
|
|
|
|
259
|
1
|
|
|
|
|
8
|
return $data; |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
|
|
262
|
79
|
|
|
79
|
1
|
4416
|
sub int2bits ($value, $size) { |
|
|
79
|
|
|
|
|
309
|
|
|
|
79
|
|
|
|
|
173
|
|
|
|
79
|
|
|
|
|
140
|
|
|
263
|
79
|
|
|
|
|
493
|
sprintf("%0*b", $size, $value); |
|
264
|
|
|
|
|
|
|
} |
|
265
|
|
|
|
|
|
|
|
|
266
|
11969
|
|
|
11969
|
1
|
28967
|
sub int2bits_lsb ($value, $size) { |
|
|
11969
|
|
|
|
|
20861
|
|
|
|
11969
|
|
|
|
|
18748
|
|
|
|
11969
|
|
|
|
|
23561
|
|
|
267
|
11969
|
|
|
|
|
64342
|
scalar reverse sprintf("%0*b", $size, $value); |
|
268
|
|
|
|
|
|
|
} |
|
269
|
|
|
|
|
|
|
|
|
270
|
221
|
|
|
221
|
1
|
399
|
sub int2bytes ($value, $size) { |
|
|
221
|
|
|
|
|
485
|
|
|
|
221
|
|
|
|
|
455
|
|
|
|
221
|
|
|
|
|
329
|
|
|
271
|
221
|
|
|
|
|
1853
|
pack('B*', sprintf("%0*b", 8 * $size, $value)); |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
|
|
274
|
145
|
|
|
145
|
1
|
1336
|
sub int2bytes_lsb ($value, $size) { |
|
|
145
|
|
|
|
|
386
|
|
|
|
145
|
|
|
|
|
336
|
|
|
|
145
|
|
|
|
|
263
|
|
|
275
|
145
|
|
|
|
|
1768
|
pack('b*', scalar reverse sprintf("%0*b", 8 * $size, $value)); |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
|
|
278
|
766
|
|
|
766
|
1
|
2735
|
sub bytes2int($fh, $n) { |
|
|
766
|
|
|
|
|
1322
|
|
|
|
766
|
|
|
|
|
1568
|
|
|
|
766
|
|
|
|
|
1167
|
|
|
279
|
|
|
|
|
|
|
|
|
280
|
766
|
100
|
|
|
|
2304
|
if (ref($fh) eq '') { |
|
281
|
8
|
50
|
|
|
|
127
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
282
|
8
|
|
|
|
|
28
|
return __SUB__->($fh2, $n); |
|
283
|
|
|
|
|
|
|
} |
|
284
|
|
|
|
|
|
|
|
|
285
|
758
|
|
|
|
|
1576
|
my $bytes = ''; |
|
286
|
758
|
|
|
|
|
6700
|
$bytes .= getc($fh) for (1 .. $n); |
|
287
|
758
|
|
|
|
|
6205
|
oct('0b' . unpack('B*', $bytes)); |
|
288
|
|
|
|
|
|
|
} |
|
289
|
|
|
|
|
|
|
|
|
290
|
337
|
|
|
337
|
1
|
741
|
sub bytes2int_lsb ($fh, $n) { |
|
|
337
|
|
|
|
|
942
|
|
|
|
337
|
|
|
|
|
675
|
|
|
|
337
|
|
|
|
|
571
|
|
|
291
|
|
|
|
|
|
|
|
|
292
|
337
|
100
|
|
|
|
1139
|
if (ref($fh) eq '') { |
|
293
|
8
|
50
|
|
|
|
131
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
294
|
8
|
|
|
|
|
47
|
return __SUB__->($fh2, $n); |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
329
|
|
|
|
|
753
|
my $bytes = ''; |
|
298
|
329
|
|
|
|
|
2611
|
$bytes .= getc($fh) for (1 .. $n); |
|
299
|
329
|
|
|
|
|
2836
|
oct('0b' . reverse unpack('b*', $bytes)); |
|
300
|
|
|
|
|
|
|
} |
|
301
|
|
|
|
|
|
|
|
|
302
|
276
|
|
|
276
|
1
|
547
|
sub bits2int ($fh, $size, $buffer) { |
|
|
276
|
|
|
|
|
562
|
|
|
|
276
|
|
|
|
|
446
|
|
|
|
276
|
|
|
|
|
460
|
|
|
|
276
|
|
|
|
|
452
|
|
|
303
|
|
|
|
|
|
|
|
|
304
|
276
|
100
|
100
|
|
|
1294
|
if ($size % 8 == 0 and ($$buffer // '') eq '') { # optimization |
|
|
|
|
100
|
|
|
|
|
|
305
|
29
|
|
|
|
|
134
|
return bytes2int($fh, $size >> 3); |
|
306
|
|
|
|
|
|
|
} |
|
307
|
|
|
|
|
|
|
|
|
308
|
247
|
|
|
|
|
545
|
my $bitstring = '0b'; |
|
309
|
247
|
|
|
|
|
621
|
for (1 .. $size) { |
|
310
|
3627
|
100
|
50
|
|
|
11027
|
$bitstring .= ($$buffer // '') eq '' ? read_bit($fh, $buffer) : chop($$buffer); |
|
311
|
|
|
|
|
|
|
} |
|
312
|
247
|
|
|
|
|
858
|
oct($bitstring); |
|
313
|
|
|
|
|
|
|
} |
|
314
|
|
|
|
|
|
|
|
|
315
|
5582
|
|
|
5582
|
1
|
8288
|
sub bits2int_lsb ($fh, $size, $buffer) { |
|
|
5582
|
|
|
|
|
8044
|
|
|
|
5582
|
|
|
|
|
8346
|
|
|
|
5582
|
|
|
|
|
7771
|
|
|
|
5582
|
|
|
|
|
8510
|
|
|
316
|
|
|
|
|
|
|
|
|
317
|
5582
|
100
|
100
|
|
|
14833
|
if ($size % 8 == 0 and ($$buffer // '') eq '') { # optimization |
|
|
|
|
100
|
|
|
|
|
|
318
|
146
|
|
|
|
|
570
|
return bytes2int_lsb($fh, $size >> 3); |
|
319
|
|
|
|
|
|
|
} |
|
320
|
|
|
|
|
|
|
|
|
321
|
5436
|
|
|
|
|
9627
|
my $bitstring = ''; |
|
322
|
5436
|
|
|
|
|
12060
|
for (1 .. $size) { |
|
323
|
28855
|
100
|
50
|
|
|
84589
|
$bitstring .= ($$buffer // '') eq '' ? read_bit_lsb($fh, $buffer) : chop($$buffer); |
|
324
|
|
|
|
|
|
|
} |
|
325
|
5436
|
|
|
|
|
16030
|
oct('0b' . reverse($bitstring)); |
|
326
|
|
|
|
|
|
|
} |
|
327
|
|
|
|
|
|
|
|
|
328
|
130
|
|
|
130
|
1
|
13514
|
sub string2symbols ($string) { |
|
|
130
|
|
|
|
|
382
|
|
|
|
130
|
|
|
|
|
335
|
|
|
329
|
130
|
|
|
|
|
170645
|
[unpack('C*', $string)]; |
|
330
|
|
|
|
|
|
|
} |
|
331
|
|
|
|
|
|
|
|
|
332
|
76
|
|
|
76
|
1
|
4966
|
sub symbols2string ($symbols) { |
|
|
76
|
|
|
|
|
167
|
|
|
|
76
|
|
|
|
|
868
|
|
|
333
|
76
|
|
|
|
|
11970
|
pack('C*', @$symbols); |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
|
|
336
|
8
|
|
|
8
|
1
|
3175
|
sub read_null_terminated ($fh) { |
|
|
8
|
|
|
|
|
20
|
|
|
|
8
|
|
|
|
|
18
|
|
|
337
|
8
|
|
|
|
|
21
|
my $string = ''; |
|
338
|
8
|
|
|
|
|
15
|
while (1) { |
|
339
|
66
|
|
33
|
|
|
241
|
my $c = getc($fh) // confess "can't read character"; |
|
340
|
66
|
100
|
|
|
|
251
|
last if $c eq "\0"; |
|
341
|
58
|
|
|
|
|
176
|
$string .= $c; |
|
342
|
|
|
|
|
|
|
} |
|
343
|
8
|
|
|
|
|
37
|
return $string; |
|
344
|
|
|
|
|
|
|
} |
|
345
|
|
|
|
|
|
|
|
|
346
|
727
|
|
|
727
|
1
|
1559
|
sub frequencies ($symbols) { |
|
|
727
|
|
|
|
|
1253
|
|
|
|
727
|
|
|
|
|
1158
|
|
|
347
|
727
|
|
|
|
|
1317
|
my %freq; |
|
348
|
727
|
|
|
|
|
88901
|
++$freq{$_} for @$symbols; |
|
349
|
727
|
|
|
|
|
2898
|
return \%freq; |
|
350
|
|
|
|
|
|
|
} |
|
351
|
|
|
|
|
|
|
|
|
352
|
1010
|
|
|
1010
|
1
|
2047
|
sub deltas ($integers) { |
|
|
1010
|
|
|
|
|
1712
|
|
|
|
1010
|
|
|
|
|
1452
|
|
|
353
|
|
|
|
|
|
|
|
|
354
|
1010
|
|
|
|
|
1547
|
my @deltas; |
|
355
|
1010
|
|
|
|
|
1815
|
my $prev = 0; |
|
356
|
|
|
|
|
|
|
|
|
357
|
1010
|
|
|
|
|
2402
|
foreach my $n (@$integers) { |
|
358
|
224502
|
|
|
|
|
362746
|
push @deltas, $n - $prev; |
|
359
|
224502
|
|
|
|
|
387274
|
$prev = $n; |
|
360
|
|
|
|
|
|
|
} |
|
361
|
|
|
|
|
|
|
|
|
362
|
1010
|
|
|
|
|
2748
|
return \@deltas; |
|
363
|
|
|
|
|
|
|
} |
|
364
|
|
|
|
|
|
|
|
|
365
|
461
|
|
|
461
|
1
|
971
|
sub accumulate ($deltas) { |
|
|
461
|
|
|
|
|
818
|
|
|
|
461
|
|
|
|
|
701
|
|
|
366
|
|
|
|
|
|
|
|
|
367
|
461
|
|
|
|
|
840
|
my @acc; |
|
368
|
461
|
|
|
|
|
828
|
my $prev = 0; |
|
369
|
|
|
|
|
|
|
|
|
370
|
461
|
|
|
|
|
1150
|
foreach my $d (@$deltas) { |
|
371
|
8032
|
|
|
|
|
11687
|
$prev += $d; |
|
372
|
8032
|
|
|
|
|
22446
|
push @acc, $prev; |
|
373
|
|
|
|
|
|
|
} |
|
374
|
|
|
|
|
|
|
|
|
375
|
461
|
|
|
|
|
1710
|
return \@acc; |
|
376
|
|
|
|
|
|
|
} |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
######################## |
|
379
|
|
|
|
|
|
|
# Fibonacci Coding |
|
380
|
|
|
|
|
|
|
######################## |
|
381
|
|
|
|
|
|
|
|
|
382
|
426
|
|
|
426
|
1
|
1098
|
sub fibonacci_encode ($symbols) { |
|
|
426
|
|
|
|
|
866
|
|
|
|
426
|
|
|
|
|
732
|
|
|
383
|
|
|
|
|
|
|
|
|
384
|
426
|
|
|
|
|
1053
|
my $bitstring = ''; |
|
385
|
|
|
|
|
|
|
|
|
386
|
426
|
|
|
|
|
1336
|
foreach my $n (scalar(@$symbols), @$symbols) { |
|
387
|
2438
|
|
|
|
|
4425
|
my ($f1, $f2, $f3) = (0, 1, 1); |
|
388
|
2438
|
|
|
|
|
4980
|
my ($rn, $s, $k) = ($n + 1, '', 2); |
|
389
|
2438
|
|
|
|
|
5040
|
for (; $f3 <= $rn ; ++$k) { |
|
390
|
18609
|
|
|
|
|
44492
|
($f1, $f2, $f3) = ($f2, $f3, $f2 + $f3); |
|
391
|
|
|
|
|
|
|
} |
|
392
|
2438
|
|
|
|
|
5303
|
foreach my $i (1 .. $k - 2) { |
|
393
|
18609
|
|
|
|
|
26443
|
($f3, $f2, $f1) = ($f2, $f1, $f2 - $f1); |
|
394
|
18609
|
100
|
|
|
|
29083
|
if ($f3 <= $rn) { |
|
395
|
6051
|
|
|
|
|
7704
|
$rn -= $f3; |
|
396
|
6051
|
|
|
|
|
12007
|
$s .= '1'; |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
else { |
|
399
|
12558
|
|
|
|
|
20244
|
$s .= '0'; |
|
400
|
|
|
|
|
|
|
} |
|
401
|
|
|
|
|
|
|
} |
|
402
|
2438
|
|
|
|
|
6131
|
$bitstring .= reverse($s) . '1'; |
|
403
|
|
|
|
|
|
|
} |
|
404
|
|
|
|
|
|
|
|
|
405
|
426
|
|
|
|
|
6555
|
pack('B*', $bitstring); |
|
406
|
|
|
|
|
|
|
} |
|
407
|
|
|
|
|
|
|
|
|
408
|
439
|
|
|
439
|
1
|
823
|
sub fibonacci_decode ($fh) { |
|
|
439
|
|
|
|
|
801
|
|
|
|
439
|
|
|
|
|
886
|
|
|
409
|
|
|
|
|
|
|
|
|
410
|
439
|
100
|
|
|
|
1478
|
if (ref($fh) eq '') { |
|
411
|
13
|
50
|
|
|
|
242
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
412
|
13
|
|
|
|
|
50
|
return __SUB__->($fh2); |
|
413
|
|
|
|
|
|
|
} |
|
414
|
|
|
|
|
|
|
|
|
415
|
426
|
|
|
|
|
801
|
my @symbols; |
|
416
|
|
|
|
|
|
|
|
|
417
|
426
|
|
|
|
|
5069
|
my $enc = ''; |
|
418
|
426
|
|
|
|
|
2786
|
my $prev_bit = '0'; |
|
419
|
|
|
|
|
|
|
|
|
420
|
426
|
|
|
|
|
779
|
my $len = 0; |
|
421
|
426
|
|
|
|
|
961
|
my $buffer = ''; |
|
422
|
|
|
|
|
|
|
|
|
423
|
426
|
|
|
|
|
1308
|
for (my $k = 0 ; $k <= $len ;) { |
|
424
|
21047
|
|
|
|
|
38277
|
my $bit = read_bit($fh, \$buffer); |
|
425
|
|
|
|
|
|
|
|
|
426
|
21047
|
100
|
100
|
|
|
58823
|
if ($bit eq '1' and $prev_bit eq '1') { |
|
427
|
2438
|
|
|
|
|
5286
|
my ($value, $f1, $f2) = (0, 1, 1); |
|
428
|
2438
|
|
|
|
|
9923
|
foreach my $bit (split //, $enc) { |
|
429
|
18609
|
100
|
|
|
|
35718
|
$value += $f2 if $bit; |
|
430
|
18609
|
|
|
|
|
38676
|
($f1, $f2) = ($f2, $f1 + $f2); |
|
431
|
|
|
|
|
|
|
} |
|
432
|
2438
|
|
|
|
|
6994
|
push @symbols, $value - 1; |
|
433
|
2438
|
100
|
|
|
|
5419
|
$len = pop @symbols if (++$k == 1); |
|
434
|
2438
|
|
|
|
|
4025
|
$enc = ''; |
|
435
|
2438
|
|
|
|
|
6514
|
$prev_bit = '0'; |
|
436
|
|
|
|
|
|
|
} |
|
437
|
|
|
|
|
|
|
else { |
|
438
|
18609
|
|
|
|
|
29284
|
$enc .= $bit; |
|
439
|
18609
|
|
|
|
|
42344
|
$prev_bit = $bit; |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
} |
|
442
|
|
|
|
|
|
|
|
|
443
|
426
|
|
|
|
|
1619
|
return \@symbols; |
|
444
|
|
|
|
|
|
|
} |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
####################################### |
|
447
|
|
|
|
|
|
|
# Adaptive Binary Concatenation method |
|
448
|
|
|
|
|
|
|
####################################### |
|
449
|
|
|
|
|
|
|
|
|
450
|
31
|
|
|
31
|
1
|
96
|
sub abc_encode ($integers) { |
|
|
31
|
|
|
|
|
100
|
|
|
|
31
|
|
|
|
|
57
|
|
|
451
|
|
|
|
|
|
|
|
|
452
|
31
|
|
|
|
|
99
|
my @counts; |
|
453
|
31
|
|
|
|
|
70
|
my $count = 0; |
|
454
|
31
|
|
|
|
|
59
|
my $bits_width = 1; |
|
455
|
31
|
|
|
|
|
80
|
my $bits_max_symbol = 1 << $bits_width; |
|
456
|
31
|
|
|
|
|
64
|
my $processed_len = 0; |
|
457
|
|
|
|
|
|
|
|
|
458
|
31
|
|
|
|
|
98
|
foreach my $k (@$integers) { |
|
459
|
9150
|
|
|
|
|
14247
|
while ($k >= $bits_max_symbol) { |
|
460
|
|
|
|
|
|
|
|
|
461
|
202
|
100
|
|
|
|
502
|
if ($count > 0) { |
|
462
|
24
|
|
|
|
|
123
|
push @counts, [$bits_width, $count]; |
|
463
|
24
|
|
|
|
|
94
|
$processed_len += $count; |
|
464
|
|
|
|
|
|
|
} |
|
465
|
|
|
|
|
|
|
|
|
466
|
202
|
|
|
|
|
301
|
$count = 0; |
|
467
|
202
|
|
|
|
|
266
|
$bits_max_symbol *= 2; |
|
468
|
202
|
|
|
|
|
460
|
$bits_width += 1; |
|
469
|
|
|
|
|
|
|
} |
|
470
|
9150
|
|
|
|
|
13154
|
++$count; |
|
471
|
|
|
|
|
|
|
} |
|
472
|
|
|
|
|
|
|
|
|
473
|
31
|
|
|
|
|
146
|
push @counts, grep { $_->[1] > 0 } [$bits_width, scalar(@$integers) - $processed_len]; |
|
|
31
|
|
|
|
|
141
|
|
|
474
|
|
|
|
|
|
|
|
|
475
|
31
|
50
|
|
|
|
133
|
$VERBOSE && say STDERR "Bit sizes: ", join(' ', map { $_->[0] } @counts); |
|
|
0
|
|
|
|
|
0
|
|
|
476
|
31
|
50
|
|
|
|
106
|
$VERBOSE && say STDERR "Lengths : ", join(' ', map { $_->[1] } @counts); |
|
|
0
|
|
|
|
|
0
|
|
|
477
|
31
|
50
|
|
|
|
142
|
$VERBOSE && say STDERR ''; |
|
478
|
|
|
|
|
|
|
|
|
479
|
31
|
|
|
|
|
103
|
my $compressed = fibonacci_encode([(map { $_->[0] } @counts), (map { $_->[1] } @counts)]); |
|
|
53
|
|
|
|
|
165
|
|
|
|
53
|
|
|
|
|
212
|
|
|
480
|
|
|
|
|
|
|
|
|
481
|
31
|
|
|
|
|
92
|
my $bits = ''; |
|
482
|
31
|
|
|
|
|
10282
|
my @ints = @$integers; |
|
483
|
|
|
|
|
|
|
|
|
484
|
31
|
|
|
|
|
86
|
foreach my $pair (@counts) { |
|
485
|
53
|
|
|
|
|
165
|
my ($blen, $len) = @$pair; |
|
486
|
53
|
|
|
|
|
755
|
foreach my $symbol (splice(@ints, 0, $len)) { |
|
487
|
9150
|
|
|
|
|
16269
|
$bits .= sprintf("%0*b", $blen, $symbol); |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
} |
|
490
|
|
|
|
|
|
|
|
|
491
|
31
|
|
|
|
|
353
|
$compressed .= pack('B*', $bits); |
|
492
|
31
|
|
|
|
|
259
|
return $compressed; |
|
493
|
|
|
|
|
|
|
} |
|
494
|
|
|
|
|
|
|
|
|
495
|
44
|
|
|
44
|
1
|
100
|
sub abc_decode ($fh) { |
|
|
44
|
|
|
|
|
81
|
|
|
|
44
|
|
|
|
|
79
|
|
|
496
|
|
|
|
|
|
|
|
|
497
|
44
|
100
|
|
|
|
167
|
if (ref($fh) eq '') { |
|
498
|
13
|
50
|
|
|
|
265
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
499
|
13
|
|
|
|
|
64
|
return __SUB__->($fh2); |
|
500
|
|
|
|
|
|
|
} |
|
501
|
|
|
|
|
|
|
|
|
502
|
31
|
|
|
|
|
108
|
my $ints = fibonacci_decode($fh); |
|
503
|
31
|
|
|
|
|
91
|
my $half = scalar(@$ints) >> 1; |
|
504
|
|
|
|
|
|
|
|
|
505
|
31
|
|
|
|
|
69
|
my @counts; |
|
506
|
31
|
|
|
|
|
259
|
foreach my $i (0 .. ($half - 1)) { |
|
507
|
53
|
|
|
|
|
192
|
push @counts, [$ints->[$i], $ints->[$half + $i]]; |
|
508
|
|
|
|
|
|
|
} |
|
509
|
|
|
|
|
|
|
|
|
510
|
31
|
|
|
|
|
73
|
my $bits_len = 0; |
|
511
|
|
|
|
|
|
|
|
|
512
|
31
|
|
|
|
|
75
|
foreach my $pair (@counts) { |
|
513
|
53
|
|
|
|
|
146
|
my ($blen, $len) = @$pair; |
|
514
|
53
|
|
|
|
|
115
|
$bits_len += $blen * $len; |
|
515
|
|
|
|
|
|
|
} |
|
516
|
|
|
|
|
|
|
|
|
517
|
31
|
|
|
|
|
100
|
my $bits = read_bits($fh, $bits_len); |
|
518
|
|
|
|
|
|
|
|
|
519
|
31
|
|
|
|
|
147
|
my @integers; |
|
520
|
31
|
|
|
|
|
95
|
foreach my $pair (@counts) { |
|
521
|
53
|
|
|
|
|
170
|
my ($blen, $len) = @$pair; |
|
522
|
53
|
|
|
|
|
8500
|
foreach my $chunk (unpack(sprintf('(a%d)*', $blen), substr($bits, 0, $blen * $len, ''))) { |
|
523
|
9150
|
|
|
|
|
29566
|
push @integers, oct('0b' . $chunk); |
|
524
|
|
|
|
|
|
|
} |
|
525
|
|
|
|
|
|
|
} |
|
526
|
|
|
|
|
|
|
|
|
527
|
31
|
|
|
|
|
315
|
return \@integers; |
|
528
|
|
|
|
|
|
|
} |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
################################### |
|
531
|
|
|
|
|
|
|
# Arithmetic Coding (in fixed bits) |
|
532
|
|
|
|
|
|
|
################################### |
|
533
|
|
|
|
|
|
|
|
|
534
|
158
|
|
|
158
|
|
388
|
sub _create_cfreq ($freq) { |
|
|
158
|
|
|
|
|
297
|
|
|
|
158
|
|
|
|
|
1248
|
|
|
535
|
|
|
|
|
|
|
|
|
536
|
158
|
|
|
|
|
324
|
my @cf; |
|
537
|
158
|
|
|
|
|
292
|
my $T = 0; |
|
538
|
|
|
|
|
|
|
|
|
539
|
158
|
|
|
|
|
1215
|
foreach my $i (sort { $a <=> $b } keys %$freq) { |
|
|
5699
|
|
|
|
|
8203
|
|
|
540
|
1630
|
|
50
|
|
|
3275
|
$freq->{$i} // next; |
|
541
|
1630
|
|
|
|
|
17364
|
$cf[$i] = $T; |
|
542
|
1630
|
|
|
|
|
2455
|
$T += $freq->{$i}; |
|
543
|
1630
|
|
|
|
|
3176
|
$cf[$i + 1] = $T; |
|
544
|
|
|
|
|
|
|
} |
|
545
|
|
|
|
|
|
|
|
|
546
|
158
|
|
|
|
|
811
|
return (\@cf, $T); |
|
547
|
|
|
|
|
|
|
} |
|
548
|
|
|
|
|
|
|
|
|
549
|
79
|
|
|
79
|
1
|
214
|
sub ac_encode ($symbols) { |
|
|
79
|
|
|
|
|
133
|
|
|
|
79
|
|
|
|
|
196
|
|
|
550
|
|
|
|
|
|
|
|
|
551
|
79
|
50
|
|
|
|
294
|
if (ref($symbols) eq '') { |
|
552
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
553
|
|
|
|
|
|
|
} |
|
554
|
|
|
|
|
|
|
|
|
555
|
79
|
|
|
|
|
163
|
my $enc = ''; |
|
556
|
79
|
|
100
|
|
|
809
|
my $EOF_SYMBOL = (max(@$symbols) // 0) + 1; |
|
557
|
79
|
|
|
|
|
1198
|
my @bytes = (@$symbols, $EOF_SYMBOL); |
|
558
|
|
|
|
|
|
|
|
|
559
|
79
|
|
|
|
|
277
|
my $freq = frequencies(\@bytes); |
|
560
|
79
|
|
|
|
|
311
|
my ($cf, $T) = _create_cfreq($freq); |
|
561
|
|
|
|
|
|
|
|
|
562
|
79
|
50
|
|
|
|
331
|
if ($T > MAX) { |
|
563
|
0
|
|
|
|
|
0
|
confess "Too few bits: $T > ${\MAX}"; |
|
|
0
|
|
|
|
|
0
|
|
|
564
|
|
|
|
|
|
|
} |
|
565
|
|
|
|
|
|
|
|
|
566
|
79
|
|
|
|
|
150
|
my $low = 0; |
|
567
|
79
|
|
|
|
|
197
|
my $high = MAX; |
|
568
|
79
|
|
|
|
|
168
|
my $uf_count = 0; |
|
569
|
|
|
|
|
|
|
|
|
570
|
79
|
|
|
|
|
260
|
foreach my $c (@bytes) { |
|
571
|
|
|
|
|
|
|
|
|
572
|
4984
|
|
|
|
|
8223
|
my $w = $high - $low + 1; |
|
573
|
|
|
|
|
|
|
|
|
574
|
4984
|
|
|
|
|
11233
|
$high = ($low + int(($w * $cf->[$c + 1]) / $T) - 1) & MAX; |
|
575
|
4984
|
|
|
|
|
10613
|
$low = ($low + int(($w * $cf->[$c]) / $T)) & MAX; |
|
576
|
|
|
|
|
|
|
|
|
577
|
4984
|
50
|
|
|
|
9845
|
if ($high > MAX) { |
|
578
|
0
|
|
|
|
|
0
|
confess "high > MAX: $high > ${\MAX}"; |
|
|
0
|
|
|
|
|
0
|
|
|
579
|
|
|
|
|
|
|
} |
|
580
|
|
|
|
|
|
|
|
|
581
|
4984
|
50
|
|
|
|
9688
|
if ($low >= $high) { confess "$low >= $high" } |
|
|
0
|
|
|
|
|
0
|
|
|
582
|
|
|
|
|
|
|
|
|
583
|
4984
|
|
|
|
|
7164
|
while (1) { |
|
584
|
|
|
|
|
|
|
|
|
585
|
19416
|
100
|
100
|
|
|
55452
|
if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) { |
|
|
|
100
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
|
|
587
|
11516
|
|
|
|
|
16604
|
my $bit = $high >> (BITS - 1); |
|
588
|
11516
|
|
|
|
|
16669
|
$enc .= $bit; |
|
589
|
|
|
|
|
|
|
|
|
590
|
11516
|
100
|
|
|
|
19432
|
if ($uf_count > 0) { |
|
591
|
1429
|
|
|
|
|
2938
|
$enc .= join('', 1 - $bit) x $uf_count; |
|
592
|
1429
|
|
|
|
|
2184
|
$uf_count = 0; |
|
593
|
|
|
|
|
|
|
} |
|
594
|
|
|
|
|
|
|
|
|
595
|
11516
|
|
|
|
|
15456
|
$low <<= 1; |
|
596
|
11516
|
|
|
|
|
18424
|
($high <<= 1) |= 1; |
|
597
|
|
|
|
|
|
|
} |
|
598
|
|
|
|
|
|
|
elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) { |
|
599
|
2916
|
|
|
|
|
4454
|
($high <<= 1) |= (1 << (BITS - 1)); |
|
600
|
2916
|
|
|
|
|
5522
|
$high |= 1; |
|
601
|
2916
|
|
|
|
|
4386
|
($low <<= 1) &= ((1 << (BITS - 1)) - 1); |
|
602
|
2916
|
|
|
|
|
5091
|
++$uf_count; |
|
603
|
|
|
|
|
|
|
} |
|
604
|
|
|
|
|
|
|
else { |
|
605
|
4984
|
|
|
|
|
13423
|
last; |
|
606
|
|
|
|
|
|
|
} |
|
607
|
|
|
|
|
|
|
|
|
608
|
14432
|
|
|
|
|
19111
|
$low &= MAX; |
|
609
|
14432
|
|
|
|
|
23736
|
$high &= MAX; |
|
610
|
|
|
|
|
|
|
} |
|
611
|
|
|
|
|
|
|
} |
|
612
|
|
|
|
|
|
|
|
|
613
|
79
|
|
|
|
|
188
|
$enc .= '0'; |
|
614
|
79
|
|
|
|
|
215
|
$enc .= '1'; |
|
615
|
|
|
|
|
|
|
|
|
616
|
79
|
|
|
|
|
346
|
while (length($enc) % 8 != 0) { |
|
617
|
318
|
|
|
|
|
3444
|
$enc .= '1'; |
|
618
|
|
|
|
|
|
|
} |
|
619
|
|
|
|
|
|
|
|
|
620
|
79
|
|
|
|
|
5450
|
return ($enc, $freq); |
|
621
|
|
|
|
|
|
|
} |
|
622
|
|
|
|
|
|
|
|
|
623
|
92
|
|
|
92
|
1
|
167
|
sub ac_decode ($fh, $freq) { |
|
|
92
|
|
|
|
|
205
|
|
|
|
92
|
|
|
|
|
163
|
|
|
|
92
|
|
|
|
|
167
|
|
|
624
|
|
|
|
|
|
|
|
|
625
|
92
|
100
|
|
|
|
377
|
if (ref($fh) eq '') { |
|
626
|
13
|
50
|
|
|
|
297
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
627
|
13
|
|
|
|
|
62
|
return __SUB__->($fh2, $freq); |
|
628
|
|
|
|
|
|
|
} |
|
629
|
|
|
|
|
|
|
|
|
630
|
79
|
|
|
|
|
287
|
my ($cf, $T) = _create_cfreq($freq); |
|
631
|
|
|
|
|
|
|
|
|
632
|
79
|
|
|
|
|
208
|
my @dec; |
|
633
|
79
|
|
|
|
|
136
|
my $low = 0; |
|
634
|
79
|
|
|
|
|
265
|
my $high = MAX; |
|
635
|
79
|
|
100
|
|
|
270
|
my $enc = oct('0b' . join '', map { getc($fh) // 1 } 1 .. BITS); |
|
|
2528
|
|
|
|
|
9577
|
|
|
636
|
|
|
|
|
|
|
|
|
637
|
79
|
|
|
|
|
543
|
my @table; |
|
638
|
79
|
|
|
|
|
425
|
foreach my $i (sort { $a <=> $b } keys %$freq) { |
|
|
2862
|
|
|
|
|
3872
|
|
|
639
|
815
|
|
|
|
|
1904
|
foreach my $j ($cf->[$i] .. $cf->[$i + 1] - 1) { |
|
640
|
4984
|
|
|
|
|
16618
|
$table[$j] = $i; |
|
641
|
|
|
|
|
|
|
} |
|
642
|
|
|
|
|
|
|
} |
|
643
|
|
|
|
|
|
|
|
|
644
|
79
|
|
50
|
|
|
1179
|
my $EOF_SYMBOL = max(keys %$freq) // 0; |
|
645
|
|
|
|
|
|
|
|
|
646
|
79
|
|
|
|
|
206
|
while (1) { |
|
647
|
|
|
|
|
|
|
|
|
648
|
4984
|
|
|
|
|
10586
|
my $w = $high - $low + 1; |
|
649
|
4984
|
|
|
|
|
17844
|
my $ss = int((($T * ($enc - $low + 1)) - 1) / $w); |
|
650
|
|
|
|
|
|
|
|
|
651
|
4984
|
|
50
|
|
|
16563
|
my $i = $table[$ss] // last; |
|
652
|
4984
|
100
|
|
|
|
9968
|
last if ($i == $EOF_SYMBOL); |
|
653
|
|
|
|
|
|
|
|
|
654
|
4905
|
|
|
|
|
11798
|
push @dec, $i; |
|
655
|
|
|
|
|
|
|
|
|
656
|
4905
|
|
|
|
|
10808
|
$high = ($low + int(($w * $cf->[$i + 1]) / $T) - 1) & MAX; |
|
657
|
4905
|
|
|
|
|
10271
|
$low = ($low + int(($w * $cf->[$i]) / $T)) & MAX; |
|
658
|
|
|
|
|
|
|
|
|
659
|
4905
|
50
|
|
|
|
9832
|
if ($high > MAX) { |
|
660
|
0
|
|
|
|
|
0
|
confess "error"; |
|
661
|
|
|
|
|
|
|
} |
|
662
|
|
|
|
|
|
|
|
|
663
|
4905
|
50
|
|
|
|
14876
|
if ($low >= $high) { confess "$low >= $high" } |
|
|
0
|
|
|
|
|
0
|
|
|
664
|
|
|
|
|
|
|
|
|
665
|
4905
|
|
|
|
|
7452
|
while (1) { |
|
666
|
|
|
|
|
|
|
|
|
667
|
19027
|
100
|
100
|
|
|
47576
|
if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) { |
|
|
|
100
|
|
|
|
|
|
|
668
|
11282
|
|
|
|
|
14577
|
($high <<= 1) |= 1; |
|
669
|
11282
|
|
|
|
|
13658
|
$low <<= 1; |
|
670
|
11282
|
|
100
|
|
|
30599
|
($enc <<= 1) |= (getc($fh) // 1); |
|
671
|
|
|
|
|
|
|
} |
|
672
|
|
|
|
|
|
|
elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) { |
|
673
|
2840
|
|
|
|
|
3933
|
($high <<= 1) |= (1 << (BITS - 1)); |
|
674
|
2840
|
|
|
|
|
3633
|
$high |= 1; |
|
675
|
2840
|
|
|
|
|
3729
|
($low <<= 1) &= ((1 << (BITS - 1)) - 1); |
|
676
|
2840
|
|
100
|
|
|
11055
|
$enc = (($enc >> (BITS - 1)) << (BITS - 1)) | (($enc & ((1 << (BITS - 2)) - 1)) << 1) | (getc($fh) // 1); |
|
677
|
|
|
|
|
|
|
} |
|
678
|
|
|
|
|
|
|
else { |
|
679
|
4905
|
|
|
|
|
15985
|
last; |
|
680
|
|
|
|
|
|
|
} |
|
681
|
|
|
|
|
|
|
|
|
682
|
14122
|
|
|
|
|
19523
|
$low &= MAX; |
|
683
|
14122
|
|
|
|
|
17312
|
$high &= MAX; |
|
684
|
14122
|
|
|
|
|
20650
|
$enc &= MAX; |
|
685
|
|
|
|
|
|
|
} |
|
686
|
|
|
|
|
|
|
} |
|
687
|
|
|
|
|
|
|
|
|
688
|
79
|
|
|
|
|
5514
|
return \@dec; |
|
689
|
|
|
|
|
|
|
} |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
############################################# |
|
692
|
|
|
|
|
|
|
# Adaptive Arithemtic Coding (in fixed bits) |
|
693
|
|
|
|
|
|
|
############################################# |
|
694
|
|
|
|
|
|
|
|
|
695
|
334
|
|
|
334
|
|
524
|
sub _create_adaptive_cfreq ($freq_value, $alphabet_size) { |
|
|
334
|
|
|
|
|
550
|
|
|
|
334
|
|
|
|
|
514
|
|
|
|
334
|
|
|
|
|
491
|
|
|
696
|
|
|
|
|
|
|
|
|
697
|
334
|
|
|
|
|
615
|
my $T = 0; |
|
698
|
334
|
|
|
|
|
622
|
my (@cf, @freq); |
|
699
|
|
|
|
|
|
|
|
|
700
|
334
|
|
|
|
|
934
|
foreach my $i (0 .. $alphabet_size) { |
|
701
|
2080
|
|
|
|
|
3434
|
$freq[$i] = $freq_value; |
|
702
|
2080
|
|
|
|
|
3061
|
$cf[$i] = $T; |
|
703
|
2080
|
|
|
|
|
2835
|
$T += $freq_value; |
|
704
|
2080
|
|
|
|
|
4292
|
$cf[$i + 1] = $T; |
|
705
|
|
|
|
|
|
|
} |
|
706
|
|
|
|
|
|
|
|
|
707
|
334
|
|
|
|
|
1767
|
return (\@freq, \@cf, $T); |
|
708
|
|
|
|
|
|
|
} |
|
709
|
|
|
|
|
|
|
|
|
710
|
9817
|
|
|
9817
|
|
14781
|
sub _increment_freq ($c, $alphabet_size, $freq, $cf) { |
|
|
9817
|
|
|
|
|
15642
|
|
|
|
9817
|
|
|
|
|
16234
|
|
|
|
9817
|
|
|
|
|
14669
|
|
|
|
9817
|
|
|
|
|
14212
|
|
|
|
9817
|
|
|
|
|
13647
|
|
|
711
|
|
|
|
|
|
|
|
|
712
|
9817
|
|
|
|
|
15753
|
++$freq->[$c]; |
|
713
|
9817
|
|
|
|
|
14803
|
my $T = $cf->[$c]; |
|
714
|
|
|
|
|
|
|
|
|
715
|
9817
|
|
|
|
|
23615
|
foreach my $i ($c .. $alphabet_size) { |
|
716
|
192221
|
|
|
|
|
291012
|
$cf->[$i] = $T; |
|
717
|
192221
|
|
|
|
|
281442
|
$T += $freq->[$i]; |
|
718
|
192221
|
|
|
|
|
352744
|
$cf->[$i + 1] = $T; |
|
719
|
|
|
|
|
|
|
} |
|
720
|
|
|
|
|
|
|
|
|
721
|
9817
|
|
|
|
|
24371
|
return $T; |
|
722
|
|
|
|
|
|
|
} |
|
723
|
|
|
|
|
|
|
|
|
724
|
167
|
|
|
167
|
1
|
336
|
sub adaptive_ac_encode ($symbols) { |
|
|
167
|
|
|
|
|
295
|
|
|
|
167
|
|
|
|
|
338
|
|
|
725
|
|
|
|
|
|
|
|
|
726
|
167
|
50
|
|
|
|
533
|
if (ref($symbols) eq '') { |
|
727
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
728
|
|
|
|
|
|
|
} |
|
729
|
|
|
|
|
|
|
|
|
730
|
167
|
|
|
|
|
337
|
my $enc = ''; |
|
731
|
167
|
|
|
|
|
3120
|
my @alphabet = sort { $a <=> $b } uniq(@$symbols); |
|
|
2485
|
|
|
|
|
4412
|
|
|
732
|
167
|
100
|
|
|
|
780
|
my $EOF_SYMBOL = scalar(@alphabet) ? ($alphabet[-1] + 1) : 1; |
|
733
|
167
|
|
|
|
|
462
|
push @alphabet, $EOF_SYMBOL; |
|
734
|
|
|
|
|
|
|
|
|
735
|
167
|
|
|
|
|
333
|
my $alphabet_size = $#alphabet; |
|
736
|
167
|
|
|
|
|
626
|
my ($freq, $cf, $T) = _create_adaptive_cfreq(INITIAL_FREQ, $alphabet_size); |
|
737
|
|
|
|
|
|
|
|
|
738
|
167
|
|
|
|
|
330
|
my %table; |
|
739
|
167
|
|
|
|
|
1060
|
@table{@alphabet} = (0 .. $alphabet_size); |
|
740
|
|
|
|
|
|
|
|
|
741
|
167
|
50
|
|
|
|
568
|
if ($T > MAX) { |
|
742
|
0
|
|
|
|
|
0
|
confess "Too few bits: $T > ${\MAX}"; |
|
|
0
|
|
|
|
|
0
|
|
|
743
|
|
|
|
|
|
|
} |
|
744
|
|
|
|
|
|
|
|
|
745
|
167
|
|
|
|
|
326
|
my $low = 0; |
|
746
|
167
|
|
|
|
|
298
|
my $high = MAX; |
|
747
|
167
|
|
|
|
|
342
|
my $uf_count = 0; |
|
748
|
|
|
|
|
|
|
|
|
749
|
167
|
|
|
|
|
412
|
foreach my $value (@$symbols, $EOF_SYMBOL) { |
|
750
|
|
|
|
|
|
|
|
|
751
|
4992
|
|
|
|
|
10338
|
my $c = $table{$value}; |
|
752
|
4992
|
|
|
|
|
8712
|
my $w = $high - $low + 1; |
|
753
|
|
|
|
|
|
|
|
|
754
|
4992
|
|
|
|
|
11712
|
$high = ($low + int(($w * $cf->[$c + 1]) / $T) - 1) & MAX; |
|
755
|
4992
|
|
|
|
|
9967
|
$low = ($low + int(($w * $cf->[$c]) / $T)) & MAX; |
|
756
|
|
|
|
|
|
|
|
|
757
|
4992
|
|
|
|
|
9892
|
$T = _increment_freq($c, $alphabet_size, $freq, $cf); |
|
758
|
|
|
|
|
|
|
|
|
759
|
4992
|
50
|
|
|
|
14764
|
if ($high > MAX) { |
|
760
|
0
|
|
|
|
|
0
|
confess "high > MAX: $high > ${\MAX}"; |
|
|
0
|
|
|
|
|
0
|
|
|
761
|
|
|
|
|
|
|
} |
|
762
|
|
|
|
|
|
|
|
|
763
|
4992
|
50
|
|
|
|
11830
|
if ($low >= $high) { confess "$low >= $high" } |
|
|
0
|
|
|
|
|
0
|
|
|
764
|
|
|
|
|
|
|
|
|
765
|
4992
|
|
|
|
|
7540
|
while (1) { |
|
766
|
|
|
|
|
|
|
|
|
767
|
18558
|
100
|
100
|
|
|
47517
|
if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) { |
|
|
|
100
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
|
|
769
|
10712
|
|
|
|
|
16354
|
my $bit = $high >> (BITS - 1); |
|
770
|
10712
|
|
|
|
|
16460
|
$enc .= $bit; |
|
771
|
|
|
|
|
|
|
|
|
772
|
10712
|
100
|
|
|
|
20388
|
if ($uf_count > 0) { |
|
773
|
1379
|
|
|
|
|
3450
|
$enc .= join('', 1 - $bit) x $uf_count; |
|
774
|
1379
|
|
|
|
|
2232
|
$uf_count = 0; |
|
775
|
|
|
|
|
|
|
} |
|
776
|
|
|
|
|
|
|
|
|
777
|
10712
|
|
|
|
|
16068
|
$low <<= 1; |
|
778
|
10712
|
|
|
|
|
18542
|
($high <<= 1) |= 1; |
|
779
|
|
|
|
|
|
|
} |
|
780
|
|
|
|
|
|
|
elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) { |
|
781
|
2854
|
|
|
|
|
4875
|
($high <<= 1) |= (1 << (BITS - 1)); |
|
782
|
2854
|
|
|
|
|
4194
|
$high |= 1; |
|
783
|
2854
|
|
|
|
|
4396
|
($low <<= 1) &= ((1 << (BITS - 1)) - 1); |
|
784
|
2854
|
|
|
|
|
4411
|
++$uf_count; |
|
785
|
|
|
|
|
|
|
} |
|
786
|
|
|
|
|
|
|
else { |
|
787
|
4992
|
|
|
|
|
12146
|
last; |
|
788
|
|
|
|
|
|
|
} |
|
789
|
|
|
|
|
|
|
|
|
790
|
13566
|
|
|
|
|
19732
|
$low &= MAX; |
|
791
|
13566
|
|
|
|
|
23421
|
$high &= MAX; |
|
792
|
|
|
|
|
|
|
} |
|
793
|
|
|
|
|
|
|
} |
|
794
|
|
|
|
|
|
|
|
|
795
|
167
|
|
|
|
|
391
|
$enc .= '0'; |
|
796
|
167
|
|
|
|
|
309
|
$enc .= '1'; |
|
797
|
|
|
|
|
|
|
|
|
798
|
167
|
|
|
|
|
593
|
while (length($enc) % 8 != 0) { |
|
799
|
683
|
|
|
|
|
1579
|
$enc .= '1'; |
|
800
|
|
|
|
|
|
|
} |
|
801
|
|
|
|
|
|
|
|
|
802
|
167
|
|
|
|
|
1459
|
return ($enc, \@alphabet); |
|
803
|
|
|
|
|
|
|
} |
|
804
|
|
|
|
|
|
|
|
|
805
|
180
|
|
|
180
|
1
|
315
|
sub adaptive_ac_decode ($fh, $alphabet) { |
|
|
180
|
|
|
|
|
386
|
|
|
|
180
|
|
|
|
|
282
|
|
|
|
180
|
|
|
|
|
293
|
|
|
806
|
|
|
|
|
|
|
|
|
807
|
180
|
100
|
|
|
|
599
|
if (ref($fh) eq '') { |
|
808
|
13
|
50
|
|
|
|
208
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
809
|
13
|
|
|
|
|
52
|
return __SUB__->($fh2, $alphabet); |
|
810
|
|
|
|
|
|
|
} |
|
811
|
|
|
|
|
|
|
|
|
812
|
167
|
|
|
|
|
291
|
my @dec; |
|
813
|
167
|
|
|
|
|
347
|
my $low = 0; |
|
814
|
167
|
|
|
|
|
310
|
my $high = MAX; |
|
815
|
|
|
|
|
|
|
|
|
816
|
167
|
|
|
|
|
302
|
my $alphabet_size = $#{$alphabet}; |
|
|
167
|
|
|
|
|
428
|
|
|
817
|
167
|
|
|
|
|
459
|
my ($freq, $cf, $T) = _create_adaptive_cfreq(INITIAL_FREQ, $alphabet_size); |
|
818
|
|
|
|
|
|
|
|
|
819
|
167
|
|
100
|
|
|
495
|
my $enc = oct('0b' . join '', map { getc($fh) // 1 } 1 .. BITS); |
|
|
5344
|
|
|
|
|
20417
|
|
|
820
|
|
|
|
|
|
|
|
|
821
|
167
|
|
|
|
|
1019
|
while (1) { |
|
822
|
4992
|
|
|
|
|
9589
|
my $w = ($high + 1) - $low; |
|
823
|
4992
|
|
|
|
|
20651
|
my $ss = int((($T * ($enc - $low + 1)) - 1) / $w); |
|
824
|
|
|
|
|
|
|
|
|
825
|
4992
|
|
|
|
|
8656
|
my $i = 0; |
|
826
|
4992
|
|
|
|
|
10147
|
foreach my $j (0 .. $alphabet_size) { |
|
827
|
45019
|
100
|
66
|
|
|
221472
|
if ($cf->[$j] <= $ss and $ss < $cf->[$j + 1]) { |
|
828
|
4992
|
|
|
|
|
8778
|
$i = $j; |
|
829
|
4992
|
|
|
|
|
10160
|
last; |
|
830
|
|
|
|
|
|
|
} |
|
831
|
|
|
|
|
|
|
} |
|
832
|
|
|
|
|
|
|
|
|
833
|
4992
|
100
|
|
|
|
10794
|
last if ($i == $alphabet_size); |
|
834
|
4825
|
|
|
|
|
10695
|
push @dec, $alphabet->[$i]; |
|
835
|
|
|
|
|
|
|
|
|
836
|
4825
|
|
|
|
|
11647
|
$high = ($low + int(($w * $cf->[$i + 1]) / $T) - 1) & MAX; |
|
837
|
4825
|
|
|
|
|
11885
|
$low = ($low + int(($w * $cf->[$i]) / $T)) & MAX; |
|
838
|
|
|
|
|
|
|
|
|
839
|
4825
|
|
|
|
|
10487
|
$T = _increment_freq($i, $alphabet_size, $freq, $cf); |
|
840
|
|
|
|
|
|
|
|
|
841
|
4825
|
50
|
|
|
|
11180
|
if ($high > MAX) { |
|
842
|
0
|
|
|
|
|
0
|
confess "high > MAX: ($high > ${\MAX})"; |
|
|
0
|
|
|
|
|
0
|
|
|
843
|
|
|
|
|
|
|
} |
|
844
|
|
|
|
|
|
|
|
|
845
|
4825
|
50
|
|
|
|
10454
|
if ($low >= $high) { confess "$low >= $high" } |
|
|
0
|
|
|
|
|
0
|
|
|
846
|
|
|
|
|
|
|
|
|
847
|
4825
|
|
|
|
|
7101
|
while (1) { |
|
848
|
|
|
|
|
|
|
|
|
849
|
17866
|
100
|
100
|
|
|
49548
|
if (($high >> (BITS - 1)) == ($low >> (BITS - 1))) { |
|
|
|
100
|
|
|
|
|
|
|
850
|
10298
|
|
|
|
|
15349
|
($high <<= 1) |= 1; |
|
851
|
10298
|
|
|
|
|
14087
|
$low <<= 1; |
|
852
|
10298
|
|
100
|
|
|
37065
|
($enc <<= 1) |= (getc($fh) // 1); |
|
853
|
|
|
|
|
|
|
} |
|
854
|
|
|
|
|
|
|
elsif (((($low >> (BITS - 2)) & 0x1) == 1) && ((($high >> (BITS - 2)) & 0x1) == 0)) { |
|
855
|
2743
|
|
|
|
|
4308
|
($high <<= 1) |= (1 << (BITS - 1)); |
|
856
|
2743
|
|
|
|
|
3872
|
$high |= 1; |
|
857
|
2743
|
|
|
|
|
7612
|
($low <<= 1) &= ((1 << (BITS - 1)) - 1); |
|
858
|
2743
|
|
100
|
|
|
13053
|
$enc = (($enc >> (BITS - 1)) << (BITS - 1)) | (($enc & ((1 << (BITS - 2)) - 1)) << 1) | (getc($fh) // 1); |
|
859
|
|
|
|
|
|
|
} |
|
860
|
|
|
|
|
|
|
else { |
|
861
|
4825
|
|
|
|
|
14254
|
last; |
|
862
|
|
|
|
|
|
|
} |
|
863
|
|
|
|
|
|
|
|
|
864
|
13041
|
|
|
|
|
20807
|
$low &= MAX; |
|
865
|
13041
|
|
|
|
|
18204
|
$high &= MAX; |
|
866
|
13041
|
|
|
|
|
22263
|
$enc &= MAX; |
|
867
|
|
|
|
|
|
|
} |
|
868
|
|
|
|
|
|
|
} |
|
869
|
|
|
|
|
|
|
|
|
870
|
167
|
|
|
|
|
1731
|
return \@dec; |
|
871
|
|
|
|
|
|
|
} |
|
872
|
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
##################### |
|
874
|
|
|
|
|
|
|
# Generic run-length |
|
875
|
|
|
|
|
|
|
##################### |
|
876
|
|
|
|
|
|
|
|
|
877
|
4116
|
|
|
4116
|
1
|
6248
|
sub run_length ($arr, $max_run = undef) { |
|
|
4116
|
|
|
|
|
6193
|
|
|
|
4116
|
|
|
|
|
7276
|
|
|
|
4116
|
|
|
|
|
6082
|
|
|
878
|
|
|
|
|
|
|
|
|
879
|
4116
|
100
|
|
|
|
10549
|
@$arr || return []; |
|
880
|
|
|
|
|
|
|
|
|
881
|
3979
|
|
|
|
|
14038
|
my @result = [$arr->[0], 1]; |
|
882
|
3979
|
|
|
|
|
9069
|
my $prev_value = $arr->[0]; |
|
883
|
|
|
|
|
|
|
|
|
884
|
3979
|
|
|
|
|
12069
|
foreach my $i (1 .. $#$arr) { |
|
885
|
|
|
|
|
|
|
|
|
886
|
487222
|
|
|
|
|
744765
|
my $curr_value = $arr->[$i]; |
|
887
|
|
|
|
|
|
|
|
|
888
|
487222
|
100
|
100
|
|
|
1558745
|
if ($curr_value == $prev_value and (defined($max_run) ? $result[-1][1] < $max_run : 1)) { |
|
|
|
100
|
|
|
|
|
|
|
889
|
450934
|
|
|
|
|
704640
|
++$result[-1][1]; |
|
890
|
|
|
|
|
|
|
} |
|
891
|
|
|
|
|
|
|
else { |
|
892
|
36288
|
|
|
|
|
88766
|
push(@result, [$curr_value, 1]); |
|
893
|
|
|
|
|
|
|
} |
|
894
|
|
|
|
|
|
|
|
|
895
|
487222
|
|
|
|
|
852868
|
$prev_value = $curr_value; |
|
896
|
|
|
|
|
|
|
} |
|
897
|
|
|
|
|
|
|
|
|
898
|
3979
|
|
|
|
|
11815
|
return \@result; |
|
899
|
|
|
|
|
|
|
} |
|
900
|
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
###################################### |
|
902
|
|
|
|
|
|
|
# Binary variable run-length encoding |
|
903
|
|
|
|
|
|
|
###################################### |
|
904
|
|
|
|
|
|
|
|
|
905
|
1
|
|
|
1
|
1
|
1535
|
sub binary_vrl_encode ($bitstring) { |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
4
|
|
|
906
|
|
|
|
|
|
|
|
|
907
|
1
|
|
|
|
|
27
|
my @bits = split(//, $bitstring); |
|
908
|
1
|
|
|
|
|
498
|
my $encoded = $bits[0]; |
|
909
|
|
|
|
|
|
|
|
|
910
|
1
|
|
|
|
|
5
|
foreach my $rle (@{run_length(\@bits)}) { |
|
|
1
|
|
|
|
|
7
|
|
|
911
|
23
|
|
|
|
|
55
|
my ($c, $v) = @$rle; |
|
912
|
|
|
|
|
|
|
|
|
913
|
23
|
100
|
|
|
|
65
|
if ($v == 1) { |
|
914
|
13
|
|
|
|
|
33
|
$encoded .= '0'; |
|
915
|
|
|
|
|
|
|
} |
|
916
|
|
|
|
|
|
|
else { |
|
917
|
10
|
|
|
|
|
25
|
my $t = sprintf('%b', $v - 1); |
|
918
|
10
|
|
|
|
|
37
|
$encoded .= join('', '1' x length($t), '0', substr($t, 1)); |
|
919
|
|
|
|
|
|
|
} |
|
920
|
|
|
|
|
|
|
} |
|
921
|
|
|
|
|
|
|
|
|
922
|
1
|
|
|
|
|
23
|
return $encoded; |
|
923
|
|
|
|
|
|
|
} |
|
924
|
|
|
|
|
|
|
|
|
925
|
1
|
|
|
1
|
1
|
8
|
sub binary_vrl_decode ($bitstring) { |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
3
|
|
|
926
|
|
|
|
|
|
|
|
|
927
|
1
|
|
|
|
|
3
|
my $decoded = ''; |
|
928
|
1
|
|
|
|
|
4
|
my $bit = substr($bitstring, 0, 1, ''); |
|
929
|
|
|
|
|
|
|
|
|
930
|
1
|
|
|
|
|
7
|
while ($bitstring ne '') { |
|
931
|
|
|
|
|
|
|
|
|
932
|
23
|
|
|
|
|
63
|
$decoded .= $bit; |
|
933
|
|
|
|
|
|
|
|
|
934
|
23
|
|
|
|
|
40
|
my $bl = 0; |
|
935
|
23
|
|
|
|
|
61
|
while (substr($bitstring, 0, 1, '') eq '1') { |
|
936
|
25
|
|
|
|
|
63
|
++$bl; |
|
937
|
|
|
|
|
|
|
} |
|
938
|
|
|
|
|
|
|
|
|
939
|
23
|
100
|
|
|
|
51
|
if ($bl > 0) { |
|
940
|
10
|
|
|
|
|
25
|
$decoded .= $bit x oct('0b1' . join('', map { substr($bitstring, 0, 1, '') } 1 .. $bl - 1)); |
|
|
15
|
|
|
|
|
51
|
|
|
941
|
|
|
|
|
|
|
} |
|
942
|
|
|
|
|
|
|
|
|
943
|
23
|
100
|
|
|
|
79
|
$bit = ($bit eq '1' ? '0' : '1'); |
|
944
|
|
|
|
|
|
|
} |
|
945
|
|
|
|
|
|
|
|
|
946
|
1
|
|
|
|
|
6
|
return $decoded; |
|
947
|
|
|
|
|
|
|
} |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
############################ |
|
950
|
|
|
|
|
|
|
# Burrows-Wheeler transform |
|
951
|
|
|
|
|
|
|
############################ |
|
952
|
|
|
|
|
|
|
|
|
953
|
31
|
|
|
31
|
1
|
57
|
sub bwt_sort ($s, $LOOKAHEAD_LEN = 128) { # O(n * LOOKAHEAD_LEN) space (fast) |
|
|
31
|
|
|
|
|
85
|
|
|
|
31
|
|
|
|
|
74
|
|
|
|
31
|
|
|
|
|
105
|
|
|
954
|
|
|
|
|
|
|
#<<< |
|
955
|
|
|
|
|
|
|
[ |
|
956
|
63237
|
|
|
|
|
143857
|
map { $_->[1] } sort { |
|
957
|
770584
|
50
|
|
|
|
2127113
|
($a->[0] cmp $b->[0]) |
|
958
|
|
|
|
|
|
|
|| ((substr($s, $a->[1]) . substr($s, 0, $a->[1])) cmp (substr($s, $b->[1]) . substr($s, 0, $b->[1]))) |
|
959
|
|
|
|
|
|
|
} |
|
960
|
|
|
|
|
|
|
map { |
|
961
|
31
|
|
|
|
|
2989
|
my $t = substr($s, $_, $LOOKAHEAD_LEN); |
|
|
63237
|
|
|
|
|
139802
|
|
|
962
|
|
|
|
|
|
|
|
|
963
|
63237
|
100
|
|
|
|
143766
|
if (length($t) < $LOOKAHEAD_LEN) { |
|
964
|
1344
|
100
|
|
|
|
3041
|
$t .= substr($s, 0, ($_ < $LOOKAHEAD_LEN) ? $_ : ($LOOKAHEAD_LEN - length($t))); |
|
965
|
|
|
|
|
|
|
} |
|
966
|
|
|
|
|
|
|
|
|
967
|
63237
|
|
|
|
|
199530
|
[$t, $_] |
|
968
|
|
|
|
|
|
|
} 0 .. length($s) - 1 |
|
969
|
|
|
|
|
|
|
]; |
|
970
|
|
|
|
|
|
|
#>>> |
|
971
|
|
|
|
|
|
|
} |
|
972
|
|
|
|
|
|
|
|
|
973
|
31
|
|
|
31
|
1
|
88
|
sub bwt_encode ($s, $LOOKAHEAD_LEN = 128) { |
|
|
31
|
|
|
|
|
37147
|
|
|
|
31
|
|
|
|
|
93
|
|
|
|
31
|
|
|
|
|
88
|
|
|
974
|
|
|
|
|
|
|
|
|
975
|
31
|
50
|
|
|
|
177
|
if (ref($s) ne '') { |
|
976
|
0
|
|
|
|
|
0
|
return bwt_encode_symbolic($s); |
|
977
|
|
|
|
|
|
|
} |
|
978
|
|
|
|
|
|
|
|
|
979
|
31
|
|
|
|
|
170
|
my $bwt = bwt_sort($s, $LOOKAHEAD_LEN); |
|
980
|
31
|
|
|
|
|
21507
|
my $ret = join('', map { substr($s, $_ - 1, 1) } @$bwt); |
|
|
63237
|
|
|
|
|
144601
|
|
|
981
|
|
|
|
|
|
|
|
|
982
|
31
|
|
|
|
|
8721
|
my $idx = 0; |
|
983
|
31
|
|
|
|
|
129
|
foreach my $i (@$bwt) { |
|
984
|
10385
|
100
|
|
|
|
20553
|
$i || last; |
|
985
|
10355
|
|
|
|
|
18198
|
++$idx; |
|
986
|
|
|
|
|
|
|
} |
|
987
|
|
|
|
|
|
|
|
|
988
|
31
|
|
|
|
|
6658
|
return ($ret, $idx); |
|
989
|
|
|
|
|
|
|
} |
|
990
|
|
|
|
|
|
|
|
|
991
|
43
|
|
|
43
|
1
|
104
|
sub bwt_decode ($bwt, $idx) { # fast inversion |
|
|
43
|
|
|
|
|
96
|
|
|
|
43
|
|
|
|
|
126
|
|
|
|
43
|
|
|
|
|
119
|
|
|
992
|
|
|
|
|
|
|
|
|
993
|
43
|
|
|
|
|
11901
|
my @tail = split(//, $bwt); |
|
994
|
43
|
|
|
|
|
37770
|
my @head = sort @tail; |
|
995
|
|
|
|
|
|
|
|
|
996
|
43
|
|
|
|
|
109
|
my %indices; |
|
997
|
43
|
|
|
|
|
217
|
foreach my $i (0 .. $#tail) { |
|
998
|
63573
|
|
|
|
|
104058
|
push @{$indices{$tail[$i]}}, $i; |
|
|
63573
|
|
|
|
|
170059
|
|
|
999
|
|
|
|
|
|
|
} |
|
1000
|
|
|
|
|
|
|
|
|
1001
|
43
|
|
|
|
|
113
|
my @table; |
|
1002
|
43
|
|
|
|
|
134
|
foreach my $v (@head) { |
|
1003
|
63573
|
|
|
|
|
104791
|
push @table, shift(@{$indices{$v}}); |
|
|
63573
|
|
|
|
|
163481
|
|
|
1004
|
|
|
|
|
|
|
} |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
43
|
|
|
|
|
136
|
my $dec = ''; |
|
1007
|
43
|
|
|
|
|
128
|
my $i = $idx; |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
43
|
|
|
|
|
203
|
for (1 .. scalar(@head)) { |
|
1010
|
63573
|
|
|
|
|
133023
|
$dec .= $head[$i]; |
|
1011
|
63573
|
|
|
|
|
129151
|
$i = $table[$i]; |
|
1012
|
|
|
|
|
|
|
} |
|
1013
|
|
|
|
|
|
|
|
|
1014
|
43
|
|
|
|
|
13395
|
return $dec; |
|
1015
|
|
|
|
|
|
|
} |
|
1016
|
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
############################################## |
|
1018
|
|
|
|
|
|
|
# Burrows-Wheeler transform (symbolic variant) |
|
1019
|
|
|
|
|
|
|
############################################## |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
55
|
|
|
55
|
1
|
131
|
sub bwt_sort_symbolic ($s) { # O(n) space (slowish) |
|
|
55
|
|
|
|
|
105
|
|
|
|
55
|
|
|
|
|
80
|
|
|
1022
|
|
|
|
|
|
|
|
|
1023
|
55
|
|
|
|
|
699
|
my @cyclic = @$s; |
|
1024
|
55
|
|
|
|
|
128
|
my $len = scalar(@cyclic); |
|
1025
|
|
|
|
|
|
|
|
|
1026
|
55
|
|
|
|
|
133
|
my $rle = 1; |
|
1027
|
55
|
|
|
|
|
233
|
foreach my $i (1 .. $len - 1) { |
|
1028
|
69
|
100
|
|
|
|
407
|
if ($cyclic[$i] != $cyclic[$i - 1]) { |
|
1029
|
42
|
|
|
|
|
112
|
$rle = 0; |
|
1030
|
42
|
|
|
|
|
128
|
last; |
|
1031
|
|
|
|
|
|
|
} |
|
1032
|
|
|
|
|
|
|
} |
|
1033
|
|
|
|
|
|
|
|
|
1034
|
55
|
100
|
|
|
|
239
|
$rle && return [0 .. $len - 1]; |
|
1035
|
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
[ |
|
1037
|
|
|
|
|
|
|
sort { |
|
1038
|
42
|
|
|
|
|
527
|
my ($i, $j) = ($a, $b); |
|
|
26259
|
|
|
|
|
49123
|
|
|
1039
|
|
|
|
|
|
|
|
|
1040
|
26259
|
|
|
|
|
61816
|
while ($cyclic[$i] == $cyclic[$j]) { |
|
1041
|
18480
|
100
|
|
|
|
42270
|
$i %= $len if (++$i >= $len); |
|
1042
|
18480
|
100
|
|
|
|
56948
|
$j %= $len if (++$j >= $len); |
|
1043
|
|
|
|
|
|
|
} |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
26259
|
|
|
|
|
50707
|
$cyclic[$i] <=> $cyclic[$j]; |
|
1046
|
|
|
|
|
|
|
} 0 .. $len - 1 |
|
1047
|
|
|
|
|
|
|
]; |
|
1048
|
|
|
|
|
|
|
} |
|
1049
|
|
|
|
|
|
|
|
|
1050
|
55
|
|
|
55
|
1
|
110
|
sub bwt_encode_symbolic ($symbols) { |
|
|
55
|
|
|
|
|
97
|
|
|
|
55
|
|
|
|
|
118
|
|
|
1051
|
|
|
|
|
|
|
|
|
1052
|
55
|
50
|
|
|
|
215
|
if (ref($symbols) eq '') { |
|
1053
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
1054
|
|
|
|
|
|
|
} |
|
1055
|
|
|
|
|
|
|
|
|
1056
|
55
|
|
|
|
|
220
|
my $bwt = bwt_sort_symbolic($symbols); |
|
1057
|
55
|
|
|
|
|
305
|
my @ret = map { $symbols->[$_ - 1] } @$bwt; |
|
|
3789
|
|
|
|
|
7526
|
|
|
1058
|
|
|
|
|
|
|
|
|
1059
|
55
|
|
|
|
|
180
|
my $idx = 0; |
|
1060
|
55
|
|
|
|
|
152
|
foreach my $i (@$bwt) { |
|
1061
|
1193
|
100
|
|
|
|
2481
|
$i || last; |
|
1062
|
1142
|
|
|
|
|
2173
|
++$idx; |
|
1063
|
|
|
|
|
|
|
} |
|
1064
|
|
|
|
|
|
|
|
|
1065
|
55
|
|
|
|
|
336
|
return (\@ret, $idx); |
|
1066
|
|
|
|
|
|
|
} |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
55
|
|
|
55
|
1
|
110
|
sub bwt_decode_symbolic ($bwt, $idx) { # fast inversion |
|
|
55
|
|
|
|
|
104
|
|
|
|
55
|
|
|
|
|
145
|
|
|
|
55
|
|
|
|
|
130
|
|
|
1069
|
|
|
|
|
|
|
|
|
1070
|
55
|
|
|
|
|
369
|
my @head = sort { $a <=> $b } @$bwt; |
|
|
23898
|
|
|
|
|
39380
|
|
|
1071
|
|
|
|
|
|
|
|
|
1072
|
55
|
|
|
|
|
115
|
my %indices; |
|
1073
|
55
|
|
|
|
|
203
|
foreach my $i (0 .. $#head) { |
|
1074
|
3789
|
|
|
|
|
6101
|
push @{$indices{$bwt->[$i]}}, $i; |
|
|
3789
|
|
|
|
|
11726
|
|
|
1075
|
|
|
|
|
|
|
} |
|
1076
|
|
|
|
|
|
|
|
|
1077
|
55
|
|
|
|
|
107
|
my @table; |
|
1078
|
55
|
|
|
|
|
132
|
foreach my $v (@head) { |
|
1079
|
3789
|
|
|
|
|
6034
|
push @table, shift(@{$indices{$v}}); |
|
|
3789
|
|
|
|
|
10138
|
|
|
1080
|
|
|
|
|
|
|
} |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
55
|
|
|
|
|
148
|
my @dec; |
|
1083
|
55
|
|
|
|
|
160
|
my $i = $idx; |
|
1084
|
|
|
|
|
|
|
|
|
1085
|
55
|
|
|
|
|
197
|
for (1 .. scalar(@head)) { |
|
1086
|
3789
|
|
|
|
|
8634
|
push @dec, $head[$i]; |
|
1087
|
3789
|
|
|
|
|
8615
|
$i = $table[$i]; |
|
1088
|
|
|
|
|
|
|
} |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
55
|
|
|
|
|
856
|
return \@dec; |
|
1091
|
|
|
|
|
|
|
} |
|
1092
|
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
##################### |
|
1094
|
|
|
|
|
|
|
# RLE4 used in Bzip2 |
|
1095
|
|
|
|
|
|
|
##################### |
|
1096
|
|
|
|
|
|
|
|
|
1097
|
2212
|
|
|
2212
|
1
|
8930
|
sub rle4_encode ($symbols, $max_run = 255) { # RLE1 |
|
|
2212
|
|
|
|
|
5964
|
|
|
|
2212
|
|
|
|
|
4323
|
|
|
|
2212
|
|
|
|
|
3189
|
|
|
1098
|
|
|
|
|
|
|
|
|
1099
|
2212
|
100
|
|
|
|
11375
|
if (ref($symbols) eq '') { |
|
1100
|
12
|
|
|
|
|
64
|
$symbols = string2symbols($symbols); |
|
1101
|
|
|
|
|
|
|
} |
|
1102
|
|
|
|
|
|
|
|
|
1103
|
2212
|
|
|
|
|
3879
|
my $end = $#{$symbols}; |
|
|
2212
|
|
|
|
|
6678
|
|
|
1104
|
2212
|
100
|
|
|
|
5929
|
return [] if ($end < 0); |
|
1105
|
|
|
|
|
|
|
|
|
1106
|
2130
|
|
|
|
|
4616
|
my $prev = $symbols->[0]; |
|
1107
|
2130
|
|
|
|
|
3560
|
my $run = 1; |
|
1108
|
2130
|
|
|
|
|
5889
|
my @rle = ($prev); |
|
1109
|
|
|
|
|
|
|
|
|
1110
|
2130
|
|
|
|
|
5990
|
for (my $i = 1 ; $i <= $end ; ++$i) { |
|
1111
|
|
|
|
|
|
|
|
|
1112
|
118172
|
100
|
|
|
|
311197
|
if ($symbols->[$i] == $prev) { |
|
1113
|
19678
|
|
|
|
|
38058
|
++$run; |
|
1114
|
|
|
|
|
|
|
} |
|
1115
|
|
|
|
|
|
|
else { |
|
1116
|
98494
|
|
|
|
|
180389
|
$run = 1; |
|
1117
|
98494
|
|
|
|
|
202597
|
$prev = $symbols->[$i]; |
|
1118
|
|
|
|
|
|
|
} |
|
1119
|
|
|
|
|
|
|
|
|
1120
|
118172
|
|
|
|
|
279380
|
push @rle, $prev; |
|
1121
|
|
|
|
|
|
|
|
|
1122
|
118172
|
100
|
|
|
|
403979
|
if ($run >= 4) { |
|
1123
|
|
|
|
|
|
|
|
|
1124
|
3278
|
|
|
|
|
5852
|
$run = 0; |
|
1125
|
3278
|
|
|
|
|
6334
|
$i += 1; |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
3278
|
|
100
|
|
|
25048
|
while ($run < $max_run and $i <= $end and $symbols->[$i] == $prev) { |
|
|
|
|
100
|
|
|
|
|
|
1128
|
437139
|
|
|
|
|
631268
|
++$run; |
|
1129
|
437139
|
|
|
|
|
1862689
|
++$i; |
|
1130
|
|
|
|
|
|
|
} |
|
1131
|
|
|
|
|
|
|
|
|
1132
|
3278
|
|
|
|
|
8486
|
push @rle, $run; |
|
1133
|
3278
|
|
|
|
|
7088
|
$run = 1; |
|
1134
|
|
|
|
|
|
|
|
|
1135
|
3278
|
100
|
|
|
|
7948
|
if ($i <= $end) { |
|
1136
|
3191
|
|
|
|
|
6461
|
$prev = $symbols->[$i]; |
|
1137
|
3191
|
|
|
|
|
13898
|
push @rle, $symbols->[$i]; |
|
1138
|
|
|
|
|
|
|
} |
|
1139
|
|
|
|
|
|
|
} |
|
1140
|
|
|
|
|
|
|
} |
|
1141
|
|
|
|
|
|
|
|
|
1142
|
2130
|
|
|
|
|
14624
|
return \@rle; |
|
1143
|
|
|
|
|
|
|
} |
|
1144
|
|
|
|
|
|
|
|
|
1145
|
234
|
|
|
234
|
1
|
507
|
sub rle4_decode ($symbols) { # RLE1 |
|
|
234
|
|
|
|
|
550
|
|
|
|
234
|
|
|
|
|
432
|
|
|
1146
|
|
|
|
|
|
|
|
|
1147
|
234
|
50
|
|
|
|
948
|
if (ref($symbols) eq '') { |
|
1148
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
1149
|
|
|
|
|
|
|
} |
|
1150
|
|
|
|
|
|
|
|
|
1151
|
234
|
|
|
|
|
434
|
my $end = $#{$symbols}; |
|
|
234
|
|
|
|
|
543
|
|
|
1152
|
234
|
100
|
|
|
|
846
|
return [] if ($end < 0); |
|
1153
|
|
|
|
|
|
|
|
|
1154
|
220
|
|
|
|
|
865
|
my @dec = $symbols->[0]; |
|
1155
|
220
|
|
|
|
|
570
|
my $prev = $symbols->[0]; |
|
1156
|
220
|
|
|
|
|
661
|
my $run = 1; |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
220
|
|
|
|
|
916
|
for (my $i = 1 ; $i <= $end ; ++$i) { |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
98364
|
100
|
|
|
|
235614
|
if ($symbols->[$i] == $prev) { |
|
1161
|
13278
|
|
|
|
|
23516
|
++$run; |
|
1162
|
|
|
|
|
|
|
} |
|
1163
|
|
|
|
|
|
|
else { |
|
1164
|
85086
|
|
|
|
|
146615
|
$run = 1; |
|
1165
|
85086
|
|
|
|
|
154322
|
$prev = $symbols->[$i]; |
|
1166
|
|
|
|
|
|
|
} |
|
1167
|
|
|
|
|
|
|
|
|
1168
|
98364
|
|
|
|
|
212366
|
push @dec, $prev; |
|
1169
|
|
|
|
|
|
|
|
|
1170
|
98364
|
100
|
|
|
|
302633
|
if ($run >= 4) { |
|
1171
|
1941
|
50
|
|
|
|
5714
|
if (++$i <= $end) { |
|
1172
|
1941
|
|
|
|
|
4636
|
$run = $symbols->[$i]; |
|
1173
|
1941
|
|
|
|
|
9670
|
push @dec, (($prev) x $run); |
|
1174
|
|
|
|
|
|
|
} |
|
1175
|
|
|
|
|
|
|
|
|
1176
|
1941
|
|
|
|
|
5977
|
$run = 0; |
|
1177
|
|
|
|
|
|
|
} |
|
1178
|
|
|
|
|
|
|
} |
|
1179
|
|
|
|
|
|
|
|
|
1180
|
220
|
|
|
|
|
820
|
return \@dec; |
|
1181
|
|
|
|
|
|
|
} |
|
1182
|
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
####################### |
|
1184
|
|
|
|
|
|
|
# Delta encoding (+RLE) |
|
1185
|
|
|
|
|
|
|
####################### |
|
1186
|
|
|
|
|
|
|
|
|
1187
|
3992
|
|
|
3992
|
|
6239
|
sub _compute_elias_costs ($run_length) { |
|
|
3992
|
|
|
|
|
6063
|
|
|
|
3992
|
|
|
|
|
5640
|
|
|
1188
|
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
# Check which method results in better compression |
|
1190
|
3992
|
|
|
|
|
6250
|
my $with_rle = 0; |
|
1191
|
3992
|
|
|
|
|
5823
|
my $without_rle = 0; |
|
1192
|
|
|
|
|
|
|
|
|
1193
|
3992
|
|
|
|
|
6889
|
my $double_with_rle = 0; |
|
1194
|
3992
|
|
|
|
|
6024
|
my $double_without_rle = 0; |
|
1195
|
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
# Check if there are any negative values or zero values |
|
1197
|
3992
|
|
|
|
|
6098
|
my $has_negative = 0; |
|
1198
|
3992
|
|
|
|
|
5848
|
my $has_zero = 0; |
|
1199
|
|
|
|
|
|
|
|
|
1200
|
3992
|
|
|
|
|
8039
|
foreach my $pair (@$run_length) { |
|
1201
|
38913
|
|
|
|
|
76340
|
my ($c, $v) = @$pair; |
|
1202
|
|
|
|
|
|
|
|
|
1203
|
38913
|
100
|
100
|
|
|
101163
|
if ($c < 0 and not $has_negative) { |
|
1204
|
876
|
|
|
|
|
1653
|
$has_negative = 1; |
|
1205
|
|
|
|
|
|
|
} |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
38913
|
100
|
|
|
|
73174
|
if ($c == 0) { |
|
1208
|
7379
|
|
|
|
|
11921
|
$with_rle += 1; |
|
1209
|
7379
|
|
|
|
|
11159
|
$double_with_rle += 1; |
|
1210
|
7379
|
|
|
|
|
11054
|
$without_rle += $v; |
|
1211
|
7379
|
|
|
|
|
10607
|
$double_without_rle += $v; |
|
1212
|
7379
|
|
100
|
|
|
19749
|
$has_zero ||= 1; |
|
1213
|
|
|
|
|
|
|
} |
|
1214
|
|
|
|
|
|
|
else { |
|
1215
|
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
{ # double |
|
1217
|
31534
|
|
|
|
|
72551
|
my $t = int(log(abs($c) + 1) / log(2) + 1); |
|
1218
|
31534
|
|
|
|
|
58745
|
my $l = int(log($t) / log(2) + 1); |
|
1219
|
31534
|
|
|
|
|
60579
|
my $len = 2 * ($l - 1) + ($t - 1) + 3; |
|
1220
|
|
|
|
|
|
|
|
|
1221
|
31534
|
|
|
|
|
46544
|
$double_with_rle += $len; |
|
1222
|
31534
|
|
|
|
|
60537
|
$double_without_rle += $len * $v; |
|
1223
|
|
|
|
|
|
|
} |
|
1224
|
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
{ # single |
|
1226
|
31534
|
|
|
|
|
45827
|
my $t = int(log(abs($c) + 1) / log(2) + 1); |
|
|
31534
|
|
|
|
|
44348
|
|
|
|
31534
|
|
|
|
|
63426
|
|
|
1227
|
31534
|
|
|
|
|
55852
|
my $len = 2 * ($t - 1) + 3; |
|
1228
|
31534
|
|
|
|
|
48686
|
$with_rle += $len; |
|
1229
|
31534
|
|
|
|
|
59095
|
$without_rle += $len * $v; |
|
1230
|
|
|
|
|
|
|
} |
|
1231
|
|
|
|
|
|
|
} |
|
1232
|
|
|
|
|
|
|
|
|
1233
|
38913
|
100
|
|
|
|
73017
|
if ($v == 1) { |
|
1234
|
32198
|
|
|
|
|
46890
|
$with_rle += 1; |
|
1235
|
32198
|
|
|
|
|
68841
|
$double_with_rle += 1; |
|
1236
|
|
|
|
|
|
|
} |
|
1237
|
|
|
|
|
|
|
else { |
|
1238
|
6715
|
|
|
|
|
14385
|
my $t = int(log($v) / log(2) + 1); |
|
1239
|
6715
|
|
|
|
|
12017
|
my $len = 2 * ($t - 1) + 1; |
|
1240
|
6715
|
|
|
|
|
10938
|
$with_rle += $len; |
|
1241
|
6715
|
|
|
|
|
17293
|
$double_with_rle += $len; |
|
1242
|
|
|
|
|
|
|
} |
|
1243
|
|
|
|
|
|
|
} |
|
1244
|
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
scalar { |
|
1246
|
3992
|
|
|
|
|
31116
|
has_negative => $has_negative, |
|
1247
|
|
|
|
|
|
|
has_zero => $has_zero, |
|
1248
|
|
|
|
|
|
|
methods => { |
|
1249
|
|
|
|
|
|
|
with_rle => $with_rle, |
|
1250
|
|
|
|
|
|
|
without_rle => $without_rle, |
|
1251
|
|
|
|
|
|
|
double_with_rle => $double_with_rle, |
|
1252
|
|
|
|
|
|
|
double_without_rle => $double_without_rle, |
|
1253
|
|
|
|
|
|
|
}, |
|
1254
|
|
|
|
|
|
|
}; |
|
1255
|
|
|
|
|
|
|
} |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
3992
|
|
|
3992
|
|
7305
|
sub _find_best_encoding_method ($integers) { |
|
|
3992
|
|
|
|
|
7282
|
|
|
|
3992
|
|
|
|
|
6001
|
|
|
1258
|
3992
|
|
|
|
|
9810
|
my $rl = run_length($integers); |
|
1259
|
3992
|
|
|
|
|
9827
|
my $costs = _compute_elias_costs($rl); |
|
1260
|
3992
|
|
|
|
|
7539
|
my ($best_method) = sort { $costs->{methods}{$a} <=> $costs->{methods}{$b} } sort keys(%{$costs->{methods}}); |
|
|
19086
|
|
|
|
|
46055
|
|
|
|
3992
|
|
|
|
|
26642
|
|
|
1261
|
3992
|
50
|
|
|
|
12468
|
$VERBOSE && say STDERR "$best_method --> $costs->{methods}{$best_method}"; |
|
1262
|
3992
|
|
|
|
|
19360
|
return ($rl, $best_method, $costs); |
|
1263
|
|
|
|
|
|
|
} |
|
1264
|
|
|
|
|
|
|
|
|
1265
|
998
|
|
|
998
|
1
|
1912
|
sub delta_encode ($integers) { |
|
|
998
|
|
|
|
|
1696
|
|
|
|
998
|
|
|
|
|
2057
|
|
|
1266
|
|
|
|
|
|
|
|
|
1267
|
998
|
|
|
|
|
3000
|
my $deltas = deltas($integers); |
|
1268
|
|
|
|
|
|
|
|
|
1269
|
998
|
|
|
|
|
2956
|
my @methods = ( |
|
1270
|
|
|
|
|
|
|
[_find_best_encoding_method($integers), 0, 0], |
|
1271
|
|
|
|
|
|
|
[_find_best_encoding_method($deltas), 1, 0], |
|
1272
|
|
|
|
|
|
|
[_find_best_encoding_method(rle4_encode($integers, scalar(@$integers) + 1)), 0, 1], |
|
1273
|
|
|
|
|
|
|
[_find_best_encoding_method(rle4_encode($deltas, scalar(@$integers) + 1)), 1, 1], |
|
1274
|
|
|
|
|
|
|
); |
|
1275
|
|
|
|
|
|
|
|
|
1276
|
998
|
|
|
|
|
6131
|
my ($best) = sort { $a->[2]{methods}{$a->[1]} <=> $b->[2]{methods}{$b->[1]} } @methods; |
|
|
4626
|
|
|
|
|
12494
|
|
|
1277
|
|
|
|
|
|
|
|
|
1278
|
998
|
|
|
|
|
3263
|
my ($rl, $method, $stats, $with_deltas, $with_rle4) = @$best; |
|
1279
|
|
|
|
|
|
|
|
|
1280
|
998
|
|
|
|
|
1697
|
my $double = 0; |
|
1281
|
998
|
|
|
|
|
5842
|
my $with_rle = 0; |
|
1282
|
998
|
|
|
|
|
2290
|
my $has_negative = $stats->{has_negative}; |
|
1283
|
|
|
|
|
|
|
|
|
1284
|
998
|
100
|
|
|
|
4533
|
if ($method eq 'with_rle') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1285
|
248
|
|
|
|
|
582
|
$with_rle = 1; |
|
1286
|
|
|
|
|
|
|
} |
|
1287
|
|
|
|
|
|
|
elsif ($method eq 'without_rle') { |
|
1288
|
|
|
|
|
|
|
## ok |
|
1289
|
|
|
|
|
|
|
} |
|
1290
|
|
|
|
|
|
|
elsif ($method eq 'double_with_rle') { |
|
1291
|
67
|
|
|
|
|
181
|
$with_rle = 1; |
|
1292
|
67
|
|
|
|
|
145
|
$double = 1; |
|
1293
|
|
|
|
|
|
|
} |
|
1294
|
|
|
|
|
|
|
elsif ($method eq 'double_without_rle') { |
|
1295
|
293
|
|
|
|
|
514
|
$double = 1; |
|
1296
|
|
|
|
|
|
|
} |
|
1297
|
|
|
|
|
|
|
else { |
|
1298
|
0
|
|
|
|
|
0
|
confess "[BUG] Unknown encoding method: $method"; |
|
1299
|
|
|
|
|
|
|
} |
|
1300
|
|
|
|
|
|
|
|
|
1301
|
998
|
|
|
|
|
2087
|
my $code = ''; |
|
1302
|
998
|
|
|
|
|
3722
|
my $bitstring = join('', $double, $with_rle, $has_negative, $with_deltas, $with_rle4); |
|
1303
|
998
|
|
100
|
|
|
2897
|
my $length = sum(map { $_->[1] } @$rl) // 0; |
|
|
8765
|
|
|
|
|
20675
|
|
|
1304
|
|
|
|
|
|
|
|
|
1305
|
998
|
|
|
|
|
3919
|
foreach my $pair ([$length, 1], @$rl) { |
|
1306
|
9763
|
|
|
|
|
19178
|
my ($d, $v) = @$pair; |
|
1307
|
|
|
|
|
|
|
|
|
1308
|
9763
|
100
|
|
|
|
22151
|
if ($d == 0) { |
|
|
|
100
|
|
|
|
|
|
|
1309
|
2117
|
|
|
|
|
4013
|
$code = '0'; |
|
1310
|
|
|
|
|
|
|
} |
|
1311
|
|
|
|
|
|
|
elsif ($double) { |
|
1312
|
2543
|
|
|
|
|
5563
|
my $t = sprintf('%b', abs($d) + 1); |
|
1313
|
2543
|
|
|
|
|
4684
|
my $l = sprintf('%b', length($t)); |
|
1314
|
2543
|
100
|
|
|
|
8892
|
$code = ($has_negative ? ('1' . (($d < 0) ? '0' : '1')) : '') . ('1' x (length($l) - 1)) . '0' . substr($l, 1) . substr($t, 1); |
|
|
|
100
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
} |
|
1316
|
|
|
|
|
|
|
else { |
|
1317
|
5103
|
100
|
|
|
|
14652
|
my $t = sprintf('%b', abs($d) + ($has_negative ? 0 : 1)); |
|
1318
|
5103
|
100
|
|
|
|
18584
|
$code = ($has_negative ? ('1' . (($d < 0) ? '0' : '1')) : '') . ('1' x (length($t) - 1)) . '0' . substr($t, 1); |
|
|
|
100
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
} |
|
1320
|
|
|
|
|
|
|
|
|
1321
|
9763
|
|
|
|
|
15605
|
$bitstring .= $code; |
|
1322
|
|
|
|
|
|
|
|
|
1323
|
9763
|
100
|
|
|
|
19408
|
if (not $with_rle) { |
|
1324
|
6301
|
100
|
|
|
|
12813
|
if ($v > 1) { |
|
1325
|
625
|
|
|
|
|
1390
|
$bitstring .= $code x ($v - 1); |
|
1326
|
|
|
|
|
|
|
} |
|
1327
|
6301
|
|
|
|
|
13168
|
next; |
|
1328
|
|
|
|
|
|
|
} |
|
1329
|
|
|
|
|
|
|
|
|
1330
|
3462
|
100
|
|
|
|
6516
|
if ($v == 1) { |
|
1331
|
2284
|
|
|
|
|
5082
|
$bitstring .= '0'; |
|
1332
|
|
|
|
|
|
|
} |
|
1333
|
|
|
|
|
|
|
else { |
|
1334
|
1178
|
|
|
|
|
4180
|
my $t = sprintf('%b', $v); |
|
1335
|
1178
|
|
|
|
|
4179
|
$bitstring .= join('', '1' x (length($t) - 1), '0', substr($t, 1)); |
|
1336
|
|
|
|
|
|
|
} |
|
1337
|
|
|
|
|
|
|
} |
|
1338
|
|
|
|
|
|
|
|
|
1339
|
998
|
|
|
|
|
137050
|
pack('B*', $bitstring); |
|
1340
|
|
|
|
|
|
|
} |
|
1341
|
|
|
|
|
|
|
|
|
1342
|
990
|
|
|
990
|
1
|
1671
|
sub delta_decode ($fh) { |
|
|
990
|
|
|
|
|
1640
|
|
|
|
990
|
|
|
|
|
2017
|
|
|
1343
|
|
|
|
|
|
|
|
|
1344
|
990
|
100
|
|
|
|
2677
|
if (ref($fh) eq '') { |
|
1345
|
13
|
50
|
|
|
|
196
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1346
|
13
|
|
|
|
|
48
|
return __SUB__->($fh2); |
|
1347
|
|
|
|
|
|
|
} |
|
1348
|
|
|
|
|
|
|
|
|
1349
|
977
|
|
|
|
|
2250
|
my $buffer = ''; |
|
1350
|
977
|
|
|
|
|
3112
|
my $double = read_bit($fh, \$buffer); |
|
1351
|
977
|
|
|
|
|
2399
|
my $with_rle = read_bit($fh, \$buffer); |
|
1352
|
977
|
|
|
|
|
2246
|
my $has_negative = read_bit($fh, \$buffer); |
|
1353
|
977
|
|
|
|
|
2349
|
my $with_deltas = read_bit($fh, \$buffer); |
|
1354
|
977
|
|
|
|
|
2089
|
my $with_rle4 = read_bit($fh, \$buffer); |
|
1355
|
|
|
|
|
|
|
|
|
1356
|
977
|
|
|
|
|
1869
|
my @deltas; |
|
1357
|
977
|
|
|
|
|
1725
|
my $len = 0; |
|
1358
|
|
|
|
|
|
|
|
|
1359
|
977
|
|
|
|
|
3040
|
for (my $k = 0 ; $k <= $len ; ++$k) { |
|
1360
|
|
|
|
|
|
|
|
|
1361
|
10610
|
|
|
|
|
20851
|
my $bit = read_bit($fh, \$buffer); |
|
1362
|
|
|
|
|
|
|
|
|
1363
|
10610
|
100
|
|
|
|
30717
|
if ($bit eq '0') { |
|
|
|
100
|
|
|
|
|
|
|
1364
|
3087
|
|
|
|
|
5476
|
push @deltas, 0; |
|
1365
|
|
|
|
|
|
|
} |
|
1366
|
|
|
|
|
|
|
elsif ($double) { |
|
1367
|
2573
|
100
|
|
|
|
5701
|
my $bit = $has_negative ? read_bit($fh, \$buffer) : 0; |
|
1368
|
|
|
|
|
|
|
|
|
1369
|
2573
|
100
|
|
|
|
4805
|
my $bl = $has_negative ? 0 : 1; |
|
1370
|
2573
|
|
|
|
|
4406
|
++$bl while (read_bit($fh, \$buffer) eq '1'); |
|
1371
|
|
|
|
|
|
|
|
|
1372
|
2573
|
|
|
|
|
5660
|
my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl)); |
|
|
4472
|
|
|
|
|
7137
|
|
|
1373
|
2573
|
|
|
|
|
5802
|
my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1))); |
|
|
10864
|
|
|
|
|
17218
|
|
|
1374
|
|
|
|
|
|
|
|
|
1375
|
2573
|
100
|
|
|
|
9205
|
push @deltas, ($has_negative ? ($bit eq '1' ? 1 : -1) : 1) * ($int - 1); |
|
|
|
100
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
} |
|
1377
|
|
|
|
|
|
|
else { |
|
1378
|
4950
|
100
|
|
|
|
11486
|
my $bit = $has_negative ? read_bit($fh, \$buffer) : 0; |
|
1379
|
4950
|
100
|
|
|
|
10430
|
my $n = $has_negative ? 0 : 1; |
|
1380
|
4950
|
|
|
|
|
9464
|
++$n while (read_bit($fh, \$buffer) eq '1'); |
|
1381
|
4950
|
|
|
|
|
11963
|
my $d = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n)); |
|
|
7989
|
|
|
|
|
13992
|
|
|
1382
|
4950
|
100
|
|
|
|
17838
|
push @deltas, $has_negative ? ($bit eq '1' ? $d : -$d) : ($d - 1); |
|
|
|
100
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
} |
|
1384
|
|
|
|
|
|
|
|
|
1385
|
10610
|
100
|
|
|
|
22385
|
if ($with_rle) { |
|
1386
|
|
|
|
|
|
|
|
|
1387
|
3113
|
|
|
|
|
4713
|
my $bl = 0; |
|
1388
|
3113
|
|
|
|
|
6048
|
while (read_bit($fh, \$buffer) == 1) { |
|
1389
|
2988
|
|
|
|
|
5900
|
++$bl; |
|
1390
|
|
|
|
|
|
|
} |
|
1391
|
|
|
|
|
|
|
|
|
1392
|
3113
|
100
|
|
|
|
7779
|
if ($bl > 0) { |
|
1393
|
1094
|
|
|
|
|
2462
|
my $run = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl)) - 1; |
|
|
2988
|
|
|
|
|
5191
|
|
|
1394
|
1094
|
|
|
|
|
2296
|
$k += $run; |
|
1395
|
1094
|
|
|
|
|
11034
|
push @deltas, ($deltas[-1]) x $run; |
|
1396
|
|
|
|
|
|
|
} |
|
1397
|
|
|
|
|
|
|
} |
|
1398
|
|
|
|
|
|
|
|
|
1399
|
10610
|
100
|
|
|
|
31869
|
if ($k == 0) { |
|
1400
|
977
|
|
|
|
|
3557
|
$len = pop(@deltas); |
|
1401
|
|
|
|
|
|
|
} |
|
1402
|
|
|
|
|
|
|
} |
|
1403
|
|
|
|
|
|
|
|
|
1404
|
977
|
|
|
|
|
1787
|
my $decoded = \@deltas; |
|
1405
|
977
|
100
|
|
|
|
2438
|
$decoded = rle4_decode($decoded) if $with_rle4; |
|
1406
|
977
|
100
|
|
|
|
3224
|
$decoded = accumulate($decoded) if $with_deltas; |
|
1407
|
977
|
|
|
|
|
5610
|
return $decoded; |
|
1408
|
|
|
|
|
|
|
} |
|
1409
|
|
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
################################ |
|
1411
|
|
|
|
|
|
|
# Alphabet encoding (from Bzip2) |
|
1412
|
|
|
|
|
|
|
################################ |
|
1413
|
|
|
|
|
|
|
|
|
1414
|
188
|
|
|
188
|
1
|
343
|
sub encode_alphabet_256 ($alphabet) { |
|
|
188
|
|
|
|
|
353
|
|
|
|
188
|
|
|
|
|
370
|
|
|
1415
|
|
|
|
|
|
|
|
|
1416
|
188
|
|
|
|
|
383
|
my %table; |
|
1417
|
188
|
|
|
|
|
1926
|
@table{@$alphabet} = (); |
|
1418
|
|
|
|
|
|
|
|
|
1419
|
188
|
|
|
|
|
461
|
my $populated = 0; |
|
1420
|
188
|
|
|
|
|
499
|
my @marked; |
|
1421
|
|
|
|
|
|
|
|
|
1422
|
188
|
|
|
|
|
668
|
for (my $i = 0 ; $i <= 255 ; $i += 16) { |
|
1423
|
|
|
|
|
|
|
|
|
1424
|
3008
|
|
|
|
|
4508
|
my $enc = 0; |
|
1425
|
3008
|
|
|
|
|
5380
|
foreach my $j (0 .. 15) { |
|
1426
|
48128
|
100
|
|
|
|
107086
|
if (exists($table{$i + $j})) { |
|
1427
|
1525
|
|
|
|
|
2981
|
$enc |= 1 << $j; |
|
1428
|
|
|
|
|
|
|
} |
|
1429
|
|
|
|
|
|
|
} |
|
1430
|
|
|
|
|
|
|
|
|
1431
|
3008
|
|
|
|
|
4828
|
$populated <<= 1; |
|
1432
|
|
|
|
|
|
|
|
|
1433
|
3008
|
100
|
|
|
|
8368
|
if ($enc > 0) { |
|
1434
|
419
|
|
|
|
|
709
|
$populated |= 1; |
|
1435
|
419
|
|
|
|
|
1257
|
push @marked, $enc; |
|
1436
|
|
|
|
|
|
|
} |
|
1437
|
|
|
|
|
|
|
} |
|
1438
|
|
|
|
|
|
|
|
|
1439
|
188
|
|
|
|
|
541
|
my $bitstring = join('', map { int2bits_lsb($_, 16) } @marked); |
|
|
419
|
|
|
|
|
1223
|
|
|
1440
|
|
|
|
|
|
|
|
|
1441
|
188
|
50
|
|
|
|
1831
|
$VERBOSE && say STDERR "Populated : ", sprintf('%016b', $populated); |
|
1442
|
188
|
50
|
|
|
|
534
|
$VERBOSE && say STDERR "Marked : @marked"; |
|
1443
|
188
|
50
|
|
|
|
537
|
$VERBOSE && say STDERR "Bits len : ", length($bitstring); |
|
1444
|
|
|
|
|
|
|
|
|
1445
|
188
|
|
|
|
|
379
|
my $encoded = ''; |
|
1446
|
188
|
|
|
|
|
573
|
$encoded .= int2bytes($populated, 2); |
|
1447
|
188
|
|
|
|
|
829
|
$encoded .= pack('B*', $bitstring); |
|
1448
|
188
|
|
|
|
|
1283
|
return $encoded; |
|
1449
|
|
|
|
|
|
|
} |
|
1450
|
|
|
|
|
|
|
|
|
1451
|
21
|
|
|
21
|
1
|
41
|
sub decode_alphabet_256 ($fh) { |
|
|
21
|
|
|
|
|
42
|
|
|
|
21
|
|
|
|
|
41
|
|
|
1452
|
|
|
|
|
|
|
|
|
1453
|
21
|
50
|
|
|
|
106
|
if (ref($fh) eq '') { |
|
1454
|
0
|
0
|
|
|
|
0
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1455
|
0
|
|
|
|
|
0
|
return __SUB__->($fh2); |
|
1456
|
|
|
|
|
|
|
} |
|
1457
|
|
|
|
|
|
|
|
|
1458
|
21
|
|
|
|
|
44
|
my @alphabet; |
|
1459
|
21
|
|
|
|
|
75
|
my $l1 = bytes2int($fh, 2); |
|
1460
|
|
|
|
|
|
|
|
|
1461
|
21
|
|
|
|
|
77
|
for my $i (0 .. 15) { |
|
1462
|
336
|
100
|
|
|
|
1069
|
if ($l1 & (0x8000 >> $i)) { |
|
1463
|
95
|
|
|
|
|
218
|
my $l2 = bytes2int($fh, 2); |
|
1464
|
95
|
|
|
|
|
249
|
for my $j (0 .. 15) { |
|
1465
|
1520
|
100
|
|
|
|
4180
|
if ($l2 & (0x8000 >> $j)) { |
|
1466
|
671
|
|
|
|
|
1664
|
push @alphabet, 16 * $i + $j; |
|
1467
|
|
|
|
|
|
|
} |
|
1468
|
|
|
|
|
|
|
} |
|
1469
|
|
|
|
|
|
|
} |
|
1470
|
|
|
|
|
|
|
} |
|
1471
|
|
|
|
|
|
|
|
|
1472
|
21
|
|
|
|
|
111
|
return \@alphabet; |
|
1473
|
|
|
|
|
|
|
} |
|
1474
|
|
|
|
|
|
|
|
|
1475
|
190
|
|
|
190
|
1
|
351
|
sub encode_alphabet ($alphabet) { |
|
|
190
|
|
|
|
|
380
|
|
|
|
190
|
|
|
|
|
333
|
|
|
1476
|
|
|
|
|
|
|
|
|
1477
|
190
|
|
100
|
|
|
895
|
my $max_symbol = $alphabet->[-1] // -1; |
|
1478
|
|
|
|
|
|
|
|
|
1479
|
190
|
100
|
|
|
|
677
|
if ($max_symbol <= 255) { |
|
1480
|
|
|
|
|
|
|
|
|
1481
|
176
|
|
|
|
|
565
|
my $delta = delta_encode($alphabet); |
|
1482
|
176
|
|
|
|
|
855
|
my $enc = encode_alphabet_256($alphabet); |
|
1483
|
|
|
|
|
|
|
|
|
1484
|
176
|
100
|
|
|
|
925
|
if (length($delta) < length($enc)) { |
|
1485
|
155
|
|
|
|
|
956
|
return (chr(0) . $delta); |
|
1486
|
|
|
|
|
|
|
} |
|
1487
|
|
|
|
|
|
|
|
|
1488
|
21
|
|
|
|
|
175
|
return (chr(1) . $enc); |
|
1489
|
|
|
|
|
|
|
} |
|
1490
|
|
|
|
|
|
|
|
|
1491
|
14
|
|
|
|
|
62
|
return (chr(0) . delta_encode($alphabet)); |
|
1492
|
|
|
|
|
|
|
} |
|
1493
|
|
|
|
|
|
|
|
|
1494
|
190
|
|
|
190
|
1
|
342
|
sub decode_alphabet ($fh) { |
|
|
190
|
|
|
|
|
312
|
|
|
|
190
|
|
|
|
|
489
|
|
|
1495
|
|
|
|
|
|
|
|
|
1496
|
190
|
50
|
|
|
|
616
|
if (ref($fh) eq '') { |
|
1497
|
0
|
0
|
|
|
|
0
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1498
|
0
|
|
|
|
|
0
|
return __SUB__->($fh2); |
|
1499
|
|
|
|
|
|
|
} |
|
1500
|
|
|
|
|
|
|
|
|
1501
|
190
|
100
|
33
|
|
|
1523
|
if (ord(getc($fh) // confess "error") == 1) { |
|
1502
|
21
|
|
|
|
|
96
|
return decode_alphabet_256($fh); |
|
1503
|
|
|
|
|
|
|
} |
|
1504
|
|
|
|
|
|
|
|
|
1505
|
169
|
|
|
|
|
672
|
return delta_decode($fh); |
|
1506
|
|
|
|
|
|
|
} |
|
1507
|
|
|
|
|
|
|
|
|
1508
|
|
|
|
|
|
|
########################## |
|
1509
|
|
|
|
|
|
|
# Move to front transform |
|
1510
|
|
|
|
|
|
|
########################## |
|
1511
|
|
|
|
|
|
|
|
|
1512
|
230
|
|
|
230
|
1
|
259384
|
sub mtf_encode ($symbols, $alphabet = undef) { |
|
|
230
|
|
|
|
|
514
|
|
|
|
230
|
|
|
|
|
524
|
|
|
|
230
|
|
|
|
|
402
|
|
|
1513
|
|
|
|
|
|
|
|
|
1514
|
230
|
100
|
|
|
|
941
|
if (ref($symbols) eq '') { |
|
1515
|
12
|
|
|
|
|
41
|
$symbols = string2symbols($symbols); |
|
1516
|
|
|
|
|
|
|
} |
|
1517
|
|
|
|
|
|
|
|
|
1518
|
230
|
50
|
66
|
|
|
1245
|
if (defined($alphabet) and ref($alphabet) eq '') { |
|
1519
|
0
|
|
|
|
|
0
|
$alphabet = string2symbols($alphabet); |
|
1520
|
|
|
|
|
|
|
} |
|
1521
|
|
|
|
|
|
|
|
|
1522
|
230
|
|
|
|
|
976
|
my (@C, @table); |
|
1523
|
|
|
|
|
|
|
|
|
1524
|
230
|
|
|
|
|
0
|
my @alphabet; |
|
1525
|
230
|
|
|
|
|
0
|
my @alphabet_copy; |
|
1526
|
230
|
|
|
|
|
476
|
my $return_alphabet = 0; |
|
1527
|
|
|
|
|
|
|
|
|
1528
|
230
|
100
|
|
|
|
860
|
if (defined($alphabet)) { |
|
1529
|
1
|
|
|
|
|
6
|
@alphabet = @$alphabet; |
|
1530
|
|
|
|
|
|
|
} |
|
1531
|
|
|
|
|
|
|
else { |
|
1532
|
229
|
|
|
|
|
59310
|
@alphabet = sort { $a <=> $b } uniq(@$symbols); |
|
|
8906
|
|
|
|
|
15966
|
|
|
1533
|
229
|
|
|
|
|
14490
|
$return_alphabet = 1; |
|
1534
|
229
|
|
|
|
|
1380
|
@alphabet_copy = @alphabet; |
|
1535
|
|
|
|
|
|
|
} |
|
1536
|
|
|
|
|
|
|
|
|
1537
|
230
|
|
|
|
|
500
|
my $index; |
|
1538
|
230
|
|
|
|
|
810
|
my @indices = (0 .. $#alphabet); |
|
1539
|
|
|
|
|
|
|
|
|
1540
|
230
|
|
|
|
|
704
|
foreach my $c (@$symbols) { |
|
1541
|
|
|
|
|
|
|
|
|
1542
|
113268
|
|
|
|
|
231729
|
foreach my $i (@indices) { |
|
1543
|
790651
|
100
|
|
|
|
2271174
|
if ($alphabet[$i] == $c) { |
|
1544
|
113268
|
|
|
|
|
189640
|
$index = $i; |
|
1545
|
113268
|
|
|
|
|
207304
|
last; |
|
1546
|
|
|
|
|
|
|
} |
|
1547
|
|
|
|
|
|
|
} |
|
1548
|
|
|
|
|
|
|
|
|
1549
|
113268
|
|
|
|
|
210905
|
push @C, $index; |
|
1550
|
113268
|
|
|
|
|
347493
|
unshift(@alphabet, splice(@alphabet, $index, 1)); |
|
1551
|
|
|
|
|
|
|
} |
|
1552
|
|
|
|
|
|
|
|
|
1553
|
230
|
100
|
|
|
|
3090
|
$return_alphabet || return \@C; |
|
1554
|
229
|
|
|
|
|
15526
|
return (\@C, \@alphabet_copy); |
|
1555
|
|
|
|
|
|
|
} |
|
1556
|
|
|
|
|
|
|
|
|
1557
|
279
|
|
|
279
|
1
|
666
|
sub mtf_decode ($encoded, $alphabet) { |
|
|
279
|
|
|
|
|
552
|
|
|
|
279
|
|
|
|
|
689
|
|
|
|
279
|
|
|
|
|
616
|
|
|
1558
|
|
|
|
|
|
|
|
|
1559
|
279
|
50
|
|
|
|
1226
|
if (ref($encoded) eq '') { |
|
1560
|
0
|
|
|
|
|
0
|
$encoded = string2symbols($encoded); |
|
1561
|
|
|
|
|
|
|
} |
|
1562
|
|
|
|
|
|
|
|
|
1563
|
279
|
50
|
|
|
|
1045
|
if (ref($alphabet) eq '') { |
|
1564
|
0
|
|
|
|
|
0
|
$alphabet = string2symbols($alphabet); |
|
1565
|
|
|
|
|
|
|
} |
|
1566
|
|
|
|
|
|
|
|
|
1567
|
279
|
|
|
|
|
549
|
my @S; |
|
1568
|
279
|
|
|
|
|
1040
|
my @alpha = @$alphabet; |
|
1569
|
|
|
|
|
|
|
|
|
1570
|
279
|
|
|
|
|
937
|
foreach my $p (@$encoded) { |
|
1571
|
114895
|
|
|
|
|
236212
|
push @S, $alpha[$p]; |
|
1572
|
114895
|
|
|
|
|
276087
|
unshift(@alpha, splice(@alpha, $p, 1)); |
|
1573
|
|
|
|
|
|
|
} |
|
1574
|
|
|
|
|
|
|
|
|
1575
|
279
|
|
|
|
|
1355
|
return \@S; |
|
1576
|
|
|
|
|
|
|
} |
|
1577
|
|
|
|
|
|
|
|
|
1578
|
|
|
|
|
|
|
########################### |
|
1579
|
|
|
|
|
|
|
# Zero Run-length encoding |
|
1580
|
|
|
|
|
|
|
########################### |
|
1581
|
|
|
|
|
|
|
|
|
1582
|
215
|
|
|
215
|
1
|
430
|
sub zrle_encode ($symbols) { # RLE2 |
|
|
215
|
|
|
|
|
434
|
|
|
|
215
|
|
|
|
|
550
|
|
|
1583
|
|
|
|
|
|
|
|
|
1584
|
215
|
50
|
|
|
|
966
|
if (ref($symbols) eq '') { |
|
1585
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
1586
|
|
|
|
|
|
|
} |
|
1587
|
|
|
|
|
|
|
|
|
1588
|
215
|
|
|
|
|
482
|
my @rle; |
|
1589
|
215
|
|
|
|
|
427
|
my $end = $#{$symbols}; |
|
|
215
|
|
|
|
|
702
|
|
|
1590
|
|
|
|
|
|
|
|
|
1591
|
215
|
|
|
|
|
788
|
for (my $i = 0 ; $i <= $end ; ++$i) { |
|
1592
|
|
|
|
|
|
|
|
|
1593
|
44329
|
|
|
|
|
76328
|
my $run = 0; |
|
1594
|
44329
|
|
100
|
|
|
176308
|
while ($i <= $end and $symbols->[$i] == 0) { |
|
1595
|
67965
|
|
|
|
|
109647
|
++$run; |
|
1596
|
67965
|
|
|
|
|
262216
|
++$i; |
|
1597
|
|
|
|
|
|
|
} |
|
1598
|
|
|
|
|
|
|
|
|
1599
|
44329
|
100
|
|
|
|
102129
|
if ($run >= 1) { |
|
1600
|
8748
|
|
|
|
|
24291
|
my $t = sprintf('%b', $run + 1); |
|
1601
|
8748
|
|
|
|
|
38513
|
push @rle, split(//, substr($t, 1)); |
|
1602
|
|
|
|
|
|
|
} |
|
1603
|
|
|
|
|
|
|
|
|
1604
|
44329
|
100
|
|
|
|
104876
|
if ($i <= $end) { |
|
1605
|
44225
|
|
|
|
|
130764
|
push @rle, $symbols->[$i] + 1; |
|
1606
|
|
|
|
|
|
|
} |
|
1607
|
|
|
|
|
|
|
} |
|
1608
|
|
|
|
|
|
|
|
|
1609
|
215
|
|
|
|
|
3699
|
return \@rle; |
|
1610
|
|
|
|
|
|
|
} |
|
1611
|
|
|
|
|
|
|
|
|
1612
|
227
|
|
|
227
|
1
|
540
|
sub zrle_decode ($rle) { # RLE2 |
|
|
227
|
|
|
|
|
399
|
|
|
|
227
|
|
|
|
|
424
|
|
|
1613
|
|
|
|
|
|
|
|
|
1614
|
227
|
50
|
|
|
|
889
|
if (ref($rle) eq '') { |
|
1615
|
0
|
|
|
|
|
0
|
$rle = string2symbols($rle); |
|
1616
|
|
|
|
|
|
|
} |
|
1617
|
|
|
|
|
|
|
|
|
1618
|
227
|
|
|
|
|
469
|
my @dec; |
|
1619
|
227
|
|
|
|
|
444
|
my $end = $#{$rle}; |
|
|
227
|
|
|
|
|
606
|
|
|
1620
|
|
|
|
|
|
|
|
|
1621
|
227
|
|
|
|
|
912
|
for (my $i = 0 ; $i <= $end ; ++$i) { |
|
1622
|
44512
|
|
|
|
|
93170
|
my $k = $rle->[$i]; |
|
1623
|
|
|
|
|
|
|
|
|
1624
|
44512
|
100
|
100
|
|
|
180759
|
if ($k == 0 or $k == 1) { |
|
1625
|
8791
|
|
|
|
|
14081
|
my $run = 1; |
|
1626
|
8791
|
|
100
|
|
|
34575
|
while (($i <= $end) and ($k == 0 or $k == 1)) { |
|
|
|
|
100
|
|
|
|
|
|
1627
|
15795
|
|
|
|
|
30833
|
($run <<= 1) |= $k; |
|
1628
|
15795
|
|
|
|
|
91954
|
$k = $rle->[++$i]; |
|
1629
|
|
|
|
|
|
|
} |
|
1630
|
8791
|
|
|
|
|
29142
|
push @dec, (0) x ($run - 1); |
|
1631
|
|
|
|
|
|
|
} |
|
1632
|
|
|
|
|
|
|
|
|
1633
|
44512
|
100
|
|
|
|
110083
|
if ($i <= $end) { |
|
1634
|
44405
|
|
|
|
|
149991
|
push @dec, $k - 1; |
|
1635
|
|
|
|
|
|
|
} |
|
1636
|
|
|
|
|
|
|
} |
|
1637
|
|
|
|
|
|
|
|
|
1638
|
227
|
|
|
|
|
3831
|
return \@dec; |
|
1639
|
|
|
|
|
|
|
} |
|
1640
|
|
|
|
|
|
|
|
|
1641
|
|
|
|
|
|
|
################################################################ |
|
1642
|
|
|
|
|
|
|
# Move-to-front compression (MTF + RLE4 + ZRLE + Huffman coding) |
|
1643
|
|
|
|
|
|
|
################################################################ |
|
1644
|
|
|
|
|
|
|
|
|
1645
|
116
|
|
|
116
|
1
|
451824
|
sub mrl_compress_symbolic ($symbols, $entropy_sub = \&create_huffman_entry) { |
|
|
116
|
|
|
|
|
269
|
|
|
|
116
|
|
|
|
|
322
|
|
|
|
116
|
|
|
|
|
252
|
|
|
1646
|
|
|
|
|
|
|
|
|
1647
|
116
|
100
|
|
|
|
530
|
if (ref($symbols) eq '') { |
|
1648
|
12
|
|
|
|
|
65
|
$symbols = string2symbols($symbols); |
|
1649
|
|
|
|
|
|
|
} |
|
1650
|
|
|
|
|
|
|
|
|
1651
|
116
|
|
|
|
|
468
|
my ($mtf, $alphabet) = mtf_encode($symbols); |
|
1652
|
116
|
|
|
|
|
407
|
my $rle = zrle_encode($mtf); |
|
1653
|
116
|
|
|
|
|
439
|
my $rle4 = rle4_encode($rle, scalar(@$rle)); |
|
1654
|
|
|
|
|
|
|
|
|
1655
|
116
|
|
|
|
|
437
|
encode_alphabet($alphabet) . $entropy_sub->($rle4); |
|
1656
|
|
|
|
|
|
|
} |
|
1657
|
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
*mrl_compress = \&mrl_compress_symbolic; |
|
1659
|
|
|
|
|
|
|
|
|
1660
|
225
|
|
|
225
|
1
|
649
|
sub mrl_decompress_symbolic ($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
225
|
|
|
|
|
387
|
|
|
|
225
|
|
|
|
|
548
|
|
|
|
225
|
|
|
|
|
328
|
|
|
1661
|
|
|
|
|
|
|
|
|
1662
|
225
|
100
|
|
|
|
690
|
if (ref($fh) eq '') { |
|
1663
|
109
|
50
|
|
|
|
1672
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1664
|
109
|
|
|
|
|
322
|
return __SUB__->($fh2, $entropy_sub); |
|
1665
|
|
|
|
|
|
|
} |
|
1666
|
|
|
|
|
|
|
|
|
1667
|
116
|
|
|
|
|
340
|
my $alphabet = decode_alphabet($fh); |
|
1668
|
|
|
|
|
|
|
|
|
1669
|
116
|
50
|
|
|
|
390
|
$VERBOSE && say STDERR "Alphabet size: ", scalar(@$alphabet); |
|
1670
|
|
|
|
|
|
|
|
|
1671
|
116
|
|
|
|
|
557
|
my $rle4 = $entropy_sub->($fh); |
|
1672
|
116
|
|
|
|
|
493
|
my $rle = rle4_decode($rle4); |
|
1673
|
116
|
|
|
|
|
448
|
my $mtf = zrle_decode($rle); |
|
1674
|
116
|
|
|
|
|
392
|
my $symbols = mtf_decode($mtf, $alphabet); |
|
1675
|
|
|
|
|
|
|
|
|
1676
|
116
|
|
|
|
|
17150
|
return $symbols; |
|
1677
|
|
|
|
|
|
|
} |
|
1678
|
|
|
|
|
|
|
|
|
1679
|
1
|
|
|
1
|
1
|
8
|
sub mrl_decompress($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
1
|
|
|
|
|
8
|
|
|
|
1
|
|
|
|
|
8
|
|
|
|
1
|
|
|
|
|
3
|
|
|
1680
|
1
|
|
|
|
|
22
|
symbols2string(mrl_decompress_symbolic($fh, $entropy_sub)); |
|
1681
|
|
|
|
|
|
|
} |
|
1682
|
|
|
|
|
|
|
|
|
1683
|
|
|
|
|
|
|
############################################################ |
|
1684
|
|
|
|
|
|
|
# BWT-based compression (BWT + MTF + ZRLE + Huffman coding) |
|
1685
|
|
|
|
|
|
|
############################################################ |
|
1686
|
|
|
|
|
|
|
|
|
1687
|
19
|
|
|
19
|
1
|
608255
|
sub bwt_compress ($chunk, $entropy_sub = \&create_huffman_entry) { |
|
|
19
|
|
|
|
|
73
|
|
|
|
19
|
|
|
|
|
76
|
|
|
|
19
|
|
|
|
|
43
|
|
|
1688
|
|
|
|
|
|
|
|
|
1689
|
19
|
50
|
|
|
|
111
|
if (ref($chunk) ne '') { |
|
1690
|
0
|
|
|
|
|
0
|
return bwt_compress_symbolic($chunk, $entropy_sub); |
|
1691
|
|
|
|
|
|
|
} |
|
1692
|
|
|
|
|
|
|
|
|
1693
|
19
|
|
|
|
|
97
|
my $rle1 = rle4_encode(string2symbols($chunk)); |
|
1694
|
19
|
|
|
|
|
11606
|
my ($bwt, $idx) = bwt_encode(pack('C*', @$rle1)); |
|
1695
|
|
|
|
|
|
|
|
|
1696
|
19
|
50
|
|
|
|
122
|
$VERBOSE && say STDERR "BWT index = $idx"; |
|
1697
|
|
|
|
|
|
|
|
|
1698
|
19
|
|
|
|
|
97
|
my ($mtf, $alphabet) = mtf_encode(string2symbols($bwt)); |
|
1699
|
19
|
|
|
|
|
9945
|
my $rle = zrle_encode($mtf); |
|
1700
|
|
|
|
|
|
|
|
|
1701
|
19
|
|
|
|
|
157
|
pack('N', $idx) . encode_alphabet($alphabet) . $entropy_sub->($rle); |
|
1702
|
|
|
|
|
|
|
} |
|
1703
|
|
|
|
|
|
|
|
|
1704
|
37
|
|
|
37
|
1
|
135
|
sub bwt_decompress ($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
37
|
|
|
|
|
95
|
|
|
|
37
|
|
|
|
|
92
|
|
|
|
37
|
|
|
|
|
77
|
|
|
1705
|
|
|
|
|
|
|
|
|
1706
|
37
|
100
|
|
|
|
193
|
if (ref($fh) eq '') { |
|
1707
|
18
|
50
|
|
|
|
397
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1708
|
18
|
|
|
|
|
97
|
return __SUB__->($fh2, $entropy_sub); |
|
1709
|
|
|
|
|
|
|
} |
|
1710
|
|
|
|
|
|
|
|
|
1711
|
19
|
|
|
|
|
131
|
my $idx = bytes2int($fh, 4); |
|
1712
|
19
|
|
|
|
|
142
|
my $alphabet = decode_alphabet($fh); |
|
1713
|
|
|
|
|
|
|
|
|
1714
|
19
|
50
|
|
|
|
86
|
$VERBOSE && say STDERR "BWT index = $idx"; |
|
1715
|
19
|
50
|
|
|
|
89
|
$VERBOSE && say STDERR "Alphabet size: ", scalar(@$alphabet); |
|
1716
|
|
|
|
|
|
|
|
|
1717
|
19
|
|
|
|
|
123
|
my $rle = $entropy_sub->($fh); |
|
1718
|
19
|
|
|
|
|
131
|
my $mtf = zrle_decode($rle); |
|
1719
|
19
|
|
|
|
|
115
|
my $bwt = mtf_decode($mtf, $alphabet); |
|
1720
|
19
|
|
|
|
|
1604
|
my $rle4 = bwt_decode(pack('C*', @$bwt), $idx); |
|
1721
|
19
|
|
|
|
|
139
|
my $data = rle4_decode(string2symbols($rle4)); |
|
1722
|
|
|
|
|
|
|
|
|
1723
|
19
|
|
|
|
|
22100
|
pack('C*', @$data); |
|
1724
|
|
|
|
|
|
|
} |
|
1725
|
|
|
|
|
|
|
|
|
1726
|
|
|
|
|
|
|
########################################### |
|
1727
|
|
|
|
|
|
|
# BWT-based compression (symbolic variant) |
|
1728
|
|
|
|
|
|
|
########################################### |
|
1729
|
|
|
|
|
|
|
|
|
1730
|
55
|
|
|
55
|
1
|
704922
|
sub bwt_compress_symbolic ($symbols, $entropy_sub = \&create_huffman_entry) { |
|
|
55
|
|
|
|
|
123
|
|
|
|
55
|
|
|
|
|
152
|
|
|
|
55
|
|
|
|
|
103
|
|
|
1731
|
|
|
|
|
|
|
|
|
1732
|
55
|
100
|
|
|
|
273
|
if (ref($symbols) eq '') { |
|
1733
|
1
|
|
|
|
|
5
|
$symbols = string2symbols($symbols); |
|
1734
|
|
|
|
|
|
|
} |
|
1735
|
|
|
|
|
|
|
|
|
1736
|
55
|
|
|
|
|
228
|
my $rle4 = rle4_encode($symbols); |
|
1737
|
55
|
|
|
|
|
249
|
my ($bwt, $idx) = bwt_encode_symbolic($rle4); |
|
1738
|
|
|
|
|
|
|
|
|
1739
|
55
|
|
|
|
|
239
|
my ($mtf, $alphabet) = mtf_encode($bwt); |
|
1740
|
55
|
|
|
|
|
197
|
my $rle = zrle_encode($mtf); |
|
1741
|
|
|
|
|
|
|
|
|
1742
|
55
|
50
|
|
|
|
193
|
$VERBOSE && say STDERR "BWT index = $idx"; |
|
1743
|
55
|
50
|
0
|
|
|
172
|
$VERBOSE && say STDERR "Max symbol: ", max(@$alphabet) // 0; |
|
1744
|
|
|
|
|
|
|
|
|
1745
|
55
|
|
|
|
|
322
|
pack('N', $idx) . encode_alphabet($alphabet) . $entropy_sub->($rle); |
|
1746
|
|
|
|
|
|
|
} |
|
1747
|
|
|
|
|
|
|
|
|
1748
|
104
|
|
|
104
|
1
|
558
|
sub bwt_decompress_symbolic ($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
104
|
|
|
|
|
233
|
|
|
|
104
|
|
|
|
|
209
|
|
|
|
104
|
|
|
|
|
285
|
|
|
1749
|
|
|
|
|
|
|
|
|
1750
|
104
|
100
|
|
|
|
323
|
if (ref($fh) eq '') { |
|
1751
|
49
|
50
|
|
|
|
909
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1752
|
49
|
|
|
|
|
209
|
return __SUB__->($fh2, $entropy_sub); |
|
1753
|
|
|
|
|
|
|
} |
|
1754
|
|
|
|
|
|
|
|
|
1755
|
55
|
|
|
|
|
215
|
my $idx = bytes2int($fh, 4); |
|
1756
|
55
|
|
|
|
|
231
|
my $alphabet = decode_alphabet($fh); |
|
1757
|
|
|
|
|
|
|
|
|
1758
|
55
|
50
|
|
|
|
195
|
$VERBOSE && say STDERR "BWT index = $idx"; |
|
1759
|
55
|
50
|
|
|
|
162
|
$VERBOSE && say STDERR "Alphabet size: ", scalar(@$alphabet); |
|
1760
|
|
|
|
|
|
|
|
|
1761
|
55
|
|
|
|
|
191
|
my $rle = $entropy_sub->($fh); |
|
1762
|
55
|
|
|
|
|
424
|
my $mtf = zrle_decode($rle); |
|
1763
|
55
|
|
|
|
|
200
|
my $bwt = mtf_decode($mtf, $alphabet); |
|
1764
|
55
|
|
|
|
|
202
|
my $rle4 = bwt_decode_symbolic($bwt, $idx); |
|
1765
|
55
|
|
|
|
|
216
|
my $data = rle4_decode($rle4); |
|
1766
|
|
|
|
|
|
|
|
|
1767
|
55
|
|
|
|
|
2040
|
return $data; |
|
1768
|
|
|
|
|
|
|
} |
|
1769
|
|
|
|
|
|
|
|
|
1770
|
|
|
|
|
|
|
########################### |
|
1771
|
|
|
|
|
|
|
# Arithmetic Coding entries |
|
1772
|
|
|
|
|
|
|
########################### |
|
1773
|
|
|
|
|
|
|
|
|
1774
|
66
|
|
|
66
|
1
|
178
|
sub create_ac_entry ($symbols) { |
|
|
66
|
|
|
|
|
121
|
|
|
|
66
|
|
|
|
|
147
|
|
|
1775
|
|
|
|
|
|
|
|
|
1776
|
66
|
50
|
|
|
|
272
|
if (ref($symbols) eq '') { |
|
1777
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
1778
|
|
|
|
|
|
|
} |
|
1779
|
|
|
|
|
|
|
|
|
1780
|
66
|
|
|
|
|
326
|
my ($enc, $freq) = ac_encode($symbols); |
|
1781
|
66
|
|
50
|
|
|
1172
|
my $max_symbol = max(keys %$freq) // 0; |
|
1782
|
|
|
|
|
|
|
|
|
1783
|
66
|
|
|
|
|
190
|
my @freqs; |
|
1784
|
66
|
|
|
|
|
259
|
foreach my $k (0 .. $max_symbol) { |
|
1785
|
42155
|
|
100
|
|
|
170706
|
push @freqs, $freq->{$k} // 0; |
|
1786
|
|
|
|
|
|
|
} |
|
1787
|
|
|
|
|
|
|
|
|
1788
|
66
|
|
|
|
|
201
|
push @freqs, length($enc) >> 3; |
|
1789
|
|
|
|
|
|
|
|
|
1790
|
66
|
|
|
|
|
291
|
delta_encode(\@freqs) . pack("B*", $enc); |
|
1791
|
|
|
|
|
|
|
} |
|
1792
|
|
|
|
|
|
|
|
|
1793
|
67
|
|
|
67
|
1
|
146
|
sub decode_ac_entry ($fh) { |
|
|
67
|
|
|
|
|
155
|
|
|
|
67
|
|
|
|
|
122
|
|
|
1794
|
|
|
|
|
|
|
|
|
1795
|
67
|
100
|
|
|
|
250
|
if (ref($fh) eq '') { |
|
1796
|
1
|
50
|
|
|
|
33
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1797
|
1
|
|
|
|
|
11
|
return __SUB__->($fh2); |
|
1798
|
|
|
|
|
|
|
} |
|
1799
|
|
|
|
|
|
|
|
|
1800
|
66
|
|
|
|
|
120
|
my @freqs = @{delta_decode($fh)}; |
|
|
66
|
|
|
|
|
260
|
|
|
1801
|
66
|
|
|
|
|
685
|
my $bits_len = pop(@freqs); |
|
1802
|
|
|
|
|
|
|
|
|
1803
|
66
|
|
|
|
|
134
|
my %freq; |
|
1804
|
66
|
|
|
|
|
264
|
foreach my $i (0 .. $#freqs) { |
|
1805
|
42155
|
100
|
|
|
|
59748
|
if ($freqs[$i]) { |
|
1806
|
745
|
|
|
|
|
1964
|
$freq{$i} = $freqs[$i]; |
|
1807
|
|
|
|
|
|
|
} |
|
1808
|
|
|
|
|
|
|
} |
|
1809
|
|
|
|
|
|
|
|
|
1810
|
66
|
50
|
|
|
|
220
|
$VERBOSE && say STDERR "Encoded length: $bits_len"; |
|
1811
|
66
|
|
|
|
|
293
|
my $bits = read_bits($fh, $bits_len << 3); |
|
1812
|
|
|
|
|
|
|
|
|
1813
|
66
|
50
|
|
|
|
240
|
if ($bits_len > 0) { |
|
1814
|
66
|
|
|
|
|
927
|
open my $bits_fh, '<:raw', \$bits; |
|
1815
|
66
|
|
|
|
|
337
|
return ac_decode($bits_fh, \%freq); |
|
1816
|
|
|
|
|
|
|
} |
|
1817
|
|
|
|
|
|
|
|
|
1818
|
0
|
|
|
|
|
0
|
return []; |
|
1819
|
|
|
|
|
|
|
} |
|
1820
|
|
|
|
|
|
|
|
|
1821
|
|
|
|
|
|
|
#################################### |
|
1822
|
|
|
|
|
|
|
# Adaptive Arithmetic Coding entries |
|
1823
|
|
|
|
|
|
|
#################################### |
|
1824
|
|
|
|
|
|
|
|
|
1825
|
154
|
|
|
154
|
1
|
333
|
sub create_adaptive_ac_entry ($symbols) { |
|
|
154
|
|
|
|
|
258
|
|
|
|
154
|
|
|
|
|
302
|
|
|
1826
|
|
|
|
|
|
|
|
|
1827
|
154
|
50
|
|
|
|
504
|
if (ref($symbols) eq '') { |
|
1828
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
1829
|
|
|
|
|
|
|
} |
|
1830
|
|
|
|
|
|
|
|
|
1831
|
154
|
|
|
|
|
565
|
my ($enc, $alphabet) = adaptive_ac_encode($symbols); |
|
1832
|
154
|
|
|
|
|
995
|
delta_encode([@$alphabet, length($enc) >> 3]) . pack('B*', $enc); |
|
1833
|
|
|
|
|
|
|
} |
|
1834
|
|
|
|
|
|
|
|
|
1835
|
155
|
|
|
155
|
1
|
287
|
sub decode_adaptive_ac_entry ($fh) { |
|
|
155
|
|
|
|
|
344
|
|
|
|
155
|
|
|
|
|
239
|
|
|
1836
|
|
|
|
|
|
|
|
|
1837
|
155
|
100
|
|
|
|
490
|
if (ref($fh) eq '') { |
|
1838
|
1
|
50
|
|
|
|
32
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
1839
|
1
|
|
|
|
|
9
|
return __SUB__->($fh2); |
|
1840
|
|
|
|
|
|
|
} |
|
1841
|
|
|
|
|
|
|
|
|
1842
|
154
|
|
|
|
|
410
|
my $alphabet = delta_decode($fh); |
|
1843
|
154
|
|
|
|
|
347
|
my $enc_len = pop(@$alphabet); |
|
1844
|
|
|
|
|
|
|
|
|
1845
|
154
|
50
|
|
|
|
472
|
if ($enc_len > 0) { |
|
1846
|
154
|
|
|
|
|
453
|
my $bits = read_bits($fh, $enc_len << 3); |
|
1847
|
154
|
|
|
|
|
1828
|
open my $bits_fh, '<:raw', \$bits; |
|
1848
|
154
|
|
|
|
|
522
|
return adaptive_ac_decode($bits_fh, $alphabet); |
|
1849
|
|
|
|
|
|
|
} |
|
1850
|
|
|
|
|
|
|
|
|
1851
|
0
|
|
|
|
|
0
|
return []; |
|
1852
|
|
|
|
|
|
|
} |
|
1853
|
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
########################### |
|
1855
|
|
|
|
|
|
|
# Huffman Coding algorithm |
|
1856
|
|
|
|
|
|
|
########################### |
|
1857
|
|
|
|
|
|
|
|
|
1858
|
474
|
|
|
474
|
1
|
798
|
sub huffman_encode ($symbols, $dict) { |
|
|
474
|
|
|
|
|
953
|
|
|
|
474
|
|
|
|
|
775
|
|
|
|
474
|
|
|
|
|
795
|
|
|
1859
|
474
|
|
|
|
|
1480
|
join('', @{$dict}{@$symbols}); |
|
|
474
|
|
|
|
|
29549
|
|
|
1860
|
|
|
|
|
|
|
} |
|
1861
|
|
|
|
|
|
|
|
|
1862
|
402
|
|
|
402
|
1
|
903
|
sub huffman_decode ($bits, $rev_dict) { |
|
|
402
|
|
|
|
|
1014
|
|
|
|
402
|
|
|
|
|
837
|
|
|
|
402
|
|
|
|
|
662
|
|
|
1863
|
402
|
|
|
|
|
1022
|
local $" = '|'; |
|
1864
|
|
|
|
|
|
|
[ |
|
1865
|
402
|
|
|
|
|
839
|
split( |
|
1866
|
|
|
|
|
|
|
' ', $bits =~ s{(@{[ |
|
1867
|
3990
|
|
|
|
|
293016
|
map { $_->[1] } |
|
1868
|
14961
|
|
|
|
|
25659
|
sort { $a->[0] <=> $b->[0] } |
|
1869
|
402
|
|
|
|
|
2051
|
map { [length($_), $_] } |
|
|
3990
|
|
|
|
|
10231
|
|
|
1870
|
|
|
|
|
|
|
keys %$rev_dict] |
|
1871
|
|
|
|
|
|
|
})}{$rev_dict->{$1} }gr |
|
1872
|
|
|
|
|
|
|
) |
|
1873
|
|
|
|
|
|
|
]; |
|
1874
|
|
|
|
|
|
|
} |
|
1875
|
|
|
|
|
|
|
|
|
1876
|
|
|
|
|
|
|
# produce encode and decode dictionary from a tree |
|
1877
|
11046
|
|
|
11046
|
|
16993
|
sub _huffman_walk_tree ($node, $code, $h) { |
|
|
11046
|
|
|
|
|
28916
|
|
|
|
11046
|
|
|
|
|
17423
|
|
|
|
11046
|
|
|
|
|
16284
|
|
|
|
11046
|
|
|
|
|
17750
|
|
|
1878
|
|
|
|
|
|
|
|
|
1879
|
11046
|
|
100
|
|
|
27944
|
my $c = $node->[0] // return $h; |
|
1880
|
10823
|
100
|
|
|
|
21113
|
if (ref $c) { __SUB__->($c->[$_], $code . $_, $h) for ('0', '1') } |
|
|
5199
|
|
|
|
|
46690
|
|
|
1881
|
5624
|
|
|
|
|
15805
|
else { $h->{$c} = $code } |
|
1882
|
|
|
|
|
|
|
|
|
1883
|
10823
|
|
|
|
|
31509
|
return $h; |
|
1884
|
|
|
|
|
|
|
} |
|
1885
|
|
|
|
|
|
|
|
|
1886
|
1216
|
|
|
1216
|
1
|
2362
|
sub huffman_from_code_lengths ($code_lengths) { |
|
|
1216
|
|
|
|
|
1986
|
|
|
|
1216
|
|
|
|
|
2339
|
|
|
1887
|
|
|
|
|
|
|
|
|
1888
|
|
|
|
|
|
|
# This algorithm is based on the pseudocode in RFC 1951 (Section 3.2.2) |
|
1889
|
|
|
|
|
|
|
# (Steps are numbered as in the RFC) |
|
1890
|
|
|
|
|
|
|
|
|
1891
|
|
|
|
|
|
|
# Step 1 |
|
1892
|
1216
|
|
100
|
|
|
13387
|
my $max_length = max(@$code_lengths) // 0; |
|
1893
|
1216
|
|
|
|
|
4274
|
my @length_counts = (0) x ($max_length + 1); |
|
1894
|
1216
|
|
|
|
|
2940
|
foreach my $length (@$code_lengths) { |
|
1895
|
374861
|
|
|
|
|
669200
|
++$length_counts[$length]; |
|
1896
|
|
|
|
|
|
|
} |
|
1897
|
|
|
|
|
|
|
|
|
1898
|
|
|
|
|
|
|
# Step 2 |
|
1899
|
1216
|
|
|
|
|
2136
|
my $code = 0; |
|
1900
|
1216
|
|
|
|
|
4658
|
$length_counts[0] = 0; |
|
1901
|
1216
|
|
|
|
|
3420
|
my @next_code = (0) x ($max_length + 1); |
|
1902
|
1216
|
|
|
|
|
3489
|
foreach my $bits (1 .. $max_length) { |
|
1903
|
3173
|
|
|
|
|
5830
|
$code = ($code + $length_counts[$bits - 1]) << 1; |
|
1904
|
3173
|
|
|
|
|
6509
|
$next_code[$bits] = $code; |
|
1905
|
|
|
|
|
|
|
} |
|
1906
|
|
|
|
|
|
|
|
|
1907
|
|
|
|
|
|
|
# Step 3 |
|
1908
|
1216
|
|
|
|
|
3291
|
my @code_table; |
|
1909
|
1216
|
|
|
|
|
2103
|
foreach my $n (0 .. $#{$code_lengths}) { |
|
|
1216
|
|
|
|
|
3580
|
|
|
1910
|
374861
|
|
|
|
|
572455
|
my $length = $code_lengths->[$n]; |
|
1911
|
374861
|
100
|
|
|
|
806741
|
if ($length != 0) { |
|
1912
|
12769
|
|
|
|
|
38376
|
$code_table[$n] = sprintf('%0*b', $length, $next_code[$length]); |
|
1913
|
12769
|
|
|
|
|
27280
|
++$next_code[$length]; |
|
1914
|
|
|
|
|
|
|
} |
|
1915
|
|
|
|
|
|
|
} |
|
1916
|
|
|
|
|
|
|
|
|
1917
|
1216
|
|
|
|
|
2979
|
my %dict; |
|
1918
|
|
|
|
|
|
|
my %rev_dict; |
|
1919
|
|
|
|
|
|
|
|
|
1920
|
1216
|
|
|
|
|
2198
|
foreach my $i (0 .. $#{$code_lengths}) { |
|
|
1216
|
|
|
|
|
3662
|
|
|
1921
|
374861
|
|
|
|
|
579582
|
my $code = $code_table[$i]; |
|
1922
|
374861
|
100
|
|
|
|
863133
|
if (defined($code)) { |
|
1923
|
12769
|
|
|
|
|
36335
|
$dict{$i} = $code; |
|
1924
|
12769
|
|
|
|
|
39162
|
$rev_dict{$code} = $i; |
|
1925
|
|
|
|
|
|
|
} |
|
1926
|
|
|
|
|
|
|
} |
|
1927
|
|
|
|
|
|
|
|
|
1928
|
1216
|
100
|
|
|
|
23185
|
return (wantarray ? (\%dict, \%rev_dict) : \%dict); |
|
1929
|
|
|
|
|
|
|
} |
|
1930
|
|
|
|
|
|
|
|
|
1931
|
|
|
|
|
|
|
# make a tree, and return resulting dictionaries |
|
1932
|
648
|
|
|
648
|
1
|
1254
|
sub huffman_from_freq ($freq) { |
|
|
648
|
|
|
|
|
1792
|
|
|
|
648
|
|
|
|
|
1205
|
|
|
1933
|
|
|
|
|
|
|
|
|
1934
|
648
|
|
|
|
|
4327
|
my @nodes = map { [$_, $freq->{$_}] } sort { $a <=> $b } keys %$freq; |
|
|
5624
|
|
|
|
|
19481
|
|
|
|
21362
|
|
|
|
|
45640
|
|
|
1935
|
648
|
100
|
|
|
|
2793
|
my $max_symbol = scalar(@nodes) ? $nodes[-1][0] : -1; |
|
1936
|
|
|
|
|
|
|
|
|
1937
|
648
|
|
|
|
|
1229
|
do { # poor man's priority queue |
|
1938
|
5277
|
|
|
|
|
14759
|
@nodes = sort { $a->[1] <=> $b->[1] } @nodes; |
|
|
232852
|
|
|
|
|
499387
|
|
|
1939
|
5277
|
|
|
|
|
14164
|
my ($x, $y) = splice(@nodes, 0, 2); |
|
1940
|
5277
|
100
|
|
|
|
13741
|
if (defined($x)) { |
|
1941
|
5199
|
100
|
|
|
|
11790
|
if (defined($y)) { |
|
1942
|
5054
|
|
|
|
|
25856
|
push @nodes, [[$x, $y], $x->[1] + $y->[1]]; |
|
1943
|
|
|
|
|
|
|
} |
|
1944
|
|
|
|
|
|
|
else { |
|
1945
|
145
|
|
|
|
|
829
|
push @nodes, [[$x], $x->[1]]; |
|
1946
|
|
|
|
|
|
|
} |
|
1947
|
|
|
|
|
|
|
} |
|
1948
|
|
|
|
|
|
|
} while (@nodes > 1); |
|
1949
|
|
|
|
|
|
|
|
|
1950
|
648
|
|
|
|
|
2727
|
my $h = _huffman_walk_tree($nodes[0], '', {}); |
|
1951
|
|
|
|
|
|
|
|
|
1952
|
648
|
|
|
|
|
1504
|
my @code_lengths; |
|
1953
|
648
|
|
|
|
|
2563
|
foreach my $i (0 .. $max_symbol) { |
|
1954
|
192447
|
100
|
|
|
|
385084
|
if (exists $h->{$i}) { |
|
1955
|
5624
|
|
|
|
|
14058
|
$code_lengths[$i] = length($h->{$i}); |
|
1956
|
|
|
|
|
|
|
} |
|
1957
|
|
|
|
|
|
|
else { |
|
1958
|
186823
|
|
|
|
|
370156
|
$code_lengths[$i] = 0; |
|
1959
|
|
|
|
|
|
|
} |
|
1960
|
|
|
|
|
|
|
} |
|
1961
|
|
|
|
|
|
|
|
|
1962
|
648
|
|
|
|
|
2268
|
huffman_from_code_lengths(\@code_lengths); |
|
1963
|
|
|
|
|
|
|
} |
|
1964
|
|
|
|
|
|
|
|
|
1965
|
594
|
|
|
594
|
1
|
1416
|
sub huffman_from_symbols ($symbols) { |
|
|
594
|
|
|
|
|
1068
|
|
|
|
594
|
|
|
|
|
963
|
|
|
1966
|
|
|
|
|
|
|
|
|
1967
|
594
|
50
|
|
|
|
2019
|
if (ref($symbols) eq '') { |
|
1968
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
1969
|
|
|
|
|
|
|
} |
|
1970
|
|
|
|
|
|
|
|
|
1971
|
594
|
|
|
|
|
1919
|
huffman_from_freq(frequencies($symbols)); |
|
1972
|
|
|
|
|
|
|
} |
|
1973
|
|
|
|
|
|
|
|
|
1974
|
|
|
|
|
|
|
######################## |
|
1975
|
|
|
|
|
|
|
# Huffman Coding entries |
|
1976
|
|
|
|
|
|
|
######################## |
|
1977
|
|
|
|
|
|
|
|
|
1978
|
474
|
|
|
474
|
1
|
1119
|
sub create_huffman_entry ($symbols) { |
|
|
474
|
|
|
|
|
881
|
|
|
|
474
|
|
|
|
|
846
|
|
|
1979
|
|
|
|
|
|
|
|
|
1980
|
474
|
50
|
|
|
|
1661
|
if (ref($symbols) eq '') { |
|
1981
|
0
|
|
|
|
|
0
|
$symbols = string2symbols($symbols); |
|
1982
|
|
|
|
|
|
|
} |
|
1983
|
|
|
|
|
|
|
|
|
1984
|
474
|
|
|
|
|
1605
|
my $dict = huffman_from_symbols($symbols); |
|
1985
|
474
|
|
|
|
|
2697
|
my $enc = huffman_encode($symbols, $dict); |
|
1986
|
|
|
|
|
|
|
|
|
1987
|
474
|
|
100
|
|
|
6170
|
my $max_symbol = max(keys %$dict) // 0; |
|
1988
|
474
|
50
|
|
|
|
1732
|
$VERBOSE && say STDERR "Max symbol: $max_symbol\n"; |
|
1989
|
|
|
|
|
|
|
|
|
1990
|
474
|
|
|
|
|
887
|
my @code_lengths; |
|
1991
|
474
|
|
|
|
|
1493
|
foreach my $i (0 .. $max_symbol) { |
|
1992
|
176492
|
100
|
|
|
|
351580
|
if (exists($dict->{$i})) { |
|
1993
|
3990
|
|
|
|
|
9360
|
$code_lengths[$i] = length($dict->{$i}); |
|
1994
|
|
|
|
|
|
|
} |
|
1995
|
|
|
|
|
|
|
else { |
|
1996
|
172502
|
|
|
|
|
328274
|
$code_lengths[$i] = 0; |
|
1997
|
|
|
|
|
|
|
} |
|
1998
|
|
|
|
|
|
|
} |
|
1999
|
|
|
|
|
|
|
|
|
2000
|
474
|
|
|
|
|
1686
|
delta_encode(\@code_lengths) . pack("N", length($enc)) . pack("B*", $enc); |
|
2001
|
|
|
|
|
|
|
} |
|
2002
|
|
|
|
|
|
|
|
|
2003
|
475
|
|
|
475
|
1
|
827
|
sub decode_huffman_entry ($fh) { |
|
|
475
|
|
|
|
|
896
|
|
|
|
475
|
|
|
|
|
1025
|
|
|
2004
|
|
|
|
|
|
|
|
|
2005
|
475
|
100
|
|
|
|
1568
|
if (ref($fh) eq '') { |
|
2006
|
1
|
50
|
|
|
|
37
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2007
|
1
|
|
|
|
|
10
|
return __SUB__->($fh2); |
|
2008
|
|
|
|
|
|
|
} |
|
2009
|
|
|
|
|
|
|
|
|
2010
|
474
|
|
|
|
|
1547
|
my $code_lengths = delta_decode($fh); |
|
2011
|
474
|
|
|
|
|
1361
|
my (undef, $rev_dict) = huffman_from_code_lengths($code_lengths); |
|
2012
|
|
|
|
|
|
|
|
|
2013
|
474
|
|
|
|
|
3017
|
my $enc_len = bytes2int($fh, 4); |
|
2014
|
474
|
50
|
|
|
|
1637
|
$VERBOSE && say STDERR "Encoded length: $enc_len\n"; |
|
2015
|
|
|
|
|
|
|
|
|
2016
|
474
|
100
|
|
|
|
1659
|
if ($enc_len > 0) { |
|
2017
|
402
|
|
|
|
|
1373
|
return huffman_decode(read_bits($fh, $enc_len), $rev_dict); |
|
2018
|
|
|
|
|
|
|
} |
|
2019
|
|
|
|
|
|
|
|
|
2020
|
72
|
|
|
|
|
730
|
return []; |
|
2021
|
|
|
|
|
|
|
} |
|
2022
|
|
|
|
|
|
|
|
|
2023
|
|
|
|
|
|
|
################################################################################### |
|
2024
|
|
|
|
|
|
|
# DEFLATE-like encoding of literals and backreferences produced by the LZSS methods |
|
2025
|
|
|
|
|
|
|
################################################################################### |
|
2026
|
|
|
|
|
|
|
|
|
2027
|
714
|
|
|
714
|
1
|
1167
|
sub make_deflate_tables ($max_dist = $LZ_MAX_DIST, $max_len = $LZ_MAX_LEN) { |
|
|
714
|
|
|
|
|
1583
|
|
|
|
714
|
|
|
|
|
1280
|
|
|
|
714
|
|
|
|
|
1298
|
|
|
2028
|
|
|
|
|
|
|
|
|
2029
|
|
|
|
|
|
|
# [distance value, offset bits] |
|
2030
|
714
|
|
|
|
|
1917
|
my @DISTANCE_SYMBOLS = map { [$_, 0] } (0 .. 4); |
|
|
3570
|
|
|
|
|
8164
|
|
|
2031
|
|
|
|
|
|
|
|
|
2032
|
714
|
|
|
|
|
2820
|
until ($DISTANCE_SYMBOLS[-1][0] > $max_dist) { |
|
2033
|
1952
|
|
|
|
|
5817
|
push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (4 / 3)), $DISTANCE_SYMBOLS[-1][1] + 1]; |
|
2034
|
1952
|
|
|
|
|
7157
|
push @DISTANCE_SYMBOLS, [int($DISTANCE_SYMBOLS[-1][0] * (3 / 2)), $DISTANCE_SYMBOLS[-1][1]]; |
|
2035
|
|
|
|
|
|
|
} |
|
2036
|
|
|
|
|
|
|
|
|
2037
|
|
|
|
|
|
|
# [length, offset bits] |
|
2038
|
714
|
|
|
|
|
1588
|
my @LENGTH_SYMBOLS = ((map { [$_, 0] } (1 .. 10))); |
|
|
7140
|
|
|
|
|
18514
|
|
|
2039
|
|
|
|
|
|
|
|
|
2040
|
|
|
|
|
|
|
{ |
|
2041
|
714
|
|
|
|
|
1331
|
my $delta = 1; |
|
|
714
|
|
|
|
|
1299
|
|
|
2042
|
714
|
|
|
|
|
2160
|
until ($LENGTH_SYMBOLS[-1][0] > $max_len) { |
|
2043
|
286
|
|
|
|
|
866
|
push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1] + 1]; |
|
2044
|
286
|
|
|
|
|
511
|
$delta *= 2; |
|
2045
|
286
|
|
|
|
|
736
|
push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]]; |
|
2046
|
286
|
|
|
|
|
673
|
push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]]; |
|
2047
|
286
|
|
|
|
|
3039
|
push @LENGTH_SYMBOLS, [$LENGTH_SYMBOLS[-1][0] + $delta, $LENGTH_SYMBOLS[-1][1]]; |
|
2048
|
|
|
|
|
|
|
} |
|
2049
|
714
|
|
100
|
|
|
3727
|
while (@LENGTH_SYMBOLS and $LENGTH_SYMBOLS[-1][0] >= $max_len) { |
|
2050
|
6186
|
|
|
|
|
20920
|
pop @LENGTH_SYMBOLS; |
|
2051
|
|
|
|
|
|
|
} |
|
2052
|
714
|
|
|
|
|
1780
|
push @LENGTH_SYMBOLS, [$max_len, 0]; |
|
2053
|
|
|
|
|
|
|
} |
|
2054
|
|
|
|
|
|
|
|
|
2055
|
714
|
|
|
|
|
1571
|
my @LENGTH_INDICES; |
|
2056
|
|
|
|
|
|
|
|
|
2057
|
714
|
|
|
|
|
2478
|
foreach my $i (0 .. $#LENGTH_SYMBOLS) { |
|
2058
|
2812
|
|
|
|
|
4248
|
my ($min, $bits) = @{$LENGTH_SYMBOLS[$i]}; |
|
|
2812
|
|
|
|
|
10971
|
|
|
2059
|
2812
|
|
|
|
|
6777
|
foreach my $k ($min .. $min + (1 << $bits) - 1) { |
|
2060
|
14142
|
|
|
|
|
31421
|
$LENGTH_INDICES[$k] = $i; |
|
2061
|
|
|
|
|
|
|
} |
|
2062
|
|
|
|
|
|
|
} |
|
2063
|
|
|
|
|
|
|
|
|
2064
|
714
|
|
|
|
|
2738
|
return (\@DISTANCE_SYMBOLS, \@LENGTH_SYMBOLS, \@LENGTH_INDICES); |
|
2065
|
|
|
|
|
|
|
} |
|
2066
|
|
|
|
|
|
|
|
|
2067
|
21702
|
|
|
21702
|
1
|
33508
|
sub find_deflate_index ($value, $table) { |
|
|
21702
|
|
|
|
|
37794
|
|
|
|
21702
|
|
|
|
|
44488
|
|
|
|
21702
|
|
|
|
|
39548
|
|
|
2068
|
21702
|
|
|
|
|
34016
|
foreach my $i (0 .. $#{$table}) { |
|
|
21702
|
|
|
|
|
73154
|
|
|
2069
|
374967
|
100
|
|
|
|
944753
|
if ($table->[$i][0] > $value) { |
|
2070
|
21702
|
|
|
|
|
78380
|
return $i - 1; |
|
2071
|
|
|
|
|
|
|
} |
|
2072
|
|
|
|
|
|
|
} |
|
2073
|
0
|
|
|
|
|
0
|
confess "error"; |
|
2074
|
|
|
|
|
|
|
} |
|
2075
|
|
|
|
|
|
|
|
|
2076
|
97
|
|
|
97
|
1
|
196
|
sub deflate_encode ($literals, $distances, $lengths, $entropy_sub = \&create_huffman_entry) { |
|
|
97
|
|
|
|
|
182
|
|
|
|
97
|
|
|
|
|
254
|
|
|
|
97
|
|
|
|
|
228
|
|
|
|
97
|
|
|
|
|
256
|
|
|
|
97
|
|
|
|
|
196
|
|
|
2077
|
|
|
|
|
|
|
|
|
2078
|
97
|
|
100
|
|
|
5546
|
my $max_dist = max(@$distances) // 0; |
|
2079
|
97
|
|
100
|
|
|
1124
|
my $max_len = max(@$lengths) // 0; |
|
2080
|
97
|
|
100
|
|
|
410
|
my $max_symbol = (max(grep { defined($_) } @$literals) // -1) + 1; |
|
|
12682
|
|
|
|
|
28583
|
|
|
2081
|
|
|
|
|
|
|
|
|
2082
|
97
|
|
|
|
|
409
|
my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = make_deflate_tables($max_dist, $max_len); |
|
2083
|
|
|
|
|
|
|
|
|
2084
|
97
|
|
|
|
|
245
|
my @len_symbols; |
|
2085
|
|
|
|
|
|
|
my @dist_symbols; |
|
2086
|
97
|
|
|
|
|
244
|
my $offset_bits = ''; |
|
2087
|
|
|
|
|
|
|
|
|
2088
|
97
|
|
|
|
|
376
|
foreach my $k (0 .. $#$literals) { |
|
2089
|
|
|
|
|
|
|
|
|
2090
|
12682
|
100
|
|
|
|
35001
|
if ($lengths->[$k] == 0) { |
|
2091
|
8710
|
|
|
|
|
26936
|
push @len_symbols, $literals->[$k]; |
|
2092
|
8710
|
|
|
|
|
19649
|
next; |
|
2093
|
|
|
|
|
|
|
} |
|
2094
|
|
|
|
|
|
|
|
|
2095
|
3972
|
|
|
|
|
7722
|
my $len = $lengths->[$k]; |
|
2096
|
3972
|
|
|
|
|
7851
|
my $dist = $distances->[$k]; |
|
2097
|
|
|
|
|
|
|
|
|
2098
|
|
|
|
|
|
|
{ |
|
2099
|
3972
|
|
|
|
|
7586
|
my $len_idx = $LENGTH_INDICES->[$len]; |
|
2100
|
3972
|
|
|
|
|
6527
|
my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]}; |
|
|
3972
|
|
|
|
|
10590
|
|
|
2101
|
|
|
|
|
|
|
|
|
2102
|
3972
|
|
|
|
|
14456
|
push @len_symbols, $len_idx + $max_symbol; |
|
2103
|
|
|
|
|
|
|
|
|
2104
|
3972
|
100
|
|
|
|
11897
|
if ($bits > 0) { |
|
2105
|
1497
|
|
|
|
|
4842
|
$offset_bits .= sprintf('%0*b', $bits, $len - $min); |
|
2106
|
|
|
|
|
|
|
} |
|
2107
|
|
|
|
|
|
|
} |
|
2108
|
|
|
|
|
|
|
|
|
2109
|
|
|
|
|
|
|
{ |
|
2110
|
3972
|
|
|
|
|
6535
|
my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS); |
|
|
3972
|
|
|
|
|
6619
|
|
|
|
3972
|
|
|
|
|
9403
|
|
|
2111
|
3972
|
|
|
|
|
7433
|
my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]}; |
|
|
3972
|
|
|
|
|
10758
|
|
|
2112
|
|
|
|
|
|
|
|
|
2113
|
3972
|
|
|
|
|
8408
|
push @dist_symbols, $dist_idx; |
|
2114
|
|
|
|
|
|
|
|
|
2115
|
3972
|
100
|
|
|
|
10055
|
if ($bits > 0) { |
|
2116
|
3753
|
|
|
|
|
16412
|
$offset_bits .= sprintf('%0*b', $bits, $dist - $min); |
|
2117
|
|
|
|
|
|
|
} |
|
2118
|
|
|
|
|
|
|
} |
|
2119
|
|
|
|
|
|
|
} |
|
2120
|
|
|
|
|
|
|
|
|
2121
|
97
|
|
|
|
|
561
|
fibonacci_encode([$max_symbol, $max_dist, $max_len]) . $entropy_sub->(\@len_symbols) . $entropy_sub->(\@dist_symbols) . pack('B*', $offset_bits); |
|
2122
|
|
|
|
|
|
|
} |
|
2123
|
|
|
|
|
|
|
|
|
2124
|
97
|
|
|
97
|
1
|
200
|
sub deflate_decode ($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
97
|
|
|
|
|
180
|
|
|
|
97
|
|
|
|
|
172
|
|
|
|
97
|
|
|
|
|
198
|
|
|
2125
|
|
|
|
|
|
|
|
|
2126
|
97
|
50
|
|
|
|
421
|
if (ref($fh) eq '') { |
|
2127
|
0
|
0
|
|
|
|
0
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2128
|
0
|
|
|
|
|
0
|
return __SUB__->($fh2, $entropy_sub); |
|
2129
|
|
|
|
|
|
|
} |
|
2130
|
|
|
|
|
|
|
|
|
2131
|
97
|
|
|
|
|
211
|
my ($max_symbol, $max_dist, $max_len) = @{fibonacci_decode($fh)}; |
|
|
97
|
|
|
|
|
375
|
|
|
2132
|
97
|
|
|
|
|
481
|
my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS) = make_deflate_tables($max_dist, $max_len); |
|
2133
|
|
|
|
|
|
|
|
|
2134
|
97
|
|
|
|
|
560
|
my $len_symbols = $entropy_sub->($fh); |
|
2135
|
97
|
|
|
|
|
410
|
my $dist_symbols = $entropy_sub->($fh); |
|
2136
|
|
|
|
|
|
|
|
|
2137
|
97
|
|
|
|
|
274
|
my $bits_len = 0; |
|
2138
|
|
|
|
|
|
|
|
|
2139
|
97
|
|
|
|
|
322
|
foreach my $i (@$dist_symbols) { |
|
2140
|
3972
|
|
|
|
|
7603
|
$bits_len += $DISTANCE_SYMBOLS->[$i][1]; |
|
2141
|
|
|
|
|
|
|
} |
|
2142
|
|
|
|
|
|
|
|
|
2143
|
97
|
|
|
|
|
301
|
foreach my $i (@$len_symbols) { |
|
2144
|
12682
|
100
|
|
|
|
32292
|
if ($i >= $max_symbol) { |
|
2145
|
3972
|
|
|
|
|
9842
|
$bits_len += $LENGTH_SYMBOLS->[$i - $max_symbol][1]; |
|
2146
|
|
|
|
|
|
|
} |
|
2147
|
|
|
|
|
|
|
} |
|
2148
|
|
|
|
|
|
|
|
|
2149
|
97
|
|
|
|
|
341
|
my $bits = read_bits($fh, $bits_len); |
|
2150
|
|
|
|
|
|
|
|
|
2151
|
97
|
|
|
|
|
367
|
my @literals; |
|
2152
|
|
|
|
|
|
|
my @lengths; |
|
2153
|
97
|
|
|
|
|
0
|
my @distances; |
|
2154
|
|
|
|
|
|
|
|
|
2155
|
97
|
|
|
|
|
204
|
my $j = 0; |
|
2156
|
|
|
|
|
|
|
|
|
2157
|
97
|
|
|
|
|
280
|
foreach my $i (@$len_symbols) { |
|
2158
|
12682
|
100
|
|
|
|
26437
|
if ($i >= $max_symbol) { |
|
2159
|
3972
|
|
|
|
|
9079
|
my $dist = $dist_symbols->[$j++]; |
|
2160
|
3972
|
|
|
|
|
6887
|
push @literals, undef; |
|
2161
|
3972
|
|
|
|
|
17551
|
push @lengths, $LENGTH_SYMBOLS->[$i - $max_symbol][0] + oct('0b' . substr($bits, 0, $LENGTH_SYMBOLS->[$i - $max_symbol][1], '')); |
|
2162
|
3972
|
|
|
|
|
19244
|
push @distances, $DISTANCE_SYMBOLS->[$dist][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS->[$dist][1], '')); |
|
2163
|
|
|
|
|
|
|
} |
|
2164
|
|
|
|
|
|
|
else { |
|
2165
|
8710
|
|
|
|
|
19093
|
push @literals, $i; |
|
2166
|
8710
|
|
|
|
|
14380
|
push @lengths, 0; |
|
2167
|
8710
|
|
|
|
|
18134
|
push @distances, 0; |
|
2168
|
|
|
|
|
|
|
} |
|
2169
|
|
|
|
|
|
|
} |
|
2170
|
|
|
|
|
|
|
|
|
2171
|
97
|
|
|
|
|
5934
|
return (\@literals, \@distances, \@lengths); |
|
2172
|
|
|
|
|
|
|
} |
|
2173
|
|
|
|
|
|
|
|
|
2174
|
|
|
|
|
|
|
##################### |
|
2175
|
|
|
|
|
|
|
# Elias gamma coding |
|
2176
|
|
|
|
|
|
|
##################### |
|
2177
|
|
|
|
|
|
|
|
|
2178
|
15
|
|
|
15
|
1
|
48
|
sub elias_gamma_encode ($integers) { |
|
|
15
|
|
|
|
|
32
|
|
|
|
15
|
|
|
|
|
30
|
|
|
2179
|
|
|
|
|
|
|
|
|
2180
|
15
|
|
|
|
|
35
|
my $bitstring = ''; |
|
2181
|
15
|
|
|
|
|
66
|
foreach my $k (scalar(@$integers), @$integers) { |
|
2182
|
1322
|
|
|
|
|
2536
|
my $t = sprintf('%b', $k + 1); |
|
2183
|
1322
|
|
|
|
|
3781
|
$bitstring .= ('1' x (length($t) - 1)) . '0' . substr($t, 1); |
|
2184
|
|
|
|
|
|
|
} |
|
2185
|
|
|
|
|
|
|
|
|
2186
|
15
|
|
|
|
|
171
|
pack('B*', $bitstring); |
|
2187
|
|
|
|
|
|
|
} |
|
2188
|
|
|
|
|
|
|
|
|
2189
|
28
|
|
|
28
|
1
|
47
|
sub elias_gamma_decode ($fh) { |
|
|
28
|
|
|
|
|
50
|
|
|
|
28
|
|
|
|
|
43
|
|
|
2190
|
|
|
|
|
|
|
|
|
2191
|
28
|
100
|
|
|
|
95
|
if (ref($fh) eq '') { |
|
2192
|
13
|
50
|
|
|
|
189
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2193
|
13
|
|
|
|
|
42
|
return __SUB__->($fh2); |
|
2194
|
|
|
|
|
|
|
} |
|
2195
|
|
|
|
|
|
|
|
|
2196
|
15
|
|
|
|
|
46
|
my @ints; |
|
2197
|
15
|
|
|
|
|
29
|
my $len = 0; |
|
2198
|
15
|
|
|
|
|
32
|
my $buffer = ''; |
|
2199
|
|
|
|
|
|
|
|
|
2200
|
15
|
|
|
|
|
58
|
for (my $k = 0 ; $k <= $len ; ++$k) { |
|
2201
|
|
|
|
|
|
|
|
|
2202
|
1322
|
|
|
|
|
2402
|
my $n = 0; |
|
2203
|
1322
|
|
|
|
|
2575
|
++$n while (read_bit($fh, \$buffer) eq '1'); |
|
2204
|
|
|
|
|
|
|
|
|
2205
|
1322
|
|
|
|
|
3457
|
push @ints, oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $n)) - 1; |
|
|
8146
|
|
|
|
|
17875
|
|
|
2206
|
|
|
|
|
|
|
|
|
2207
|
1322
|
100
|
|
|
|
5520
|
if ($k == 0) { |
|
2208
|
15
|
|
|
|
|
52
|
$len = pop(@ints); |
|
2209
|
|
|
|
|
|
|
} |
|
2210
|
|
|
|
|
|
|
} |
|
2211
|
|
|
|
|
|
|
|
|
2212
|
15
|
|
|
|
|
194
|
return \@ints; |
|
2213
|
|
|
|
|
|
|
} |
|
2214
|
|
|
|
|
|
|
|
|
2215
|
|
|
|
|
|
|
##################### |
|
2216
|
|
|
|
|
|
|
# Elias omega coding |
|
2217
|
|
|
|
|
|
|
##################### |
|
2218
|
|
|
|
|
|
|
|
|
2219
|
15
|
|
|
15
|
1
|
46
|
sub elias_omega_encode ($integers) { |
|
|
15
|
|
|
|
|
32
|
|
|
|
15
|
|
|
|
|
28
|
|
|
2220
|
|
|
|
|
|
|
|
|
2221
|
15
|
|
|
|
|
43
|
my $bitstring = ''; |
|
2222
|
15
|
|
|
|
|
89
|
foreach my $k (scalar(@$integers), @$integers) { |
|
2223
|
1341
|
100
|
|
|
|
2650
|
if ($k == 0) { |
|
2224
|
3
|
|
|
|
|
8
|
$bitstring .= '0'; |
|
2225
|
|
|
|
|
|
|
} |
|
2226
|
|
|
|
|
|
|
else { |
|
2227
|
1338
|
|
|
|
|
2536
|
my $t = sprintf('%b', $k + 1); |
|
2228
|
1338
|
|
|
|
|
2172
|
my $l = length($t); |
|
2229
|
1338
|
|
|
|
|
2448
|
my $L = sprintf('%b', $l); |
|
2230
|
1338
|
|
|
|
|
3608
|
$bitstring .= ('1' x (length($L) - 1)) . '0' . substr($L, 1) . substr($t, 1); |
|
2231
|
|
|
|
|
|
|
} |
|
2232
|
|
|
|
|
|
|
} |
|
2233
|
|
|
|
|
|
|
|
|
2234
|
15
|
|
|
|
|
226
|
pack('B*', $bitstring); |
|
2235
|
|
|
|
|
|
|
} |
|
2236
|
|
|
|
|
|
|
|
|
2237
|
28
|
|
|
28
|
1
|
53
|
sub elias_omega_decode ($fh) { |
|
|
28
|
|
|
|
|
47
|
|
|
|
28
|
|
|
|
|
50
|
|
|
2238
|
|
|
|
|
|
|
|
|
2239
|
28
|
100
|
|
|
|
103
|
if (ref($fh) eq '') { |
|
2240
|
13
|
50
|
|
|
|
260
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2241
|
13
|
|
|
|
|
50
|
return __SUB__->($fh2); |
|
2242
|
|
|
|
|
|
|
} |
|
2243
|
|
|
|
|
|
|
|
|
2244
|
15
|
|
|
|
|
32
|
my @ints; |
|
2245
|
15
|
|
|
|
|
60
|
my $len = 0; |
|
2246
|
15
|
|
|
|
|
35
|
my $buffer = ''; |
|
2247
|
|
|
|
|
|
|
|
|
2248
|
15
|
|
|
|
|
60
|
for (my $k = 0 ; $k <= $len ; ++$k) { |
|
2249
|
|
|
|
|
|
|
|
|
2250
|
1341
|
|
|
|
|
2232
|
my $bl = 0; |
|
2251
|
1341
|
|
|
|
|
2612
|
++$bl while (read_bit($fh, \$buffer) eq '1'); |
|
2252
|
|
|
|
|
|
|
|
|
2253
|
1341
|
100
|
|
|
|
4249
|
if ($bl > 0) { |
|
2254
|
|
|
|
|
|
|
|
|
2255
|
1338
|
|
|
|
|
2773
|
my $bl2 = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. $bl)); |
|
|
2806
|
|
|
|
|
6875
|
|
|
2256
|
1338
|
|
|
|
|
3186
|
my $int = oct('0b1' . join('', map { read_bit($fh, \$buffer) } 1 .. ($bl2 - 1))) - 1; |
|
|
8281
|
|
|
|
|
13980
|
|
|
2257
|
|
|
|
|
|
|
|
|
2258
|
1338
|
|
|
|
|
3820
|
push @ints, $int; |
|
2259
|
|
|
|
|
|
|
} |
|
2260
|
|
|
|
|
|
|
else { |
|
2261
|
3
|
|
|
|
|
6
|
push @ints, 0; |
|
2262
|
|
|
|
|
|
|
} |
|
2263
|
|
|
|
|
|
|
|
|
2264
|
1341
|
100
|
|
|
|
4453
|
if ($k == 0) { |
|
2265
|
15
|
|
|
|
|
54
|
$len = pop(@ints); |
|
2266
|
|
|
|
|
|
|
} |
|
2267
|
|
|
|
|
|
|
} |
|
2268
|
|
|
|
|
|
|
|
|
2269
|
15
|
|
|
|
|
162
|
return \@ints; |
|
2270
|
|
|
|
|
|
|
} |
|
2271
|
|
|
|
|
|
|
|
|
2272
|
|
|
|
|
|
|
################### |
|
2273
|
|
|
|
|
|
|
# LZSS SYMBOLIC |
|
2274
|
|
|
|
|
|
|
################### |
|
2275
|
|
|
|
|
|
|
|
|
2276
|
153
|
|
|
153
|
1
|
344
|
sub lzss_encode_symbolic($symbols, %params) { |
|
|
153
|
|
|
|
|
294
|
|
|
|
153
|
|
|
|
|
267
|
|
|
|
153
|
|
|
|
|
241
|
|
|
2277
|
|
|
|
|
|
|
|
|
2278
|
153
|
50
|
|
|
|
511
|
if (ref($symbols) eq '') { |
|
2279
|
0
|
|
|
|
|
0
|
return lzss_encode($symbols, %params); |
|
2280
|
|
|
|
|
|
|
} |
|
2281
|
|
|
|
|
|
|
|
|
2282
|
153
|
|
33
|
|
|
899
|
my $min_len = $params{min_len} // $LZ_MIN_LEN; |
|
2283
|
153
|
|
33
|
|
|
697
|
my $max_len = $params{max_len} // $LZ_MAX_LEN; |
|
2284
|
153
|
|
33
|
|
|
621
|
my $max_dist = $params{max_dist} // $LZ_MAX_DIST; |
|
2285
|
153
|
|
33
|
|
|
601
|
my $max_chain_len = $params{max_chain_len} // $LZ_MAX_CHAIN_LEN; |
|
2286
|
|
|
|
|
|
|
|
|
2287
|
153
|
|
|
|
|
379
|
my $end = $#$symbols; |
|
2288
|
153
|
|
|
|
|
382
|
my (@literals, @distances, @lengths, %table); |
|
2289
|
|
|
|
|
|
|
|
|
2290
|
153
|
|
|
|
|
555
|
for (my $la = 0 ; $la <= $end ;) { |
|
2291
|
4031
|
|
|
|
|
5955
|
my $best_n = 1; |
|
2292
|
4031
|
|
|
|
|
5909
|
my $best_p = $la; |
|
2293
|
|
|
|
|
|
|
|
|
2294
|
4031
|
|
|
|
|
6161
|
my $upto = $la + $min_len - 1; |
|
2295
|
4031
|
100
|
|
|
|
8991
|
my $lookahead = join(' ', @{$symbols}[$la .. ($upto > $end ? $end : $upto)]); |
|
|
4031
|
|
|
|
|
11919
|
|
|
2296
|
|
|
|
|
|
|
|
|
2297
|
4031
|
100
|
|
|
|
10128
|
if (exists $table{$lookahead}) { |
|
2298
|
|
|
|
|
|
|
|
|
2299
|
430
|
|
|
|
|
738
|
foreach my $p (@{$table{$lookahead}}) { |
|
|
430
|
|
|
|
|
1192
|
|
|
2300
|
|
|
|
|
|
|
|
|
2301
|
1030
|
50
|
|
|
|
2489
|
last if ($la - $p > $max_dist); |
|
2302
|
|
|
|
|
|
|
|
|
2303
|
1030
|
|
|
|
|
1687
|
my $n = $min_len; |
|
2304
|
|
|
|
|
|
|
|
|
2305
|
1030
|
|
100
|
|
|
143796
|
++$n while ($la + $n <= $end and $symbols->[$la + $n - 1] == $symbols->[$p + $n - 1] and $n <= $max_len); |
|
|
|
|
66
|
|
|
|
|
|
2306
|
|
|
|
|
|
|
|
|
2307
|
1030
|
100
|
|
|
|
2778
|
if ($n > $best_n) { |
|
2308
|
509
|
|
|
|
|
1014
|
$best_n = $n; |
|
2309
|
509
|
|
|
|
|
712
|
$best_p = $p; |
|
2310
|
509
|
50
|
|
|
|
1530
|
last if ($n > $max_len); |
|
2311
|
|
|
|
|
|
|
} |
|
2312
|
|
|
|
|
|
|
} |
|
2313
|
|
|
|
|
|
|
} |
|
2314
|
|
|
|
|
|
|
|
|
2315
|
4031
|
100
|
|
|
|
7623
|
if ($best_n == 1) { |
|
2316
|
3601
|
|
|
|
|
9696
|
$table{$lookahead} = [$la]; |
|
2317
|
|
|
|
|
|
|
} |
|
2318
|
|
|
|
|
|
|
else { |
|
2319
|
430
|
|
|
|
|
1727
|
my @matched = @{$symbols}[$la .. $la + $best_n - 1]; |
|
|
430
|
|
|
|
|
4099
|
|
|
2320
|
430
|
|
|
|
|
1928
|
my @key_arr = @matched[0 .. $min_len - 1]; |
|
2321
|
|
|
|
|
|
|
|
|
2322
|
430
|
|
|
|
|
1292
|
foreach my $i (0 .. scalar(@matched) - $min_len) { |
|
2323
|
|
|
|
|
|
|
|
|
2324
|
10588
|
|
|
|
|
25933
|
my $key = join(' ', @key_arr); |
|
2325
|
10588
|
|
|
|
|
16962
|
unshift @{$table{$key}}, $la + $i; |
|
|
10588
|
|
|
|
|
23488
|
|
|
2326
|
10588
|
100
|
|
|
|
17578
|
pop @{$table{$key}} if (@{$table{$key}} > $max_chain_len); |
|
|
7870
|
|
|
|
|
16501
|
|
|
|
10588
|
|
|
|
|
28536
|
|
|
2327
|
|
|
|
|
|
|
|
|
2328
|
10588
|
|
|
|
|
19473
|
shift(@key_arr); |
|
2329
|
10588
|
|
|
|
|
28824
|
push @key_arr, $matched[$i + $min_len]; |
|
2330
|
|
|
|
|
|
|
} |
|
2331
|
|
|
|
|
|
|
} |
|
2332
|
|
|
|
|
|
|
|
|
2333
|
4031
|
100
|
|
|
|
9312
|
if ($best_n > $min_len) { |
|
|
|
100
|
|
|
|
|
|
|
2334
|
|
|
|
|
|
|
|
|
2335
|
420
|
|
|
|
|
859
|
push @lengths, $best_n - 1; |
|
2336
|
420
|
|
|
|
|
831
|
push @distances, $la - $best_p; |
|
2337
|
420
|
|
|
|
|
794
|
push @literals, undef; |
|
2338
|
|
|
|
|
|
|
|
|
2339
|
420
|
|
|
|
|
1431
|
$la += $best_n - 1; |
|
2340
|
|
|
|
|
|
|
} |
|
2341
|
|
|
|
|
|
|
elsif ($best_n == 1) { |
|
2342
|
3601
|
|
|
|
|
6327
|
push @lengths, 0; |
|
2343
|
3601
|
|
|
|
|
5655
|
push @distances, 0; |
|
2344
|
3601
|
|
|
|
|
11318
|
push @literals, $symbols->[$la++]; |
|
2345
|
|
|
|
|
|
|
} |
|
2346
|
|
|
|
|
|
|
else { |
|
2347
|
|
|
|
|
|
|
|
|
2348
|
10
|
|
|
|
|
34
|
push @lengths, (0) x $best_n; |
|
2349
|
10
|
|
|
|
|
31
|
push @distances, (0) x $best_n; |
|
2350
|
10
|
|
|
|
|
27
|
push @literals, @{$symbols}[$la .. $la + $best_n - 1]; |
|
|
10
|
|
|
|
|
35
|
|
|
2351
|
|
|
|
|
|
|
|
|
2352
|
10
|
|
|
|
|
34
|
$la += $best_n; |
|
2353
|
|
|
|
|
|
|
} |
|
2354
|
|
|
|
|
|
|
} |
|
2355
|
|
|
|
|
|
|
|
|
2356
|
153
|
|
|
|
|
2702
|
return (\@literals, \@distances, \@lengths); |
|
2357
|
|
|
|
|
|
|
} |
|
2358
|
|
|
|
|
|
|
|
|
2359
|
90
|
|
|
90
|
1
|
215
|
sub lzss_decode_symbolic ($literals, $distances, $lengths) { |
|
|
90
|
|
|
|
|
178
|
|
|
|
90
|
|
|
|
|
145
|
|
|
|
90
|
|
|
|
|
167
|
|
|
|
90
|
|
|
|
|
152
|
|
|
2360
|
|
|
|
|
|
|
|
|
2361
|
90
|
|
|
|
|
217
|
my @data; |
|
2362
|
90
|
|
|
|
|
179
|
my $data_len = 0; |
|
2363
|
|
|
|
|
|
|
|
|
2364
|
90
|
|
|
|
|
340
|
foreach my $i (0 .. $#$lengths) { |
|
2365
|
|
|
|
|
|
|
|
|
2366
|
3215
|
100
|
|
|
|
6788
|
if ($lengths->[$i] == 0) { |
|
2367
|
2826
|
|
|
|
|
5777
|
push @data, $literals->[$i]; |
|
2368
|
2826
|
|
|
|
|
4779
|
$data_len += 1; |
|
2369
|
2826
|
|
|
|
|
5355
|
next; |
|
2370
|
|
|
|
|
|
|
} |
|
2371
|
|
|
|
|
|
|
|
|
2372
|
389
|
|
33
|
|
|
1177
|
my $length = $lengths->[$i] // confess "bad input"; |
|
2373
|
389
|
|
33
|
|
|
2060
|
my $dist = $distances->[$i] // confess "bad input"; |
|
2374
|
|
|
|
|
|
|
|
|
2375
|
389
|
100
|
|
|
|
1096
|
if ($dist >= $length) { # non-overlapping matches |
|
|
|
100
|
|
|
|
|
|
|
2376
|
271
|
|
|
|
|
2288
|
push @data, @data[$data_len - $dist .. $data_len - $dist + $length - 1]; |
|
2377
|
|
|
|
|
|
|
} |
|
2378
|
|
|
|
|
|
|
elsif ($dist == 1) { # run-length of last character |
|
2379
|
34
|
|
|
|
|
1595
|
push @data, ($data[-1]) x $length; |
|
2380
|
|
|
|
|
|
|
} |
|
2381
|
|
|
|
|
|
|
else { # overlapping matches |
|
2382
|
84
|
|
|
|
|
223
|
foreach my $j (1 .. $length) { |
|
2383
|
737
|
|
|
|
|
2273
|
push @data, $data[$data_len + $j - $dist - 1]; |
|
2384
|
|
|
|
|
|
|
} |
|
2385
|
|
|
|
|
|
|
} |
|
2386
|
|
|
|
|
|
|
|
|
2387
|
389
|
|
|
|
|
1410
|
$data_len += $length; |
|
2388
|
|
|
|
|
|
|
} |
|
2389
|
|
|
|
|
|
|
|
|
2390
|
90
|
|
|
|
|
1588
|
return \@data; |
|
2391
|
|
|
|
|
|
|
} |
|
2392
|
|
|
|
|
|
|
|
|
2393
|
|
|
|
|
|
|
################### |
|
2394
|
|
|
|
|
|
|
# LZSS Encoding |
|
2395
|
|
|
|
|
|
|
################### |
|
2396
|
|
|
|
|
|
|
|
|
2397
|
269
|
|
|
269
|
1
|
307337
|
sub lzss_encode ($str, %params) { |
|
|
269
|
|
|
|
|
896
|
|
|
|
269
|
|
|
|
|
887
|
|
|
|
269
|
|
|
|
|
577
|
|
|
2398
|
|
|
|
|
|
|
|
|
2399
|
269
|
100
|
|
|
|
1596
|
if (ref($str) ne '') { |
|
2400
|
140
|
|
|
|
|
525
|
return lzss_encode_symbolic($str, %params); |
|
2401
|
|
|
|
|
|
|
} |
|
2402
|
|
|
|
|
|
|
|
|
2403
|
129
|
|
66
|
|
|
965
|
my $min_len = $params{min_len} // $LZ_MIN_LEN; |
|
2404
|
129
|
|
66
|
|
|
681
|
my $max_len = $params{max_len} // $LZ_MAX_LEN; |
|
2405
|
129
|
|
66
|
|
|
696
|
my $max_dist = $params{max_dist} // $LZ_MAX_DIST; |
|
2406
|
129
|
|
33
|
|
|
671
|
my $max_chain_len = $params{max_chain_len} // $LZ_MAX_CHAIN_LEN; |
|
2407
|
|
|
|
|
|
|
|
|
2408
|
129
|
|
|
|
|
170400
|
my @symbols = unpack('C*', $str); |
|
2409
|
129
|
|
|
|
|
31533
|
my $end = $#symbols; |
|
2410
|
|
|
|
|
|
|
|
|
2411
|
129
|
|
|
|
|
435
|
my (@literals, @distances, @lengths, %table); |
|
2412
|
|
|
|
|
|
|
|
|
2413
|
129
|
|
|
|
|
689
|
for (my $la = 0 ; $la <= $end ;) { |
|
2414
|
|
|
|
|
|
|
|
|
2415
|
25792
|
|
|
|
|
42411
|
my $best_n = 1; |
|
2416
|
25792
|
|
|
|
|
40504
|
my $best_p = $la; |
|
2417
|
|
|
|
|
|
|
|
|
2418
|
25792
|
|
|
|
|
83218
|
my $lookahead = substr($str, $la, $min_len); |
|
2419
|
|
|
|
|
|
|
|
|
2420
|
25792
|
100
|
|
|
|
83558
|
if (exists $table{$lookahead}) { |
|
2421
|
8093
|
|
|
|
|
16508
|
foreach my $p (@{$table{$lookahead}}) { |
|
|
8093
|
|
|
|
|
25571
|
|
|
2422
|
|
|
|
|
|
|
|
|
2423
|
90156
|
50
|
|
|
|
212306
|
last if ($la - $p > $max_dist); |
|
2424
|
|
|
|
|
|
|
|
|
2425
|
90156
|
|
|
|
|
148098
|
my $n = $min_len; |
|
2426
|
|
|
|
|
|
|
|
|
2427
|
90156
|
|
100
|
|
|
4225926
|
++$n while ($la + $n <= $end and $symbols[$la + $n - 1] == $symbols[$p + $n - 1] and $n <= $max_len); |
|
|
|
|
100
|
|
|
|
|
|
2428
|
|
|
|
|
|
|
|
|
2429
|
90156
|
100
|
|
|
|
324009
|
if ($n > $best_n) { |
|
2430
|
13150
|
|
|
|
|
22728
|
$best_p = $p; |
|
2431
|
13150
|
|
|
|
|
24258
|
$best_n = $n; |
|
2432
|
13150
|
100
|
|
|
|
44201
|
last if ($best_n > $max_len); |
|
2433
|
|
|
|
|
|
|
} |
|
2434
|
|
|
|
|
|
|
} |
|
2435
|
|
|
|
|
|
|
} |
|
2436
|
|
|
|
|
|
|
|
|
2437
|
25792
|
100
|
|
|
|
61521
|
if ($best_n == 1) { |
|
2438
|
17699
|
|
|
|
|
64127
|
$table{$lookahead} = [$la]; |
|
2439
|
|
|
|
|
|
|
} |
|
2440
|
|
|
|
|
|
|
else { |
|
2441
|
|
|
|
|
|
|
|
|
2442
|
8093
|
|
|
|
|
29714
|
my $matched = substr($str, $la, $best_n); |
|
2443
|
|
|
|
|
|
|
|
|
2444
|
8093
|
|
|
|
|
32204
|
foreach my $i (0 .. $best_n - $min_len) { |
|
2445
|
153650
|
|
|
|
|
378256
|
my $key = substr($matched, $i, $min_len); |
|
2446
|
153650
|
|
|
|
|
261878
|
unshift @{$table{$key}}, $la + $i; |
|
|
153650
|
|
|
|
|
433728
|
|
|
2447
|
153650
|
100
|
|
|
|
256243
|
pop(@{$table{$key}}) if (@{$table{$key}} > $max_chain_len); |
|
|
64258
|
|
|
|
|
187065
|
|
|
|
153650
|
|
|
|
|
474389
|
|
|
2448
|
|
|
|
|
|
|
} |
|
2449
|
|
|
|
|
|
|
} |
|
2450
|
|
|
|
|
|
|
|
|
2451
|
25792
|
100
|
|
|
|
62601
|
if ($best_n == 1) { |
|
2452
|
17699
|
|
|
|
|
51735
|
$table{$lookahead} = [$la]; |
|
2453
|
|
|
|
|
|
|
} |
|
2454
|
|
|
|
|
|
|
|
|
2455
|
25792
|
100
|
|
|
|
64973
|
if ($best_n > $min_len) { |
|
|
|
100
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
|
|
2457
|
8085
|
|
|
|
|
20076
|
push @lengths, $best_n - 1; |
|
2458
|
8085
|
|
|
|
|
18488
|
push @distances, $la - $best_p; |
|
2459
|
8085
|
|
|
|
|
16569
|
push @literals, undef; |
|
2460
|
|
|
|
|
|
|
|
|
2461
|
8085
|
|
|
|
|
43764
|
$la += $best_n - 1; |
|
2462
|
|
|
|
|
|
|
} |
|
2463
|
|
|
|
|
|
|
elsif ($best_n == 1) { |
|
2464
|
17699
|
|
|
|
|
39822
|
push @lengths, 0; |
|
2465
|
17699
|
|
|
|
|
33068
|
push @distances, 0; |
|
2466
|
17699
|
|
|
|
|
69529
|
push @literals, $symbols[$la++]; |
|
2467
|
|
|
|
|
|
|
} |
|
2468
|
|
|
|
|
|
|
else { |
|
2469
|
|
|
|
|
|
|
|
|
2470
|
8
|
|
|
|
|
29
|
push @lengths, (0) x $best_n; |
|
2471
|
8
|
|
|
|
|
22
|
push @distances, (0) x $best_n; |
|
2472
|
8
|
|
|
|
|
36
|
push @literals, @symbols[$la .. $la + $best_n - 1]; |
|
2473
|
|
|
|
|
|
|
|
|
2474
|
8
|
|
|
|
|
33
|
$la += $best_n; |
|
2475
|
|
|
|
|
|
|
} |
|
2476
|
|
|
|
|
|
|
} |
|
2477
|
|
|
|
|
|
|
|
|
2478
|
129
|
|
|
|
|
63904
|
return (\@literals, \@distances, \@lengths); |
|
2479
|
|
|
|
|
|
|
} |
|
2480
|
|
|
|
|
|
|
|
|
2481
|
56
|
|
|
56
|
1
|
142
|
sub lzss_decode ($literals, $distances, $lengths) { |
|
|
56
|
|
|
|
|
112
|
|
|
|
56
|
|
|
|
|
110
|
|
|
|
56
|
|
|
|
|
114
|
|
|
|
56
|
|
|
|
|
102
|
|
|
2482
|
|
|
|
|
|
|
|
|
2483
|
56
|
|
|
|
|
198
|
my $data = ''; |
|
2484
|
56
|
|
|
|
|
144
|
my $data_len = 0; |
|
2485
|
|
|
|
|
|
|
|
|
2486
|
56
|
|
|
|
|
232
|
foreach my $i (0 .. $#$lengths) { |
|
2487
|
|
|
|
|
|
|
|
|
2488
|
11141
|
100
|
|
|
|
27253
|
if ($lengths->[$i] == 0) { |
|
2489
|
7344
|
|
|
|
|
17318
|
$data .= chr($literals->[$i]); |
|
2490
|
7344
|
|
|
|
|
13704
|
++$data_len; |
|
2491
|
7344
|
|
|
|
|
19731
|
next; |
|
2492
|
|
|
|
|
|
|
} |
|
2493
|
|
|
|
|
|
|
|
|
2494
|
3797
|
|
33
|
|
|
10559
|
my $length = $lengths->[$i] // confess "bad input"; |
|
2495
|
3797
|
|
33
|
|
|
10894
|
my $dist = $distances->[$i] // confess "bad input"; |
|
2496
|
|
|
|
|
|
|
|
|
2497
|
3797
|
100
|
|
|
|
10925
|
if ($dist >= $length) { # non-overlapping matches |
|
|
|
100
|
|
|
|
|
|
|
2498
|
3620
|
|
33
|
|
|
15044
|
$data .= substr($data, $data_len - $dist, $length) // confess "bad input"; |
|
2499
|
|
|
|
|
|
|
} |
|
2500
|
|
|
|
|
|
|
elsif ($dist == 1) { # run-length of last character |
|
2501
|
37
|
|
|
|
|
260
|
$data .= substr($data, -1) x $length; |
|
2502
|
|
|
|
|
|
|
} |
|
2503
|
|
|
|
|
|
|
else { # overlapping matches |
|
2504
|
140
|
|
|
|
|
452
|
foreach my $i (1 .. $length) { |
|
2505
|
2532
|
|
33
|
|
|
10391
|
$data .= substr($data, $data_len + $i - $dist - 1, 1) // confess "bad input"; |
|
2506
|
|
|
|
|
|
|
} |
|
2507
|
|
|
|
|
|
|
} |
|
2508
|
|
|
|
|
|
|
|
|
2509
|
3797
|
|
|
|
|
11183
|
$data_len += $length; |
|
2510
|
|
|
|
|
|
|
} |
|
2511
|
|
|
|
|
|
|
|
|
2512
|
56
|
|
|
|
|
7580
|
return $data; |
|
2513
|
|
|
|
|
|
|
} |
|
2514
|
|
|
|
|
|
|
|
|
2515
|
|
|
|
|
|
|
################### |
|
2516
|
|
|
|
|
|
|
# LZSSF Compression |
|
2517
|
|
|
|
|
|
|
################### |
|
2518
|
|
|
|
|
|
|
|
|
2519
|
19
|
|
|
19
|
1
|
43
|
sub lzss_encode_fast_symbolic ($symbols, %params) { |
|
|
19
|
|
|
|
|
38
|
|
|
|
19
|
|
|
|
|
35
|
|
|
|
19
|
|
|
|
|
30
|
|
|
2520
|
|
|
|
|
|
|
|
|
2521
|
19
|
50
|
|
|
|
78
|
if (ref($symbols) eq '') { |
|
2522
|
0
|
|
|
|
|
0
|
return lzss_encode_fast($symbols, %params); |
|
2523
|
|
|
|
|
|
|
} |
|
2524
|
|
|
|
|
|
|
|
|
2525
|
19
|
|
|
|
|
37
|
my $la = 0; |
|
2526
|
19
|
|
|
|
|
41
|
my $end = $#$symbols; |
|
2527
|
|
|
|
|
|
|
|
|
2528
|
19
|
|
33
|
|
|
108
|
my $min_len = $params{min_len} // $LZ_MIN_LEN; # minimum match length |
|
2529
|
19
|
|
33
|
|
|
81
|
my $max_len = $params{max_len} // $LZ_MAX_LEN; # maximum match length |
|
2530
|
19
|
|
33
|
|
|
93
|
my $max_dist = $params{max_dist} // $LZ_MAX_DIST; # maximum offset distance |
|
2531
|
|
|
|
|
|
|
|
|
2532
|
19
|
|
|
|
|
59
|
my (@literals, @distances, @lengths, %table); |
|
2533
|
|
|
|
|
|
|
|
|
2534
|
19
|
|
|
|
|
72
|
while ($la <= $end) { |
|
2535
|
|
|
|
|
|
|
|
|
2536
|
2765
|
|
|
|
|
3643
|
my $best_n = 1; |
|
2537
|
2765
|
|
|
|
|
3378
|
my $best_p = $la; |
|
2538
|
|
|
|
|
|
|
|
|
2539
|
2765
|
|
|
|
|
3645
|
my $upto = $la + $min_len - 1; |
|
2540
|
2765
|
100
|
|
|
|
5005
|
my $lookahead = join(' ', @{$symbols}[$la .. ($upto > $end ? $end : $upto)]); |
|
|
2765
|
|
|
|
|
5541
|
|
|
2541
|
|
|
|
|
|
|
|
|
2542
|
2765
|
100
|
66
|
|
|
6563
|
if (exists($table{$lookahead}) and $la - $table{$lookahead} <= $max_dist) { |
|
2543
|
|
|
|
|
|
|
|
|
2544
|
374
|
|
|
|
|
591
|
my $p = $table{$lookahead}; |
|
2545
|
374
|
|
|
|
|
534
|
my $n = $min_len; |
|
2546
|
|
|
|
|
|
|
|
|
2547
|
374
|
|
100
|
|
|
10443
|
++$n while ($la + $n <= $end and $symbols->[$la + $n - 1] == $symbols->[$p + $n - 1] and $n <= $max_len); |
|
|
|
|
66
|
|
|
|
|
|
2548
|
|
|
|
|
|
|
|
|
2549
|
374
|
|
|
|
|
538
|
$best_p = $p; |
|
2550
|
374
|
|
|
|
|
545
|
$best_n = $n; |
|
2551
|
|
|
|
|
|
|
} |
|
2552
|
|
|
|
|
|
|
|
|
2553
|
2765
|
|
|
|
|
4979
|
$table{$lookahead} = $la; |
|
2554
|
|
|
|
|
|
|
|
|
2555
|
2765
|
100
|
|
|
|
5122
|
if ($best_n > $min_len) { |
|
|
|
100
|
|
|
|
|
|
|
2556
|
|
|
|
|
|
|
|
|
2557
|
373
|
|
|
|
|
574
|
push @lengths, $best_n - 1; |
|
2558
|
373
|
|
|
|
|
528
|
push @distances, $la - $best_p; |
|
2559
|
373
|
|
|
|
|
491
|
push @literals, undef; |
|
2560
|
|
|
|
|
|
|
|
|
2561
|
373
|
|
|
|
|
786
|
$la += $best_n - 1; |
|
2562
|
|
|
|
|
|
|
} |
|
2563
|
|
|
|
|
|
|
elsif ($best_n == 1) { |
|
2564
|
2391
|
|
|
|
|
3232
|
push @lengths, 0; |
|
2565
|
2391
|
|
|
|
|
3161
|
push @distances, 0; |
|
2566
|
2391
|
|
|
|
|
6816
|
push @literals, $symbols->[$la++]; |
|
2567
|
|
|
|
|
|
|
} |
|
2568
|
|
|
|
|
|
|
else { |
|
2569
|
|
|
|
|
|
|
|
|
2570
|
1
|
|
|
|
|
3
|
push @lengths, (0) x $best_n; |
|
2571
|
1
|
|
|
|
|
4
|
push @distances, (0) x $best_n; |
|
2572
|
1
|
|
|
|
|
3
|
push @literals, @{$symbols}[$la .. $la + $best_n - 1]; |
|
|
1
|
|
|
|
|
3
|
|
|
2573
|
|
|
|
|
|
|
|
|
2574
|
1
|
|
|
|
|
4
|
$la += $best_n; |
|
2575
|
|
|
|
|
|
|
} |
|
2576
|
|
|
|
|
|
|
} |
|
2577
|
|
|
|
|
|
|
|
|
2578
|
19
|
|
|
|
|
587
|
return (\@literals, \@distances, \@lengths); |
|
2579
|
|
|
|
|
|
|
} |
|
2580
|
|
|
|
|
|
|
|
|
2581
|
104
|
|
|
104
|
1
|
281
|
sub lzss_encode_fast($str, %params) { |
|
|
104
|
|
|
|
|
444
|
|
|
|
104
|
|
|
|
|
350
|
|
|
|
104
|
|
|
|
|
260
|
|
|
2582
|
|
|
|
|
|
|
|
|
2583
|
104
|
100
|
|
|
|
599
|
if (ref($str) ne '') { |
|
2584
|
19
|
|
|
|
|
90
|
return lzss_encode_fast_symbolic($str, %params); |
|
2585
|
|
|
|
|
|
|
} |
|
2586
|
|
|
|
|
|
|
|
|
2587
|
85
|
|
|
|
|
171298
|
my @symbols = unpack('C*', $str); |
|
2588
|
|
|
|
|
|
|
|
|
2589
|
85
|
|
|
|
|
35930
|
my $la = 0; |
|
2590
|
85
|
|
|
|
|
252
|
my $end = $#symbols; |
|
2591
|
|
|
|
|
|
|
|
|
2592
|
85
|
|
33
|
|
|
726
|
my $min_len = $params{min_len} // $LZ_MIN_LEN; # minimum match length |
|
2593
|
85
|
|
33
|
|
|
520
|
my $max_len = $params{max_len} // $LZ_MAX_LEN; # maximum match length |
|
2594
|
85
|
|
33
|
|
|
450
|
my $max_dist = $params{max_dist} // $LZ_MAX_DIST; # maximum offset distance |
|
2595
|
|
|
|
|
|
|
|
|
2596
|
85
|
|
|
|
|
220
|
my (@literals, @distances, @lengths, %table); |
|
2597
|
|
|
|
|
|
|
|
|
2598
|
85
|
|
|
|
|
373
|
while ($la <= $end) { |
|
2599
|
|
|
|
|
|
|
|
|
2600
|
25645
|
|
|
|
|
43341
|
my $best_n = 1; |
|
2601
|
25645
|
|
|
|
|
44553
|
my $best_p = $la; |
|
2602
|
|
|
|
|
|
|
|
|
2603
|
25645
|
|
|
|
|
71849
|
my $lookahead = substr($str, $la, $min_len); |
|
2604
|
|
|
|
|
|
|
|
|
2605
|
25645
|
100
|
66
|
|
|
136006
|
if (exists($table{$lookahead}) and $la - $table{$lookahead} <= $max_dist) { |
|
2606
|
|
|
|
|
|
|
|
|
2607
|
10518
|
|
|
|
|
23002
|
my $p = $table{$lookahead}; |
|
2608
|
10518
|
|
|
|
|
18105
|
my $n = $min_len; |
|
2609
|
|
|
|
|
|
|
|
|
2610
|
10518
|
|
100
|
|
|
797210
|
++$n while ($la + $n <= $end and $symbols[$la + $n - 1] == $symbols[$p + $n - 1] and $n <= $max_len); |
|
|
|
|
100
|
|
|
|
|
|
2611
|
|
|
|
|
|
|
|
|
2612
|
10518
|
|
|
|
|
24632
|
$best_p = $p; |
|
2613
|
10518
|
|
|
|
|
23054
|
$best_n = $n; |
|
2614
|
|
|
|
|
|
|
} |
|
2615
|
|
|
|
|
|
|
|
|
2616
|
25645
|
|
|
|
|
74364
|
$table{$lookahead} = $la; |
|
2617
|
|
|
|
|
|
|
|
|
2618
|
25645
|
100
|
|
|
|
63656
|
if ($best_n > $min_len) { |
|
|
|
100
|
|
|
|
|
|
|
2619
|
|
|
|
|
|
|
|
|
2620
|
10512
|
|
|
|
|
22036
|
push @lengths, $best_n - 1; |
|
2621
|
10512
|
|
|
|
|
22703
|
push @distances, $la - $best_p; |
|
2622
|
10512
|
|
|
|
|
21719
|
push @literals, undef; |
|
2623
|
|
|
|
|
|
|
|
|
2624
|
10512
|
|
|
|
|
38126
|
$la += $best_n - 1; |
|
2625
|
|
|
|
|
|
|
} |
|
2626
|
|
|
|
|
|
|
elsif ($best_n == 1) { |
|
2627
|
15127
|
|
|
|
|
31641
|
push @lengths, 0; |
|
2628
|
15127
|
|
|
|
|
27795
|
push @distances, 0; |
|
2629
|
15127
|
|
|
|
|
59572
|
push @literals, $symbols[$la++]; |
|
2630
|
|
|
|
|
|
|
} |
|
2631
|
|
|
|
|
|
|
else { |
|
2632
|
|
|
|
|
|
|
|
|
2633
|
6
|
|
|
|
|
24
|
push @lengths, (0) x $best_n; |
|
2634
|
6
|
|
|
|
|
20
|
push @distances, (0) x $best_n; |
|
2635
|
6
|
|
|
|
|
31
|
push @literals, @symbols[$la .. $la + $best_n - 1]; |
|
2636
|
|
|
|
|
|
|
|
|
2637
|
6
|
|
|
|
|
27
|
$la += $best_n; |
|
2638
|
|
|
|
|
|
|
} |
|
2639
|
|
|
|
|
|
|
} |
|
2640
|
|
|
|
|
|
|
|
|
2641
|
85
|
|
|
|
|
47170
|
return (\@literals, \@distances, \@lengths); |
|
2642
|
|
|
|
|
|
|
} |
|
2643
|
|
|
|
|
|
|
|
|
2644
|
|
|
|
|
|
|
################################ |
|
2645
|
|
|
|
|
|
|
# LZ77 encoding, inspired by LZ4 |
|
2646
|
|
|
|
|
|
|
################################ |
|
2647
|
|
|
|
|
|
|
|
|
2648
|
130
|
|
|
130
|
1
|
216004
|
sub lz77_encode($chunk, $lzss_encoding_sub = \&lzss_encode) { |
|
|
130
|
|
|
|
|
290
|
|
|
|
130
|
|
|
|
|
367
|
|
|
|
130
|
|
|
|
|
279
|
|
|
2649
|
|
|
|
|
|
|
|
|
2650
|
130
|
|
|
|
|
386
|
local $LZ_MAX_LEN = ~0; # maximum match length |
|
2651
|
|
|
|
|
|
|
|
|
2652
|
130
|
|
|
|
|
649
|
my ($literals, $distances, $lengths) = $lzss_encoding_sub->($chunk); |
|
2653
|
|
|
|
|
|
|
|
|
2654
|
130
|
|
|
|
|
315
|
my $literals_end = $#{$literals}; |
|
|
130
|
|
|
|
|
327
|
|
|
2655
|
130
|
|
|
|
|
329
|
my (@symbols, @len_symbols, @match_symbols, @dist_symbols); |
|
2656
|
|
|
|
|
|
|
|
|
2657
|
130
|
|
|
|
|
508
|
for (my $i = 0 ; $i <= $literals_end ; ++$i) { |
|
2658
|
|
|
|
|
|
|
|
|
2659
|
4445
|
|
|
|
|
7715
|
my $j = $i; |
|
2660
|
4445
|
|
100
|
|
|
20533
|
while ($i <= $literals_end and defined($literals->[$i])) { |
|
2661
|
11215
|
|
|
|
|
43191
|
++$i; |
|
2662
|
|
|
|
|
|
|
} |
|
2663
|
|
|
|
|
|
|
|
|
2664
|
4445
|
|
|
|
|
8594
|
my $literals_length = $i - $j; |
|
2665
|
4445
|
|
100
|
|
|
11866
|
my $match_len = $lengths->[$i] // 0; |
|
2666
|
|
|
|
|
|
|
|
|
2667
|
4445
|
100
|
|
|
|
14003
|
push @match_symbols, (($literals_length >= 7 ? 7 : $literals_length) << 5) | ($match_len >= 31 ? 31 : $match_len); |
|
|
|
100
|
|
|
|
|
|
|
2668
|
|
|
|
|
|
|
|
|
2669
|
4445
|
|
|
|
|
7326
|
$literals_length -= 7; |
|
2670
|
4445
|
|
|
|
|
7050
|
$match_len -= 31; |
|
2671
|
|
|
|
|
|
|
|
|
2672
|
4445
|
|
|
|
|
10481
|
while ($literals_length >= 0) { |
|
2673
|
470
|
50
|
|
|
|
1267
|
push @len_symbols, ($literals_length >= 255 ? 255 : $literals_length); |
|
2674
|
470
|
|
|
|
|
1151
|
$literals_length -= 255; |
|
2675
|
|
|
|
|
|
|
} |
|
2676
|
|
|
|
|
|
|
|
|
2677
|
4445
|
100
|
|
|
|
10227
|
if ($i > $j) { |
|
2678
|
1715
|
|
|
|
|
4282
|
push @symbols, @{$literals}[$j .. $i - 1]; |
|
|
1715
|
|
|
|
|
16173
|
|
|
2679
|
|
|
|
|
|
|
} |
|
2680
|
|
|
|
|
|
|
|
|
2681
|
4445
|
|
|
|
|
11852
|
while ($match_len >= 0) { |
|
2682
|
658
|
100
|
|
|
|
1907
|
push @match_symbols, ($match_len >= 255 ? 255 : $match_len); |
|
2683
|
658
|
|
|
|
|
1820
|
$match_len -= 255; |
|
2684
|
|
|
|
|
|
|
} |
|
2685
|
|
|
|
|
|
|
|
|
2686
|
4445
|
|
100
|
|
|
18279
|
push @dist_symbols, $distances->[$i] // 0; |
|
2687
|
|
|
|
|
|
|
} |
|
2688
|
|
|
|
|
|
|
|
|
2689
|
130
|
|
|
|
|
4893
|
return (\@symbols, \@dist_symbols, \@len_symbols, \@match_symbols); |
|
2690
|
|
|
|
|
|
|
} |
|
2691
|
|
|
|
|
|
|
|
|
2692
|
|
|
|
|
|
|
*lz77_encode_symbolic = \&lz77_encode; |
|
2693
|
|
|
|
|
|
|
|
|
2694
|
44
|
|
|
44
|
1
|
173
|
sub lz77_decode($symbols, $dist_symbols, $len_symbols, $match_symbols) { |
|
|
44
|
|
|
|
|
121
|
|
|
|
44
|
|
|
|
|
80
|
|
|
|
44
|
|
|
|
|
84
|
|
|
|
44
|
|
|
|
|
82
|
|
|
|
44
|
|
|
|
|
71
|
|
|
2695
|
|
|
|
|
|
|
|
|
2696
|
44
|
|
|
|
|
96
|
my $data = ''; |
|
2697
|
44
|
|
|
|
|
105
|
my $data_len = 0; |
|
2698
|
|
|
|
|
|
|
|
|
2699
|
44
|
|
|
|
|
4605
|
my @symbols = @$symbols; |
|
2700
|
44
|
|
|
|
|
385
|
my @len_symbols = @$len_symbols; |
|
2701
|
44
|
|
|
|
|
4169
|
my @match_symbols = @$match_symbols; |
|
2702
|
44
|
|
|
|
|
2262
|
my @dist_symbols = @$dist_symbols; |
|
2703
|
|
|
|
|
|
|
|
|
2704
|
44
|
|
|
|
|
189
|
while (@symbols) { |
|
2705
|
|
|
|
|
|
|
|
|
2706
|
3899
|
|
33
|
|
|
13269
|
my $len_byte = shift(@match_symbols) // confess "bad input"; |
|
2707
|
|
|
|
|
|
|
|
|
2708
|
3899
|
|
|
|
|
12887
|
my $literals_length = $len_byte >> 5; |
|
2709
|
3899
|
|
|
|
|
9535
|
my $match_len = $len_byte & 0b11111; |
|
2710
|
|
|
|
|
|
|
|
|
2711
|
3899
|
100
|
|
|
|
11608
|
if ($literals_length == 7) { |
|
2712
|
324
|
|
|
|
|
679
|
while (1) { |
|
2713
|
324
|
|
33
|
|
|
1189
|
my $byte_len = shift(@len_symbols) // confess "bad input"; |
|
2714
|
324
|
|
|
|
|
864
|
$literals_length += $byte_len; |
|
2715
|
324
|
50
|
|
|
|
1271
|
last if $byte_len != 255; |
|
2716
|
|
|
|
|
|
|
} |
|
2717
|
|
|
|
|
|
|
} |
|
2718
|
|
|
|
|
|
|
|
|
2719
|
3899
|
100
|
|
|
|
11649
|
if ($literals_length > 0) { |
|
2720
|
1314
|
|
|
|
|
8248
|
$data .= pack("C*", splice(@symbols, 0, $literals_length)); |
|
2721
|
1314
|
|
|
|
|
6210
|
$data_len += $literals_length; |
|
2722
|
|
|
|
|
|
|
} |
|
2723
|
|
|
|
|
|
|
|
|
2724
|
3899
|
100
|
|
|
|
12811
|
if ($match_len == 31) { |
|
2725
|
608
|
|
|
|
|
1520
|
while (1) { |
|
2726
|
620
|
|
33
|
|
|
2518
|
my $byte_len = shift(@match_symbols) // confess "bad input"; |
|
2727
|
620
|
|
|
|
|
1918
|
$match_len += $byte_len; |
|
2728
|
620
|
100
|
|
|
|
2460
|
last if $byte_len != 255; |
|
2729
|
|
|
|
|
|
|
} |
|
2730
|
|
|
|
|
|
|
} |
|
2731
|
|
|
|
|
|
|
|
|
2732
|
3899
|
|
33
|
|
|
13432
|
my $dist = shift(@dist_symbols) // confess "bad input"; |
|
2733
|
|
|
|
|
|
|
|
|
2734
|
3899
|
100
|
|
|
|
11910
|
if ($dist >= $match_len) { # non-overlapping matches |
|
|
|
100
|
|
|
|
|
|
|
2735
|
3728
|
|
33
|
|
|
18295
|
$data .= substr($data, $data_len - $dist, $match_len) // confess "bad input"; |
|
2736
|
|
|
|
|
|
|
} |
|
2737
|
|
|
|
|
|
|
elsif ($dist == 1) { # run-length of last character |
|
2738
|
33
|
|
|
|
|
190
|
$data .= substr($data, -1) x $match_len; |
|
2739
|
|
|
|
|
|
|
} |
|
2740
|
|
|
|
|
|
|
else { # overlapping matches |
|
2741
|
138
|
|
|
|
|
542
|
foreach my $i (1 .. $match_len) { |
|
2742
|
2510
|
|
33
|
|
|
11281
|
$data .= substr($data, $data_len + $i - $dist - 1, 1) // confess "bad input"; |
|
2743
|
|
|
|
|
|
|
} |
|
2744
|
|
|
|
|
|
|
} |
|
2745
|
|
|
|
|
|
|
|
|
2746
|
3899
|
|
|
|
|
16811
|
$data_len += $match_len; |
|
2747
|
|
|
|
|
|
|
} |
|
2748
|
|
|
|
|
|
|
|
|
2749
|
44
|
|
|
|
|
9003
|
return $data; |
|
2750
|
|
|
|
|
|
|
} |
|
2751
|
|
|
|
|
|
|
|
|
2752
|
86
|
|
|
86
|
1
|
155
|
sub lz77_decode_symbolic($symbols, $dist_symbols, $len_symbols, $match_symbols) { |
|
|
86
|
|
|
|
|
147
|
|
|
|
86
|
|
|
|
|
175
|
|
|
|
86
|
|
|
|
|
184
|
|
|
|
86
|
|
|
|
|
154
|
|
|
|
86
|
|
|
|
|
150
|
|
|
2753
|
|
|
|
|
|
|
|
|
2754
|
86
|
|
|
|
|
168
|
my @data; |
|
2755
|
86
|
|
|
|
|
173
|
my $data_len = 0; |
|
2756
|
|
|
|
|
|
|
|
|
2757
|
86
|
|
|
|
|
1057
|
my @symbols = @$symbols; |
|
2758
|
86
|
|
|
|
|
226
|
my @len_symbols = @$len_symbols; |
|
2759
|
86
|
|
|
|
|
299
|
my @match_symbols = @$match_symbols; |
|
2760
|
86
|
|
|
|
|
321
|
my @dist_symbols = @$dist_symbols; |
|
2761
|
|
|
|
|
|
|
|
|
2762
|
86
|
|
|
|
|
264
|
while (@symbols) { |
|
2763
|
|
|
|
|
|
|
|
|
2764
|
546
|
|
33
|
|
|
1206
|
my $len_byte = shift(@match_symbols) // confess "bad input"; |
|
2765
|
|
|
|
|
|
|
|
|
2766
|
546
|
|
|
|
|
1139
|
my $literals_length = $len_byte >> 5; |
|
2767
|
546
|
|
|
|
|
989
|
my $match_len = $len_byte & 0b11111; |
|
2768
|
|
|
|
|
|
|
|
|
2769
|
546
|
100
|
|
|
|
1175
|
if ($literals_length == 7) { |
|
2770
|
146
|
|
|
|
|
198
|
while (1) { |
|
2771
|
146
|
|
33
|
|
|
294
|
my $byte_len = shift(@len_symbols) // confess "bad input"; |
|
2772
|
146
|
|
|
|
|
250
|
$literals_length += $byte_len; |
|
2773
|
146
|
50
|
|
|
|
387
|
last if $byte_len != 255; |
|
2774
|
|
|
|
|
|
|
} |
|
2775
|
|
|
|
|
|
|
} |
|
2776
|
|
|
|
|
|
|
|
|
2777
|
546
|
100
|
|
|
|
1245
|
if ($literals_length > 0) { |
|
2778
|
401
|
|
|
|
|
1774
|
push @data, splice(@symbols, 0, $literals_length); |
|
2779
|
401
|
|
|
|
|
1065
|
$data_len += $literals_length; |
|
2780
|
|
|
|
|
|
|
} |
|
2781
|
|
|
|
|
|
|
|
|
2782
|
546
|
100
|
|
|
|
1139
|
if ($match_len == 31) { |
|
2783
|
29
|
|
|
|
|
70
|
while (1) { |
|
2784
|
38
|
|
33
|
|
|
156
|
my $byte_len = shift(@match_symbols) // confess "bad input"; |
|
2785
|
38
|
|
|
|
|
85
|
$match_len += $byte_len; |
|
2786
|
38
|
100
|
|
|
|
143
|
last if $byte_len != 255; |
|
2787
|
|
|
|
|
|
|
} |
|
2788
|
|
|
|
|
|
|
} |
|
2789
|
|
|
|
|
|
|
|
|
2790
|
546
|
|
33
|
|
|
1173
|
my $dist = shift(@dist_symbols) // confess "bad input"; |
|
2791
|
|
|
|
|
|
|
|
|
2792
|
546
|
100
|
|
|
|
1313
|
if ($dist >= $match_len) { # non-overlapping matches |
|
|
|
100
|
|
|
|
|
|
|
2793
|
434
|
|
|
|
|
2557
|
push @data, @data[scalar(@data) - $dist .. scalar(@data) - $dist + $match_len - 1]; |
|
2794
|
|
|
|
|
|
|
} |
|
2795
|
|
|
|
|
|
|
elsif ($dist == 1) { # run-length of last character |
|
2796
|
33
|
|
|
|
|
1610
|
push @data, ($data[-1]) x $match_len; |
|
2797
|
|
|
|
|
|
|
} |
|
2798
|
|
|
|
|
|
|
else { # overlapping matches |
|
2799
|
79
|
|
|
|
|
237
|
foreach my $j (1 .. $match_len) { |
|
2800
|
703
|
|
|
|
|
2406
|
push @data, $data[$data_len + $j - $dist - 1]; |
|
2801
|
|
|
|
|
|
|
} |
|
2802
|
|
|
|
|
|
|
} |
|
2803
|
|
|
|
|
|
|
|
|
2804
|
546
|
|
|
|
|
1912
|
$data_len += $match_len; |
|
2805
|
|
|
|
|
|
|
} |
|
2806
|
|
|
|
|
|
|
|
|
2807
|
86
|
|
|
|
|
1436
|
return \@data; |
|
2808
|
|
|
|
|
|
|
} |
|
2809
|
|
|
|
|
|
|
|
|
2810
|
93
|
|
|
93
|
1
|
2482569
|
sub lz77_compress($chunk, $entropy_sub = \&create_huffman_entry, $lzss_encoding_sub = \&lzss_encode) { |
|
|
93
|
|
|
|
|
310
|
|
|
|
93
|
|
|
|
|
289
|
|
|
|
93
|
|
|
|
|
345
|
|
|
|
93
|
|
|
|
|
211
|
|
|
2811
|
93
|
|
|
|
|
403
|
my ($symbols, $dist_symbols, $len_symbols, $match_symbols) = lz77_encode($chunk, $lzss_encoding_sub); |
|
2812
|
93
|
|
|
|
|
391
|
$entropy_sub->($symbols) . $entropy_sub->($len_symbols) . $entropy_sub->($match_symbols) . obh_encode($dist_symbols, $entropy_sub); |
|
2813
|
|
|
|
|
|
|
} |
|
2814
|
|
|
|
|
|
|
|
|
2815
|
|
|
|
|
|
|
*lz77_compress_symbolic = \&lz77_compress; |
|
2816
|
|
|
|
|
|
|
|
|
2817
|
46
|
|
|
46
|
1
|
178
|
sub lz77_decompress($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
46
|
|
|
|
|
135
|
|
|
|
46
|
|
|
|
|
145
|
|
|
|
46
|
|
|
|
|
79
|
|
|
2818
|
|
|
|
|
|
|
|
|
2819
|
46
|
100
|
|
|
|
192
|
if (ref($fh) eq '') { |
|
2820
|
23
|
50
|
|
|
|
520
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2821
|
23
|
|
|
|
|
107
|
return __SUB__->($fh2, $entropy_sub); |
|
2822
|
|
|
|
|
|
|
} |
|
2823
|
|
|
|
|
|
|
|
|
2824
|
23
|
|
|
|
|
97
|
my $symbols = $entropy_sub->($fh); |
|
2825
|
23
|
|
|
|
|
127
|
my $len_symbols = $entropy_sub->($fh); |
|
2826
|
23
|
|
|
|
|
130
|
my $match_symbols = $entropy_sub->($fh); |
|
2827
|
23
|
|
|
|
|
168
|
my $dist_symbols = obh_decode($fh, $entropy_sub); |
|
2828
|
|
|
|
|
|
|
|
|
2829
|
23
|
|
|
|
|
130
|
lz77_decode($symbols, $dist_symbols, $len_symbols, $match_symbols); |
|
2830
|
|
|
|
|
|
|
} |
|
2831
|
|
|
|
|
|
|
|
|
2832
|
134
|
|
|
134
|
1
|
317
|
sub lz77_decompress_symbolic($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
134
|
|
|
|
|
292
|
|
|
|
134
|
|
|
|
|
284
|
|
|
|
134
|
|
|
|
|
218
|
|
|
2833
|
|
|
|
|
|
|
|
|
2834
|
134
|
100
|
|
|
|
453
|
if (ref($fh) eq '') { |
|
2835
|
64
|
50
|
|
|
|
1140
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2836
|
64
|
|
|
|
|
228
|
return __SUB__->($fh2, $entropy_sub); |
|
2837
|
|
|
|
|
|
|
} |
|
2838
|
|
|
|
|
|
|
|
|
2839
|
70
|
|
|
|
|
229
|
my $symbols = $entropy_sub->($fh); |
|
2840
|
70
|
|
|
|
|
264
|
my $len_symbols = $entropy_sub->($fh); |
|
2841
|
70
|
|
|
|
|
230
|
my $match_symbols = $entropy_sub->($fh); |
|
2842
|
70
|
|
|
|
|
417
|
my $dist_symbols = obh_decode($fh, $entropy_sub); |
|
2843
|
|
|
|
|
|
|
|
|
2844
|
70
|
|
|
|
|
350
|
lz77_decode_symbolic($symbols, $dist_symbols, $len_symbols, $match_symbols); |
|
2845
|
|
|
|
|
|
|
} |
|
2846
|
|
|
|
|
|
|
|
|
2847
|
|
|
|
|
|
|
######################### |
|
2848
|
|
|
|
|
|
|
# LZSS + DEFLATE encoding |
|
2849
|
|
|
|
|
|
|
######################### |
|
2850
|
|
|
|
|
|
|
|
|
2851
|
97
|
|
|
97
|
1
|
2411206
|
sub lzss_compress($chunk, $entropy_sub = \&create_huffman_entry, $lzss_encoding_sub = \&lzss_encode) { |
|
|
97
|
|
|
|
|
289
|
|
|
|
97
|
|
|
|
|
287
|
|
|
|
97
|
|
|
|
|
373
|
|
|
|
97
|
|
|
|
|
216
|
|
|
2852
|
97
|
|
|
|
|
446
|
my ($literals, $distances, $lengths) = $lzss_encoding_sub->($chunk); |
|
2853
|
97
|
|
|
|
|
537
|
deflate_encode($literals, $distances, $lengths, $entropy_sub); |
|
2854
|
|
|
|
|
|
|
} |
|
2855
|
|
|
|
|
|
|
|
|
2856
|
|
|
|
|
|
|
*lzss_compress_symbolic = \&lzss_compress; |
|
2857
|
|
|
|
|
|
|
|
|
2858
|
68
|
|
|
68
|
1
|
251
|
sub lzss_decompress($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
68
|
|
|
|
|
169
|
|
|
|
68
|
|
|
|
|
191
|
|
|
|
68
|
|
|
|
|
114
|
|
|
2859
|
|
|
|
|
|
|
|
|
2860
|
68
|
100
|
|
|
|
419
|
if (ref($fh) eq '') { |
|
2861
|
34
|
50
|
|
|
|
1028
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2862
|
34
|
|
|
|
|
177
|
return __SUB__->($fh2, $entropy_sub); |
|
2863
|
|
|
|
|
|
|
} |
|
2864
|
|
|
|
|
|
|
|
|
2865
|
34
|
|
|
|
|
164
|
my ($literals, $distances, $lengths) = deflate_decode($fh, $entropy_sub); |
|
2866
|
34
|
|
|
|
|
231
|
lzss_decode($literals, $distances, $lengths); |
|
2867
|
|
|
|
|
|
|
} |
|
2868
|
|
|
|
|
|
|
|
|
2869
|
126
|
|
|
126
|
1
|
619
|
sub lzss_decompress_symbolic($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
126
|
|
|
|
|
223
|
|
|
|
126
|
|
|
|
|
220
|
|
|
|
126
|
|
|
|
|
227
|
|
|
2870
|
|
|
|
|
|
|
|
|
2871
|
126
|
100
|
|
|
|
380
|
if (ref($fh) eq '') { |
|
2872
|
63
|
50
|
|
|
|
1072
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2873
|
63
|
|
|
|
|
225
|
return __SUB__->($fh2, $entropy_sub); |
|
2874
|
|
|
|
|
|
|
} |
|
2875
|
|
|
|
|
|
|
|
|
2876
|
63
|
|
|
|
|
250
|
my ($literals, $distances, $lengths) = deflate_decode($fh, $entropy_sub); |
|
2877
|
63
|
|
|
|
|
308
|
lzss_decode_symbolic($literals, $distances, $lengths); |
|
2878
|
|
|
|
|
|
|
} |
|
2879
|
|
|
|
|
|
|
|
|
2880
|
|
|
|
|
|
|
######################################### |
|
2881
|
|
|
|
|
|
|
# LZB -- LZSS with byte-oriented encoding |
|
2882
|
|
|
|
|
|
|
######################################### |
|
2883
|
|
|
|
|
|
|
|
|
2884
|
28
|
|
|
28
|
1
|
430499
|
sub lzb_compress ($chunk, $lzss_encoding_sub = \&lzss_encode) { |
|
|
28
|
|
|
|
|
92
|
|
|
|
28
|
|
|
|
|
110
|
|
|
|
28
|
|
|
|
|
60
|
|
|
2885
|
|
|
|
|
|
|
|
|
2886
|
28
|
|
|
|
|
56
|
my ($literals, $distances, $lengths) = do { |
|
2887
|
28
|
|
|
|
|
74
|
local $LZ_MAX_DIST = (1 << 16) - 1; |
|
2888
|
28
|
|
|
|
|
82
|
local $LZ_MAX_LEN = ~0; |
|
2889
|
28
|
|
|
|
|
122
|
$lzss_encoding_sub->($chunk); |
|
2890
|
|
|
|
|
|
|
}; |
|
2891
|
|
|
|
|
|
|
|
|
2892
|
28
|
|
|
|
|
73
|
my $literals_end = $#{$literals}; |
|
|
28
|
|
|
|
|
80
|
|
|
2893
|
28
|
|
|
|
|
70
|
my $data = ''; |
|
2894
|
|
|
|
|
|
|
|
|
2895
|
28
|
|
|
|
|
108
|
for (my $i = 0 ; $i <= $literals_end ; ++$i) { |
|
2896
|
|
|
|
|
|
|
|
|
2897
|
3601
|
|
|
|
|
5605
|
my $j = $i; |
|
2898
|
3601
|
|
100
|
|
|
14354
|
while ($i <= $literals_end and defined($literals->[$i])) { |
|
2899
|
5515
|
|
|
|
|
20894
|
++$i; |
|
2900
|
|
|
|
|
|
|
} |
|
2901
|
|
|
|
|
|
|
|
|
2902
|
3601
|
|
|
|
|
6025
|
my $literals_length = $i - $j; |
|
2903
|
3601
|
|
100
|
|
|
7906
|
my $match_len = $lengths->[$i] // 0; |
|
2904
|
|
|
|
|
|
|
|
|
2905
|
3601
|
100
|
|
|
|
15245
|
$data .= chr((($literals_length >= 7 ? 7 : $literals_length) << 5) | ($match_len >= 31 ? 31 : $match_len)); |
|
|
|
100
|
|
|
|
|
|
|
2906
|
|
|
|
|
|
|
|
|
2907
|
3601
|
|
|
|
|
5894
|
$literals_length -= 7; |
|
2908
|
3601
|
|
|
|
|
5145
|
$match_len -= 31; |
|
2909
|
|
|
|
|
|
|
|
|
2910
|
3601
|
|
|
|
|
7451
|
while ($literals_length >= 0) { |
|
2911
|
235
|
50
|
|
|
|
592
|
$data .= $literals_length >= 255 ? "\xff" : chr($literals_length); |
|
2912
|
235
|
|
|
|
|
591
|
$literals_length -= 255; |
|
2913
|
|
|
|
|
|
|
} |
|
2914
|
|
|
|
|
|
|
|
|
2915
|
3601
|
100
|
|
|
|
6766
|
if ($i > $j) { |
|
2916
|
1109
|
|
|
|
|
2195
|
$data .= pack('C*', @{$literals}[$j .. $i - 1]); |
|
|
1109
|
|
|
|
|
3499
|
|
|
2917
|
|
|
|
|
|
|
} |
|
2918
|
|
|
|
|
|
|
|
|
2919
|
3601
|
|
|
|
|
9714
|
while ($match_len >= 0) { |
|
2920
|
611
|
100
|
|
|
|
1533
|
$data .= $match_len >= 255 ? "\xff" : chr($match_len); |
|
2921
|
611
|
|
|
|
|
1496
|
$match_len -= 255; |
|
2922
|
|
|
|
|
|
|
} |
|
2923
|
|
|
|
|
|
|
|
|
2924
|
3601
|
|
100
|
|
|
16011
|
$data .= pack('B*', sprintf('%016b', $distances->[$i] // 0)); |
|
2925
|
|
|
|
|
|
|
} |
|
2926
|
|
|
|
|
|
|
|
|
2927
|
28
|
|
|
|
|
166
|
return fibonacci_encode([length $data]) . $data; |
|
2928
|
|
|
|
|
|
|
} |
|
2929
|
|
|
|
|
|
|
|
|
2930
|
56
|
|
|
56
|
1
|
125
|
sub lzb_decompress($fh) { |
|
|
56
|
|
|
|
|
111
|
|
|
|
56
|
|
|
|
|
87
|
|
|
2931
|
|
|
|
|
|
|
|
|
2932
|
56
|
100
|
|
|
|
179
|
if (ref($fh) eq '') { |
|
2933
|
28
|
50
|
|
|
|
600
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
2934
|
28
|
|
|
|
|
137
|
return __SUB__->($fh2); |
|
2935
|
|
|
|
|
|
|
} |
|
2936
|
|
|
|
|
|
|
|
|
2937
|
28
|
|
|
|
|
124
|
my $data = ''; |
|
2938
|
28
|
|
|
|
|
63
|
my $search_window = ''; |
|
2939
|
28
|
|
|
|
|
55
|
my $search_window_size = 1 << 16; |
|
2940
|
|
|
|
|
|
|
|
|
2941
|
28
|
|
33
|
|
|
98
|
my $block_size = fibonacci_decode($fh)->[0] // confess "decompression error"; |
|
2942
|
|
|
|
|
|
|
|
|
2943
|
28
|
|
33
|
|
|
1190
|
read($fh, (my $block), $block_size) // confess "Read error: $!"; |
|
2944
|
|
|
|
|
|
|
|
|
2945
|
28
|
|
|
|
|
157
|
while ($block ne '') { |
|
2946
|
|
|
|
|
|
|
|
|
2947
|
3601
|
|
|
|
|
10913
|
my $len_byte = ord substr($block, 0, 1, ''); |
|
2948
|
|
|
|
|
|
|
|
|
2949
|
3601
|
|
|
|
|
9083
|
my $literals_length = $len_byte >> 5; |
|
2950
|
3601
|
|
|
|
|
8969
|
my $match_len = $len_byte & 0b11111; |
|
2951
|
|
|
|
|
|
|
|
|
2952
|
3601
|
100
|
|
|
|
10206
|
if ($literals_length == 7) { |
|
2953
|
235
|
|
|
|
|
476
|
while (1) { |
|
2954
|
235
|
|
|
|
|
783
|
my $byte_len = ord substr($block, 0, 1, ''); |
|
2955
|
235
|
|
|
|
|
534
|
$literals_length += $byte_len; |
|
2956
|
235
|
50
|
|
|
|
974
|
last if $byte_len != 255; |
|
2957
|
|
|
|
|
|
|
} |
|
2958
|
|
|
|
|
|
|
} |
|
2959
|
|
|
|
|
|
|
|
|
2960
|
3601
|
100
|
|
|
|
10442
|
if ($literals_length > 0) { |
|
2961
|
1109
|
|
|
|
|
3794
|
$search_window .= substr($block, 0, $literals_length, ''); |
|
2962
|
|
|
|
|
|
|
} |
|
2963
|
|
|
|
|
|
|
|
|
2964
|
3601
|
100
|
|
|
|
10061
|
if ($match_len == 31) { |
|
2965
|
601
|
|
|
|
|
1183
|
while (1) { |
|
2966
|
611
|
|
|
|
|
1834
|
my $byte_len = ord substr($block, 0, 1, ''); |
|
2967
|
611
|
|
|
|
|
1493
|
$match_len += $byte_len; |
|
2968
|
611
|
100
|
|
|
|
2389
|
last if $byte_len != 255; |
|
2969
|
|
|
|
|
|
|
} |
|
2970
|
|
|
|
|
|
|
} |
|
2971
|
|
|
|
|
|
|
|
|
2972
|
3601
|
|
|
|
|
14998
|
my $offset = oct('0b' . unpack('B*', substr($block, 0, 2, ''))); |
|
2973
|
|
|
|
|
|
|
|
|
2974
|
3601
|
100
|
|
|
|
10659
|
if ($offset >= $match_len) { # non-overlapping matches |
|
|
|
100
|
|
|
|
|
|
|
2975
|
3467
|
|
|
|
|
13292
|
$search_window .= substr($search_window, length($search_window) - $offset, $match_len); |
|
2976
|
|
|
|
|
|
|
} |
|
2977
|
|
|
|
|
|
|
elsif ($offset == 1) { # run-length of last character |
|
2978
|
20
|
|
|
|
|
111
|
$search_window .= substr($search_window, -1) x $match_len; |
|
2979
|
|
|
|
|
|
|
} |
|
2980
|
|
|
|
|
|
|
else { # overlapping matches |
|
2981
|
114
|
|
|
|
|
505
|
foreach my $i (1 .. $match_len) { |
|
2982
|
2393
|
|
|
|
|
8216
|
$search_window .= substr($search_window, length($search_window) - $offset, 1); |
|
2983
|
|
|
|
|
|
|
} |
|
2984
|
|
|
|
|
|
|
} |
|
2985
|
|
|
|
|
|
|
|
|
2986
|
3601
|
|
|
|
|
12144
|
$data .= substr($search_window, -($match_len + $literals_length)); |
|
2987
|
3601
|
50
|
|
|
|
21947
|
$search_window = substr($search_window, -$search_window_size) if (length($search_window) > 2 * $search_window_size); |
|
2988
|
|
|
|
|
|
|
} |
|
2989
|
|
|
|
|
|
|
|
|
2990
|
28
|
|
|
|
|
555
|
return $data; |
|
2991
|
|
|
|
|
|
|
} |
|
2992
|
|
|
|
|
|
|
|
|
2993
|
|
|
|
|
|
|
################################################################ |
|
2994
|
|
|
|
|
|
|
# Encode a list of symbols, using offset bits and huffman coding |
|
2995
|
|
|
|
|
|
|
################################################################ |
|
2996
|
|
|
|
|
|
|
|
|
2997
|
254
|
|
|
254
|
1
|
625
|
sub obh_encode ($distances, $entropy_sub = \&create_huffman_entry) { |
|
|
254
|
|
|
|
|
502
|
|
|
|
254
|
|
|
|
|
623
|
|
|
|
254
|
|
|
|
|
451
|
|
|
2998
|
|
|
|
|
|
|
|
|
2999
|
254
|
|
100
|
|
|
2041
|
my $max_dist = max(@$distances) // 0; |
|
3000
|
254
|
|
|
|
|
1013
|
my ($DISTANCE_SYMBOLS) = make_deflate_tables($max_dist, 0); |
|
3001
|
|
|
|
|
|
|
|
|
3002
|
254
|
|
|
|
|
632
|
my @symbols; |
|
3003
|
254
|
|
|
|
|
574
|
my $offset_bits = ''; |
|
3004
|
|
|
|
|
|
|
|
|
3005
|
254
|
|
|
|
|
637
|
foreach my $dist (@$distances) { |
|
3006
|
|
|
|
|
|
|
|
|
3007
|
10250
|
|
|
|
|
23915
|
my $i = find_deflate_index($dist, $DISTANCE_SYMBOLS); |
|
3008
|
10250
|
|
|
|
|
25750
|
my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$i]}; |
|
|
10250
|
|
|
|
|
24125
|
|
|
3009
|
|
|
|
|
|
|
|
|
3010
|
10250
|
|
|
|
|
20691
|
push @symbols, $i; |
|
3011
|
|
|
|
|
|
|
|
|
3012
|
10250
|
100
|
|
|
|
24347
|
if ($bits > 0) { |
|
3013
|
9699
|
|
|
|
|
40155
|
$offset_bits .= sprintf('%0*b', $bits, $dist - $min); |
|
3014
|
|
|
|
|
|
|
} |
|
3015
|
|
|
|
|
|
|
} |
|
3016
|
|
|
|
|
|
|
|
|
3017
|
254
|
|
|
|
|
1115
|
fibonacci_encode([$max_dist]) . $entropy_sub->(\@symbols) . pack('B*', $offset_bits); |
|
3018
|
|
|
|
|
|
|
} |
|
3019
|
|
|
|
|
|
|
|
|
3020
|
306
|
|
|
306
|
1
|
650
|
sub obh_decode ($fh, $entropy_sub = \&decode_huffman_entry) { |
|
|
306
|
|
|
|
|
586
|
|
|
|
306
|
|
|
|
|
610
|
|
|
|
306
|
|
|
|
|
503
|
|
|
3021
|
|
|
|
|
|
|
|
|
3022
|
306
|
100
|
|
|
|
1065
|
if (ref($fh) eq '') { |
|
3023
|
52
|
50
|
|
|
|
857
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
3024
|
52
|
|
|
|
|
172
|
return __SUB__->($fh2, $entropy_sub); |
|
3025
|
|
|
|
|
|
|
} |
|
3026
|
|
|
|
|
|
|
|
|
3027
|
254
|
|
|
|
|
830
|
my $max_dist = fibonacci_decode($fh)->[0]; |
|
3028
|
254
|
|
|
|
|
823
|
my ($DISTANCE_SYMBOLS) = make_deflate_tables($max_dist, 0); |
|
3029
|
|
|
|
|
|
|
|
|
3030
|
254
|
|
|
|
|
978
|
my $symbols = $entropy_sub->($fh); |
|
3031
|
254
|
|
|
|
|
745
|
my $bits_len = 0; |
|
3032
|
|
|
|
|
|
|
|
|
3033
|
254
|
|
|
|
|
679
|
foreach my $i (@$symbols) { |
|
3034
|
10250
|
|
|
|
|
28203
|
$bits_len += $DISTANCE_SYMBOLS->[$i][1]; |
|
3035
|
|
|
|
|
|
|
} |
|
3036
|
|
|
|
|
|
|
|
|
3037
|
254
|
|
|
|
|
802
|
my $bits = read_bits($fh, $bits_len); |
|
3038
|
|
|
|
|
|
|
|
|
3039
|
254
|
|
|
|
|
575
|
my @distances; |
|
3040
|
254
|
|
|
|
|
666
|
foreach my $i (@$symbols) { |
|
3041
|
10250
|
|
|
|
|
45265
|
push @distances, $DISTANCE_SYMBOLS->[$i][0] + oct('0b' . substr($bits, 0, $DISTANCE_SYMBOLS->[$i][1], '')); |
|
3042
|
|
|
|
|
|
|
} |
|
3043
|
|
|
|
|
|
|
|
|
3044
|
254
|
|
|
|
|
4963
|
return \@distances; |
|
3045
|
|
|
|
|
|
|
} |
|
3046
|
|
|
|
|
|
|
|
|
3047
|
|
|
|
|
|
|
################# |
|
3048
|
|
|
|
|
|
|
# LZW Compression |
|
3049
|
|
|
|
|
|
|
################# |
|
3050
|
|
|
|
|
|
|
|
|
3051
|
25
|
|
|
25
|
1
|
56
|
sub lzw_encode ($uncompressed) { |
|
|
25
|
|
|
|
|
59
|
|
|
|
25
|
|
|
|
|
62
|
|
|
3052
|
|
|
|
|
|
|
|
|
3053
|
|
|
|
|
|
|
# Build the dictionary |
|
3054
|
25
|
|
|
|
|
63
|
my $dict_size = 256; |
|
3055
|
25
|
|
|
|
|
70
|
my %dictionary; |
|
3056
|
|
|
|
|
|
|
|
|
3057
|
25
|
|
|
|
|
149
|
foreach my $i (0 .. $dict_size - 1) { |
|
3058
|
6400
|
|
|
|
|
18257
|
$dictionary{chr($i)} = $i; |
|
3059
|
|
|
|
|
|
|
} |
|
3060
|
|
|
|
|
|
|
|
|
3061
|
25
|
|
|
|
|
78
|
my $w = ''; |
|
3062
|
25
|
|
|
|
|
69
|
my @result; |
|
3063
|
|
|
|
|
|
|
|
|
3064
|
25
|
|
|
|
|
25720
|
foreach my $c (split(//, $uncompressed)) { |
|
3065
|
36992
|
|
|
|
|
103894
|
my $wc = $w . $c; |
|
3066
|
36992
|
100
|
|
|
|
104308
|
if (exists $dictionary{$wc}) { |
|
3067
|
28074
|
|
|
|
|
84045
|
$w = $wc; |
|
3068
|
|
|
|
|
|
|
} |
|
3069
|
|
|
|
|
|
|
else { |
|
3070
|
8918
|
|
|
|
|
26910
|
push @result, $dictionary{$w}; |
|
3071
|
|
|
|
|
|
|
|
|
3072
|
|
|
|
|
|
|
# Add wc to the dictionary |
|
3073
|
8918
|
|
|
|
|
29162
|
$dictionary{$wc} = $dict_size++; |
|
3074
|
8918
|
|
|
|
|
26097
|
$w = $c; |
|
3075
|
|
|
|
|
|
|
} |
|
3076
|
|
|
|
|
|
|
} |
|
3077
|
|
|
|
|
|
|
|
|
3078
|
|
|
|
|
|
|
# Output the code for w |
|
3079
|
25
|
100
|
|
|
|
14083
|
if ($w ne '') { |
|
3080
|
24
|
|
|
|
|
92
|
push @result, $dictionary{$w}; |
|
3081
|
|
|
|
|
|
|
} |
|
3082
|
|
|
|
|
|
|
|
|
3083
|
25
|
|
|
|
|
4319
|
return \@result; |
|
3084
|
|
|
|
|
|
|
} |
|
3085
|
|
|
|
|
|
|
|
|
3086
|
25
|
|
|
25
|
1
|
86
|
sub lzw_decode ($compressed) { |
|
|
25
|
|
|
|
|
66
|
|
|
|
25
|
|
|
|
|
53
|
|
|
3087
|
|
|
|
|
|
|
|
|
3088
|
25
|
100
|
|
|
|
118
|
@$compressed || return ''; |
|
3089
|
|
|
|
|
|
|
|
|
3090
|
|
|
|
|
|
|
# Build the dictionary |
|
3091
|
24
|
|
|
|
|
52
|
my $dict_size = 256; |
|
3092
|
24
|
|
|
|
|
365
|
my @dictionary = map { chr($_) } 0 .. $dict_size - 1; |
|
|
6144
|
|
|
|
|
12666
|
|
|
3093
|
|
|
|
|
|
|
|
|
3094
|
24
|
|
|
|
|
320
|
my $w = $dictionary[$compressed->[0]]; |
|
3095
|
24
|
|
|
|
|
73
|
my $result = $w; |
|
3096
|
|
|
|
|
|
|
|
|
3097
|
24
|
|
|
|
|
107
|
foreach my $j (1 .. $#$compressed) { |
|
3098
|
8918
|
|
|
|
|
18489
|
my $k = $compressed->[$j]; |
|
3099
|
|
|
|
|
|
|
|
|
3100
|
8918
|
50
|
|
|
|
21326
|
my $entry = |
|
|
|
100
|
|
|
|
|
|
|
3101
|
|
|
|
|
|
|
($k < $dict_size) ? $dictionary[$k] |
|
3102
|
|
|
|
|
|
|
: ($k == $dict_size) ? ($w . substr($w, 0, 1)) |
|
3103
|
|
|
|
|
|
|
: confess "Bad compressed k: $k"; |
|
3104
|
|
|
|
|
|
|
|
|
3105
|
8918
|
|
|
|
|
13628
|
$result .= $entry; |
|
3106
|
|
|
|
|
|
|
|
|
3107
|
|
|
|
|
|
|
# Add w+entry[0] to the dictionary |
|
3108
|
8918
|
|
|
|
|
26862
|
push @dictionary, $w . substr($entry, 0, 1); |
|
3109
|
8918
|
|
|
|
|
11187
|
++$dict_size; |
|
3110
|
8918
|
|
|
|
|
17111
|
$w = $entry; |
|
3111
|
|
|
|
|
|
|
} |
|
3112
|
|
|
|
|
|
|
|
|
3113
|
24
|
|
|
|
|
7992
|
return $result; |
|
3114
|
|
|
|
|
|
|
} |
|
3115
|
|
|
|
|
|
|
|
|
3116
|
25
|
|
|
25
|
1
|
1113618
|
sub lzw_compress ($chunk, $enc_method = \&abc_encode) { |
|
|
25
|
|
|
|
|
80
|
|
|
|
25
|
|
|
|
|
82
|
|
|
|
25
|
|
|
|
|
55
|
|
|
3117
|
25
|
|
|
|
|
185
|
$enc_method->(lzw_encode($chunk)); |
|
3118
|
|
|
|
|
|
|
} |
|
3119
|
|
|
|
|
|
|
|
|
3120
|
50
|
|
|
50
|
1
|
172
|
sub lzw_decompress ($fh, $dec_method = \&abc_decode) { |
|
|
50
|
|
|
|
|
108
|
|
|
|
50
|
|
|
|
|
139
|
|
|
|
50
|
|
|
|
|
86
|
|
|
3121
|
|
|
|
|
|
|
|
|
3122
|
50
|
100
|
|
|
|
205
|
if (ref($fh) eq '') { |
|
3123
|
25
|
50
|
|
|
|
663
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
3124
|
25
|
|
|
|
|
107
|
return __SUB__->($fh2, $dec_method); |
|
3125
|
|
|
|
|
|
|
} |
|
3126
|
|
|
|
|
|
|
|
|
3127
|
25
|
|
|
|
|
194
|
lzw_decode($dec_method->($fh)); |
|
3128
|
|
|
|
|
|
|
} |
|
3129
|
|
|
|
|
|
|
|
|
3130
|
|
|
|
|
|
|
################################### |
|
3131
|
|
|
|
|
|
|
# CRC-32 Pure Perl implementation |
|
3132
|
|
|
|
|
|
|
################################### |
|
3133
|
|
|
|
|
|
|
|
|
3134
|
|
|
|
|
|
|
sub _create_crc32_table { |
|
3135
|
4
|
|
|
4
|
|
6
|
my @table; |
|
3136
|
4
|
|
|
|
|
15
|
for my $i (0 .. 255) { |
|
3137
|
1024
|
|
|
|
|
1181
|
my $k = $i; |
|
3138
|
1024
|
|
|
|
|
1577
|
for (0 .. 7) { |
|
3139
|
8192
|
100
|
|
|
|
11672
|
if ($k & 1) { |
|
3140
|
4096
|
|
|
|
|
5566
|
$k >>= 1; |
|
3141
|
4096
|
|
|
|
|
5827
|
$k ^= 0xedb88320; |
|
3142
|
|
|
|
|
|
|
} |
|
3143
|
|
|
|
|
|
|
else { |
|
3144
|
4096
|
|
|
|
|
5642
|
$k >>= 1; |
|
3145
|
|
|
|
|
|
|
} |
|
3146
|
|
|
|
|
|
|
} |
|
3147
|
1024
|
|
|
|
|
2669
|
push(@table, $k & 0xffffffff); |
|
3148
|
|
|
|
|
|
|
} |
|
3149
|
4
|
|
|
|
|
22
|
return \@table; |
|
3150
|
|
|
|
|
|
|
} |
|
3151
|
|
|
|
|
|
|
|
|
3152
|
119
|
|
|
119
|
1
|
4412
|
sub crc32($str, $crc = 0) { |
|
|
119
|
|
|
|
|
510
|
|
|
|
119
|
|
|
|
|
311
|
|
|
|
119
|
|
|
|
|
248
|
|
|
3153
|
119
|
|
|
|
|
265
|
state $crc_table = _create_crc32_table(); |
|
3154
|
119
|
|
|
|
|
314
|
$crc &= 0xffffffff; |
|
3155
|
119
|
|
|
|
|
315
|
$crc ^= 0xffffffff; |
|
3156
|
119
|
|
|
|
|
60895
|
foreach my $c (unpack("C*", $str)) { |
|
3157
|
212008
|
|
|
|
|
579458
|
$crc = (($crc >> 8) ^ $crc_table->[($crc & 0xff) ^ $c]); |
|
3158
|
|
|
|
|
|
|
} |
|
3159
|
119
|
|
|
|
|
28779
|
return (($crc & 0xffffffff) ^ 0xffffffff); |
|
3160
|
|
|
|
|
|
|
} |
|
3161
|
|
|
|
|
|
|
|
|
3162
|
57
|
|
|
57
|
1
|
1179
|
sub adler32($str, $adler = 1) { |
|
|
57
|
|
|
|
|
182
|
|
|
|
57
|
|
|
|
|
146
|
|
|
|
57
|
|
|
|
|
91
|
|
|
3163
|
|
|
|
|
|
|
|
|
3164
|
|
|
|
|
|
|
# Reference: |
|
3165
|
|
|
|
|
|
|
# https://datatracker.ietf.org/doc/html/rfc1950#section-9 |
|
3166
|
|
|
|
|
|
|
|
|
3167
|
57
|
|
|
|
|
167
|
my $s1 = $adler & 0xffff; |
|
3168
|
57
|
|
|
|
|
141
|
my $s2 = ($adler >> 16) & 0xffff; |
|
3169
|
|
|
|
|
|
|
|
|
3170
|
57
|
|
|
|
|
1040
|
foreach my $c (unpack('C*', $str)) { |
|
3171
|
5186
|
|
|
|
|
9175
|
$s1 = ($s1 + $c) % 65521; |
|
3172
|
5186
|
|
|
|
|
10563
|
$s2 = ($s2 + $s1) % 65521; |
|
3173
|
|
|
|
|
|
|
} |
|
3174
|
57
|
|
|
|
|
655
|
return (($s2 << 16) + $s1); |
|
3175
|
|
|
|
|
|
|
} |
|
3176
|
|
|
|
|
|
|
|
|
3177
|
|
|
|
|
|
|
############################# |
|
3178
|
|
|
|
|
|
|
# Bzip2 compression |
|
3179
|
|
|
|
|
|
|
############################# |
|
3180
|
|
|
|
|
|
|
|
|
3181
|
12
|
|
|
12
|
|
25
|
sub _bzip2_encode_code_lengths($dict) { |
|
|
12
|
|
|
|
|
24
|
|
|
|
12
|
|
|
|
|
20
|
|
|
3182
|
12
|
|
|
|
|
25
|
my @lengths; |
|
3183
|
|
|
|
|
|
|
|
|
3184
|
12
|
|
50
|
|
|
358
|
foreach my $symbol (0 .. max(keys %$dict) // 0) { |
|
3185
|
247
|
50
|
|
|
|
566
|
if (exists($dict->{$symbol})) { |
|
3186
|
247
|
|
|
|
|
592
|
push @lengths, length($dict->{$symbol}); |
|
3187
|
|
|
|
|
|
|
} |
|
3188
|
|
|
|
|
|
|
else { |
|
3189
|
0
|
|
|
|
|
0
|
confess "Incomplete Huffman tree not supported"; |
|
3190
|
0
|
|
|
|
|
0
|
push @lengths, 0; |
|
3191
|
|
|
|
|
|
|
} |
|
3192
|
|
|
|
|
|
|
} |
|
3193
|
|
|
|
|
|
|
|
|
3194
|
12
|
|
|
|
|
68
|
my $deltas = deltas(\@lengths); |
|
3195
|
|
|
|
|
|
|
|
|
3196
|
12
|
50
|
|
|
|
46
|
$VERBOSE && say STDERR "Code lengths: (@lengths)"; |
|
3197
|
12
|
50
|
|
|
|
43
|
$VERBOSE && say STDERR "Code lengths deltas: (@$deltas)"; |
|
3198
|
|
|
|
|
|
|
|
|
3199
|
12
|
|
|
|
|
39
|
my $bitstring = int2bits(shift(@$deltas), 5) . '0'; |
|
3200
|
|
|
|
|
|
|
|
|
3201
|
12
|
|
|
|
|
46
|
foreach my $d (@$deltas) { |
|
3202
|
235
|
100
|
|
|
|
705
|
$bitstring .= (($d > 0) ? ('10' x $d) : ('11' x abs($d))) . '0'; |
|
3203
|
|
|
|
|
|
|
} |
|
3204
|
|
|
|
|
|
|
|
|
3205
|
12
|
50
|
|
|
|
42
|
$VERBOSE && say STDERR "Deltas bitstring: $bitstring"; |
|
3206
|
|
|
|
|
|
|
|
|
3207
|
12
|
|
|
|
|
103
|
return $bitstring; |
|
3208
|
|
|
|
|
|
|
} |
|
3209
|
|
|
|
|
|
|
|
|
3210
|
26
|
|
|
26
|
1
|
186304
|
sub bzip2_compress($fh) { |
|
|
26
|
|
|
|
|
80
|
|
|
|
26
|
|
|
|
|
47
|
|
|
3211
|
|
|
|
|
|
|
|
|
3212
|
26
|
100
|
|
|
|
113
|
if (ref($fh) eq '') { |
|
3213
|
13
|
50
|
|
|
|
246
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
3214
|
13
|
|
|
|
|
74
|
return __SUB__->($fh2); |
|
3215
|
|
|
|
|
|
|
} |
|
3216
|
|
|
|
|
|
|
|
|
3217
|
13
|
|
|
|
|
34
|
my $level = 9; |
|
3218
|
|
|
|
|
|
|
|
|
3219
|
|
|
|
|
|
|
# There is a CRC32 issue on some non-compressible inputs, when using very large chunk sizes |
|
3220
|
|
|
|
|
|
|
## my $CHUNK_SIZE = 100_000 * $level; |
|
3221
|
13
|
|
|
|
|
33
|
my $CHUNK_SIZE = 1 << 17; |
|
3222
|
|
|
|
|
|
|
|
|
3223
|
13
|
|
|
|
|
41
|
my $compressed = "BZh" . $level; |
|
3224
|
|
|
|
|
|
|
|
|
3225
|
13
|
|
|
|
|
42
|
state $block_header_bitstring = unpack("B48", "1AY&SY"); |
|
3226
|
13
|
|
|
|
|
33
|
state $block_footer_bitstring = unpack("B48", "\27rE8P\x90"); |
|
3227
|
|
|
|
|
|
|
|
|
3228
|
13
|
|
|
|
|
29
|
my $bitstring = ''; |
|
3229
|
13
|
|
|
|
|
29
|
my $stream_crc32 = 0; |
|
3230
|
|
|
|
|
|
|
|
|
3231
|
13
|
|
|
|
|
117
|
while (read($fh, (my $chunk), $CHUNK_SIZE)) { |
|
3232
|
|
|
|
|
|
|
|
|
3233
|
12
|
|
|
|
|
30
|
$bitstring .= $block_header_bitstring; |
|
3234
|
|
|
|
|
|
|
|
|
3235
|
12
|
|
|
|
|
1425
|
my $crc32 = crc32(pack('b*', unpack('B*', $chunk))); |
|
3236
|
12
|
50
|
|
|
|
955
|
$VERBOSE && say STDERR "CRC32: $crc32"; |
|
3237
|
|
|
|
|
|
|
|
|
3238
|
12
|
|
|
|
|
53
|
$crc32 = oct('0b' . int2bits_lsb($crc32, 32)); |
|
3239
|
12
|
50
|
|
|
|
45
|
$VERBOSE && say STDERR "Bzip2-CRC32: $crc32"; |
|
3240
|
|
|
|
|
|
|
|
|
3241
|
12
|
|
|
|
|
62
|
$stream_crc32 = ($crc32 ^ (0xffffffff & ((0xffffffff & ($stream_crc32 << 1)) | (($stream_crc32 >> 31) & 0x1)))) & 0xffffffff; |
|
3242
|
|
|
|
|
|
|
|
|
3243
|
12
|
|
|
|
|
76
|
$bitstring .= int2bits($crc32, 32); |
|
3244
|
12
|
|
|
|
|
44
|
$bitstring .= '0'; # not randomized |
|
3245
|
|
|
|
|
|
|
|
|
3246
|
12
|
|
|
|
|
67
|
my $rle4 = rle4_encode($chunk); |
|
3247
|
12
|
|
|
|
|
56
|
my ($bwt, $bwt_idx) = bwt_encode(symbols2string($rle4)); |
|
3248
|
|
|
|
|
|
|
|
|
3249
|
12
|
|
|
|
|
64
|
$bitstring .= int2bits($bwt_idx, 24); |
|
3250
|
|
|
|
|
|
|
|
|
3251
|
12
|
|
|
|
|
81
|
my ($mtf, $alphabet) = mtf_encode($bwt); |
|
3252
|
12
|
50
|
|
|
|
52
|
$VERBOSE && say STDERR "Alphabet: (@$alphabet)"; |
|
3253
|
|
|
|
|
|
|
|
|
3254
|
12
|
|
|
|
|
57
|
$bitstring .= unpack('B*', encode_alphabet_256($alphabet)); |
|
3255
|
|
|
|
|
|
|
|
|
3256
|
12
|
|
|
|
|
38
|
my @zrle = reverse @{zrle_encode([reverse @$mtf])}; |
|
|
12
|
|
|
|
|
2131
|
|
|
3257
|
|
|
|
|
|
|
|
|
3258
|
12
|
|
|
|
|
2812
|
my $eob = scalar(@$alphabet) + 1; # end-of-block symbol |
|
3259
|
12
|
50
|
|
|
|
52
|
$VERBOSE && say STDERR "EOB symbol: $eob"; |
|
3260
|
12
|
|
|
|
|
32
|
push @zrle, $eob; |
|
3261
|
|
|
|
|
|
|
|
|
3262
|
12
|
|
|
|
|
1955
|
my ($dict) = huffman_from_symbols([@zrle, 0 .. $eob - 1]); |
|
3263
|
12
|
|
|
|
|
2667
|
my $num_sels = int(sprintf('%.0f', 0.5 + (scalar(@zrle) / 50))); # ceil(|zrle| / 50) |
|
3264
|
12
|
50
|
|
|
|
68
|
$VERBOSE && say STDERR "Number of selectors: $num_sels"; |
|
3265
|
|
|
|
|
|
|
|
|
3266
|
12
|
|
|
|
|
47
|
$bitstring .= int2bits(2, 3); |
|
3267
|
12
|
|
|
|
|
43
|
$bitstring .= int2bits($num_sels, 15); |
|
3268
|
12
|
|
|
|
|
227
|
$bitstring .= '0' x $num_sels; |
|
3269
|
|
|
|
|
|
|
|
|
3270
|
12
|
|
|
|
|
76
|
$bitstring .= _bzip2_encode_code_lengths($dict) x 2; |
|
3271
|
12
|
|
|
|
|
89
|
$bitstring .= join('', @{$dict}{@zrle}); |
|
|
12
|
|
|
|
|
3115
|
|
|
3272
|
|
|
|
|
|
|
|
|
3273
|
12
|
|
|
|
|
9994
|
$compressed .= pack('B*', substr($bitstring, 0, length($bitstring) - (length($bitstring) % 8), '')); |
|
3274
|
|
|
|
|
|
|
} |
|
3275
|
|
|
|
|
|
|
|
|
3276
|
13
|
|
|
|
|
56
|
$bitstring .= $block_footer_bitstring; |
|
3277
|
13
|
|
|
|
|
49
|
$bitstring .= int2bits($stream_crc32, 32); |
|
3278
|
13
|
|
|
|
|
446
|
$compressed .= pack('B*', $bitstring); |
|
3279
|
|
|
|
|
|
|
|
|
3280
|
13
|
|
|
|
|
308
|
return $compressed; |
|
3281
|
|
|
|
|
|
|
} |
|
3282
|
|
|
|
|
|
|
|
|
3283
|
|
|
|
|
|
|
################################# |
|
3284
|
|
|
|
|
|
|
# Bzip2 decompression |
|
3285
|
|
|
|
|
|
|
################################# |
|
3286
|
|
|
|
|
|
|
|
|
3287
|
46
|
|
|
46
|
1
|
1137
|
sub bzip2_decompress($fh) { |
|
|
46
|
|
|
|
|
136
|
|
|
|
46
|
|
|
|
|
97
|
|
|
3288
|
|
|
|
|
|
|
|
|
3289
|
46
|
100
|
|
|
|
181
|
if (ref($fh) eq '') { |
|
3290
|
23
|
50
|
|
|
|
367
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
3291
|
23
|
|
|
|
|
106
|
return __SUB__->($fh2); |
|
3292
|
|
|
|
|
|
|
} |
|
3293
|
|
|
|
|
|
|
|
|
3294
|
23
|
|
|
|
|
57
|
state $MaxHuffmanBits = 20; |
|
3295
|
23
|
|
|
|
|
87
|
my $decompressed = ''; |
|
3296
|
|
|
|
|
|
|
|
|
3297
|
23
|
|
|
|
|
164
|
while (!eof($fh)) { |
|
3298
|
|
|
|
|
|
|
|
|
3299
|
26
|
|
|
|
|
64
|
my $buffer = ''; |
|
3300
|
|
|
|
|
|
|
|
|
3301
|
26
|
50
|
33
|
|
|
128
|
(bytes2int($fh, 2) == 0x425a and getc($fh) eq 'h') |
|
3302
|
|
|
|
|
|
|
or confess "Not a valid Bzip2 archive"; |
|
3303
|
|
|
|
|
|
|
|
|
3304
|
26
|
|
|
|
|
124
|
my $level = getc($fh); |
|
3305
|
|
|
|
|
|
|
|
|
3306
|
26
|
50
|
|
|
|
193
|
if ($level !~ /^[1-9]\z/) { |
|
3307
|
0
|
|
|
|
|
0
|
confess "Invalid level: $level"; |
|
3308
|
|
|
|
|
|
|
} |
|
3309
|
|
|
|
|
|
|
|
|
3310
|
26
|
50
|
|
|
|
82
|
$VERBOSE && say STDERR "Compression level: $level"; |
|
3311
|
|
|
|
|
|
|
|
|
3312
|
26
|
|
|
|
|
73
|
my $stream_crc32 = 0; |
|
3313
|
|
|
|
|
|
|
|
|
3314
|
26
|
|
|
|
|
107
|
while (!eof($fh)) { |
|
3315
|
|
|
|
|
|
|
|
|
3316
|
50
|
|
|
|
|
228
|
my $block_magic = pack "B48", join('', map { read_bit($fh, \$buffer) } 1 .. 48); |
|
|
2400
|
|
|
|
|
4540
|
|
|
3317
|
|
|
|
|
|
|
|
|
3318
|
50
|
100
|
|
|
|
415
|
if ($block_magic eq "1AY&SY") { # BlockHeader |
|
|
|
50
|
|
|
|
|
|
|
3319
|
24
|
50
|
|
|
|
76
|
$VERBOSE && say STDERR "Block header detected"; |
|
3320
|
|
|
|
|
|
|
|
|
3321
|
24
|
|
|
|
|
91
|
my $crc32 = bits2int($fh, 32, \$buffer); |
|
3322
|
24
|
50
|
|
|
|
97
|
$VERBOSE && say STDERR "CRC32 = $crc32"; |
|
3323
|
|
|
|
|
|
|
|
|
3324
|
24
|
|
|
|
|
129
|
$stream_crc32 = ($crc32 ^ (0xffffffff & ((0xffffffff & ($stream_crc32 << 1)) | (($stream_crc32 >> 31) & 0x1)))) & 0xffffffff; |
|
3325
|
|
|
|
|
|
|
|
|
3326
|
24
|
|
|
|
|
88
|
my $randomized = read_bit($fh, \$buffer); |
|
3327
|
24
|
50
|
|
|
|
122
|
$randomized == 0 or confess "randomized not supported"; |
|
3328
|
|
|
|
|
|
|
|
|
3329
|
24
|
|
|
|
|
73
|
my $bwt_idx = bits2int($fh, 24, \$buffer); |
|
3330
|
24
|
50
|
|
|
|
77
|
$VERBOSE && say STDERR "BWT index: $bwt_idx"; |
|
3331
|
|
|
|
|
|
|
|
|
3332
|
24
|
|
|
|
|
47
|
my @alphabet; |
|
3333
|
24
|
|
|
|
|
68
|
my $l1 = bits2int($fh, 16, \$buffer); |
|
3334
|
24
|
|
|
|
|
73
|
for my $i (0 .. 15) { |
|
3335
|
384
|
100
|
|
|
|
956
|
if ($l1 & (0x8000 >> $i)) { |
|
3336
|
78
|
|
|
|
|
247
|
my $l2 = bits2int($fh, 16, \$buffer); |
|
3337
|
78
|
|
|
|
|
195
|
for my $j (0 .. 15) { |
|
3338
|
1248
|
100
|
|
|
|
3063
|
if ($l2 & (0x8000 >> $j)) { |
|
3339
|
300
|
|
|
|
|
695
|
push @alphabet, 16 * $i + $j; |
|
3340
|
|
|
|
|
|
|
} |
|
3341
|
|
|
|
|
|
|
} |
|
3342
|
|
|
|
|
|
|
} |
|
3343
|
|
|
|
|
|
|
} |
|
3344
|
|
|
|
|
|
|
|
|
3345
|
24
|
50
|
|
|
|
68
|
$VERBOSE && say STDERR "MTF alphabet: (@alphabet)"; |
|
3346
|
|
|
|
|
|
|
|
|
3347
|
24
|
|
|
|
|
73
|
my $num_trees = bits2int($fh, 3, \$buffer); |
|
3348
|
24
|
50
|
|
|
|
92
|
$VERBOSE && say STDERR "Number or trees: $num_trees"; |
|
3349
|
|
|
|
|
|
|
|
|
3350
|
24
|
|
|
|
|
75
|
my $num_sels = bits2int($fh, 15, \$buffer); |
|
3351
|
24
|
50
|
|
|
|
73
|
$VERBOSE && say STDERR "Number of selectors: $num_sels"; |
|
3352
|
|
|
|
|
|
|
|
|
3353
|
24
|
|
|
|
|
47
|
my @idxs; |
|
3354
|
24
|
|
|
|
|
72
|
for (1 .. $num_sels) { |
|
3355
|
239
|
|
|
|
|
379
|
my $i = 0; |
|
3356
|
239
|
|
|
|
|
472
|
while (read_bit($fh, \$buffer)) { |
|
3357
|
2
|
|
|
|
|
6
|
$i += 1; |
|
3358
|
2
|
50
|
|
|
|
9
|
($i < $num_trees) or confess "error"; |
|
3359
|
|
|
|
|
|
|
} |
|
3360
|
239
|
|
|
|
|
629
|
push @idxs, $i; |
|
3361
|
|
|
|
|
|
|
} |
|
3362
|
|
|
|
|
|
|
|
|
3363
|
24
|
|
|
|
|
168
|
my $sels = mtf_decode(\@idxs, [0 .. $num_trees - 1]); |
|
3364
|
24
|
50
|
|
|
|
103
|
$VERBOSE && say STDERR "Selectors: (@$sels)"; |
|
3365
|
|
|
|
|
|
|
|
|
3366
|
24
|
|
|
|
|
67
|
my $num_syms = scalar(@alphabet) + 2; |
|
3367
|
|
|
|
|
|
|
|
|
3368
|
24
|
|
|
|
|
46
|
my @trees; |
|
3369
|
24
|
|
|
|
|
91
|
for (1 .. $num_trees) { |
|
3370
|
48
|
|
|
|
|
110
|
my @clens; |
|
3371
|
48
|
|
|
|
|
143
|
my $clen = bits2int($fh, 5, \$buffer); |
|
3372
|
48
|
|
|
|
|
129
|
for (1 .. $num_syms) { |
|
3373
|
696
|
|
|
|
|
1233
|
while (1) { |
|
3374
|
|
|
|
|
|
|
|
|
3375
|
1109
|
50
|
33
|
|
|
4095
|
($clen > 0 and $clen <= $MaxHuffmanBits) or confess "invalid code length: $clen"; |
|
3376
|
|
|
|
|
|
|
|
|
3377
|
1109
|
100
|
|
|
|
2290
|
if (not read_bit($fh, \$buffer)) { |
|
3378
|
696
|
|
|
|
|
1410
|
last; |
|
3379
|
|
|
|
|
|
|
} |
|
3380
|
|
|
|
|
|
|
|
|
3381
|
413
|
100
|
|
|
|
1010
|
$clen -= read_bit($fh, \$buffer) ? 1 : -1; |
|
3382
|
|
|
|
|
|
|
} |
|
3383
|
|
|
|
|
|
|
|
|
3384
|
696
|
|
|
|
|
2052
|
push @clens, $clen; |
|
3385
|
|
|
|
|
|
|
} |
|
3386
|
48
|
|
|
|
|
112
|
push @trees, \@clens; |
|
3387
|
48
|
50
|
|
|
|
184
|
$VERBOSE && say STDERR "Code lengths: (@clens)"; |
|
3388
|
|
|
|
|
|
|
} |
|
3389
|
|
|
|
|
|
|
|
|
3390
|
24
|
|
|
|
|
62
|
foreach my $tree (@trees) { |
|
3391
|
48
|
|
|
|
|
192
|
my $maxLen = max(@$tree); |
|
3392
|
48
|
|
|
|
|
98
|
my $sum = 1 << $maxLen; |
|
3393
|
48
|
|
|
|
|
111
|
for my $clen (@$tree) { |
|
3394
|
696
|
|
|
|
|
1429
|
$sum -= (1 << $maxLen) >> $clen; |
|
3395
|
|
|
|
|
|
|
} |
|
3396
|
48
|
50
|
|
|
|
176
|
$sum == 0 or confess "incomplete tree not supported: (@$tree)"; |
|
3397
|
|
|
|
|
|
|
} |
|
3398
|
|
|
|
|
|
|
|
|
3399
|
24
|
|
|
|
|
68
|
my @huffman_trees = map { (huffman_from_code_lengths($_))[1] } @trees; |
|
|
48
|
|
|
|
|
165
|
|
|
3400
|
|
|
|
|
|
|
|
|
3401
|
24
|
|
|
|
|
72
|
my $eob = @alphabet + 1; |
|
3402
|
|
|
|
|
|
|
|
|
3403
|
24
|
|
|
|
|
47
|
my @zrle; |
|
3404
|
24
|
|
|
|
|
60
|
my $code = ''; |
|
3405
|
|
|
|
|
|
|
|
|
3406
|
24
|
|
|
|
|
45
|
my $sel_idx = 0; |
|
3407
|
24
|
|
|
|
|
66
|
my $tree = $huffman_trees[$sels->[$sel_idx]]; |
|
3408
|
24
|
|
|
|
|
58
|
my $decoded = 50; |
|
3409
|
|
|
|
|
|
|
|
|
3410
|
24
|
|
|
|
|
133
|
while (!eof($fh)) { |
|
3411
|
42314
|
|
|
|
|
72158
|
$code .= read_bit($fh, \$buffer); |
|
3412
|
|
|
|
|
|
|
|
|
3413
|
42314
|
50
|
|
|
|
81774
|
if (length($code) > $MaxHuffmanBits) { |
|
3414
|
0
|
|
|
|
|
0
|
confess "[!] Something went wrong: length of code `$code` is > $MaxHuffmanBits."; |
|
3415
|
|
|
|
|
|
|
} |
|
3416
|
|
|
|
|
|
|
|
|
3417
|
42314
|
100
|
|
|
|
115238
|
if (exists($tree->{$code})) { |
|
3418
|
|
|
|
|
|
|
|
|
3419
|
11076
|
|
|
|
|
20492
|
my $sym = $tree->{$code}; |
|
3420
|
|
|
|
|
|
|
|
|
3421
|
11076
|
100
|
|
|
|
20298
|
if ($sym == $eob) { # end of block marker |
|
3422
|
24
|
50
|
|
|
|
72
|
$VERBOSE && say STDERR "EOB detected: $sym"; |
|
3423
|
24
|
|
|
|
|
73
|
last; |
|
3424
|
|
|
|
|
|
|
} |
|
3425
|
|
|
|
|
|
|
|
|
3426
|
11052
|
|
|
|
|
19372
|
push @zrle, $sym; |
|
3427
|
11052
|
|
|
|
|
14878
|
$code = ''; |
|
3428
|
|
|
|
|
|
|
|
|
3429
|
11052
|
100
|
|
|
|
33975
|
if (--$decoded <= 0) { |
|
3430
|
215
|
50
|
|
|
|
673
|
if (++$sel_idx <= $#$sels) { |
|
3431
|
215
|
|
|
|
|
619
|
$tree = $huffman_trees[$sels->[$sel_idx]]; |
|
3432
|
|
|
|
|
|
|
} |
|
3433
|
|
|
|
|
|
|
else { |
|
3434
|
0
|
|
|
|
|
0
|
confess "No more selectors"; # should not happen |
|
3435
|
|
|
|
|
|
|
} |
|
3436
|
215
|
|
|
|
|
658
|
$decoded = 50; |
|
3437
|
|
|
|
|
|
|
} |
|
3438
|
|
|
|
|
|
|
} |
|
3439
|
|
|
|
|
|
|
} |
|
3440
|
|
|
|
|
|
|
|
|
3441
|
24
|
|
|
|
|
50
|
my @mtf = reverse @{zrle_decode([reverse @zrle])}; |
|
|
24
|
|
|
|
|
3441
|
|
|
3442
|
24
|
|
|
|
|
2003
|
my $bwt = symbols2string mtf_decode(\@mtf, \@alphabet); |
|
3443
|
|
|
|
|
|
|
|
|
3444
|
24
|
|
|
|
|
688
|
my $rle4 = string2symbols bwt_decode($bwt, $bwt_idx); |
|
3445
|
24
|
|
|
|
|
116
|
my $data = rle4_decode($rle4); |
|
3446
|
24
|
|
|
|
|
94
|
my $dec = symbols2string($data); |
|
3447
|
|
|
|
|
|
|
|
|
3448
|
24
|
|
|
|
|
1312
|
my $new_crc32 = oct('0b' . int2bits_lsb(crc32(pack('b*', unpack('B*', $dec))), 32)); |
|
3449
|
|
|
|
|
|
|
|
|
3450
|
24
|
50
|
|
|
|
230
|
$VERBOSE && say STDERR "Computed CRC32: $new_crc32"; |
|
3451
|
|
|
|
|
|
|
|
|
3452
|
24
|
50
|
|
|
|
118
|
if ($crc32 != $new_crc32) { |
|
3453
|
0
|
|
|
|
|
0
|
confess "CRC32 error: $crc32 (stored) != $new_crc32 (actual)"; |
|
3454
|
|
|
|
|
|
|
} |
|
3455
|
|
|
|
|
|
|
|
|
3456
|
24
|
|
|
|
|
8533
|
$decompressed .= $dec; |
|
3457
|
|
|
|
|
|
|
} |
|
3458
|
|
|
|
|
|
|
elsif ($block_magic eq "\27rE8P\x90") { # BlockFooter |
|
3459
|
26
|
50
|
|
|
|
88
|
$VERBOSE && say STDERR "Block footer detected"; |
|
3460
|
26
|
|
|
|
|
120
|
my $stored_stream_crc32 = bits2int($fh, 32, \$buffer); |
|
3461
|
26
|
50
|
|
|
|
102
|
$VERBOSE && say STDERR "Stream CRC: $stored_stream_crc32"; |
|
3462
|
|
|
|
|
|
|
|
|
3463
|
26
|
50
|
|
|
|
129
|
if ($stored_stream_crc32 != $stream_crc32) { |
|
3464
|
0
|
|
|
|
|
0
|
confess "Stream CRC32 error: $stored_stream_crc32 (stored) != $stream_crc32 (actual)"; |
|
3465
|
|
|
|
|
|
|
} |
|
3466
|
|
|
|
|
|
|
|
|
3467
|
26
|
|
|
|
|
83
|
$buffer = ''; |
|
3468
|
26
|
|
|
|
|
91
|
last; |
|
3469
|
|
|
|
|
|
|
} |
|
3470
|
|
|
|
|
|
|
else { |
|
3471
|
0
|
|
|
|
|
0
|
confess "Unknown block magic: $block_magic"; |
|
3472
|
|
|
|
|
|
|
} |
|
3473
|
|
|
|
|
|
|
} |
|
3474
|
|
|
|
|
|
|
|
|
3475
|
26
|
50
|
|
|
|
251
|
$VERBOSE && say STDERR "End of container"; |
|
3476
|
|
|
|
|
|
|
} |
|
3477
|
|
|
|
|
|
|
|
|
3478
|
23
|
|
|
|
|
405
|
return $decompressed; |
|
3479
|
|
|
|
|
|
|
} |
|
3480
|
|
|
|
|
|
|
|
|
3481
|
|
|
|
|
|
|
######################################## |
|
3482
|
|
|
|
|
|
|
# GZIP compressor |
|
3483
|
|
|
|
|
|
|
######################################## |
|
3484
|
|
|
|
|
|
|
|
|
3485
|
108
|
|
|
108
|
|
237
|
sub _code_length_encoding ($dict) { |
|
|
108
|
|
|
|
|
242
|
|
|
|
108
|
|
|
|
|
242
|
|
|
3486
|
|
|
|
|
|
|
|
|
3487
|
108
|
|
|
|
|
224
|
my @lengths; |
|
3488
|
|
|
|
|
|
|
|
|
3489
|
108
|
|
100
|
|
|
1939
|
foreach my $symbol (0 .. max(keys %$dict) // 0) { |
|
3490
|
14760
|
100
|
|
|
|
26605
|
if (exists($dict->{$symbol})) { |
|
3491
|
1057
|
|
|
|
|
2296
|
push @lengths, length($dict->{$symbol}); |
|
3492
|
|
|
|
|
|
|
} |
|
3493
|
|
|
|
|
|
|
else { |
|
3494
|
13703
|
|
|
|
|
25140
|
push @lengths, 0; |
|
3495
|
|
|
|
|
|
|
} |
|
3496
|
|
|
|
|
|
|
} |
|
3497
|
|
|
|
|
|
|
|
|
3498
|
108
|
|
|
|
|
333
|
my $size = scalar(@lengths); |
|
3499
|
108
|
|
|
|
|
480
|
my $rl = run_length(\@lengths); |
|
3500
|
108
|
|
|
|
|
310
|
my $offset_bits = ''; |
|
3501
|
|
|
|
|
|
|
|
|
3502
|
108
|
|
|
|
|
237
|
my @CL_symbols; |
|
3503
|
|
|
|
|
|
|
|
|
3504
|
108
|
|
|
|
|
285
|
foreach my $pair (@$rl) { |
|
3505
|
1203
|
|
|
|
|
3081
|
my ($v, $run) = @$pair; |
|
3506
|
|
|
|
|
|
|
|
|
3507
|
1203
|
|
100
|
|
|
3192
|
while ($v == 0 and $run >= 3) { |
|
3508
|
|
|
|
|
|
|
|
|
3509
|
251
|
100
|
|
|
|
640
|
if ($run >= 11) { |
|
3510
|
171
|
|
|
|
|
369
|
push @CL_symbols, 18; |
|
3511
|
171
|
|
|
|
|
269
|
$run -= 11; |
|
3512
|
171
|
|
|
|
|
627
|
$offset_bits .= int2bits_lsb(min($run, 127), 7); |
|
3513
|
171
|
|
|
|
|
440
|
$run -= 127; |
|
3514
|
|
|
|
|
|
|
} |
|
3515
|
|
|
|
|
|
|
|
|
3516
|
251
|
100
|
100
|
|
|
1161
|
if ($run >= 3 and $run < 11) { |
|
3517
|
82
|
|
|
|
|
166
|
push @CL_symbols, 17; |
|
3518
|
82
|
|
|
|
|
158
|
$run -= 3; |
|
3519
|
82
|
|
|
|
|
282
|
$offset_bits .= int2bits_lsb(min($run, 7), 3); |
|
3520
|
82
|
|
|
|
|
378
|
$run -= 7; |
|
3521
|
|
|
|
|
|
|
} |
|
3522
|
|
|
|
|
|
|
} |
|
3523
|
|
|
|
|
|
|
|
|
3524
|
1203
|
100
|
|
|
|
2434
|
if ($v == 0) { |
|
3525
|
396
|
100
|
|
|
|
1093
|
push(@CL_symbols, (0) x $run) if ($run > 0); |
|
3526
|
396
|
|
|
|
|
914
|
next; |
|
3527
|
|
|
|
|
|
|
} |
|
3528
|
|
|
|
|
|
|
|
|
3529
|
807
|
|
|
|
|
1471
|
push @CL_symbols, $v; |
|
3530
|
807
|
|
|
|
|
1175
|
$run -= 1; |
|
3531
|
|
|
|
|
|
|
|
|
3532
|
807
|
|
|
|
|
1664
|
while ($run >= 3) { |
|
3533
|
17
|
|
|
|
|
32
|
push @CL_symbols, 16; |
|
3534
|
17
|
|
|
|
|
32
|
$run -= 3; |
|
3535
|
17
|
|
|
|
|
63
|
$offset_bits .= int2bits_lsb(min($run, 3), 2); |
|
3536
|
17
|
|
|
|
|
56
|
$run -= 3; |
|
3537
|
|
|
|
|
|
|
} |
|
3538
|
|
|
|
|
|
|
|
|
3539
|
807
|
100
|
|
|
|
2137
|
push(@CL_symbols, ($v) x $run) if ($run > 0); |
|
3540
|
|
|
|
|
|
|
} |
|
3541
|
|
|
|
|
|
|
|
|
3542
|
108
|
|
|
|
|
1172
|
return (\@CL_symbols, $size, $offset_bits); |
|
3543
|
|
|
|
|
|
|
} |
|
3544
|
|
|
|
|
|
|
|
|
3545
|
108
|
|
|
108
|
|
189
|
sub _cl_encoded_bitstring ($cl_dict, $cl_symbols, $offset_bits) { |
|
|
108
|
|
|
|
|
215
|
|
|
|
108
|
|
|
|
|
171
|
|
|
|
108
|
|
|
|
|
206
|
|
|
|
108
|
|
|
|
|
222
|
|
|
3546
|
|
|
|
|
|
|
|
|
3547
|
108
|
|
|
|
|
193
|
my $bitstring = ''; |
|
3548
|
108
|
|
|
|
|
303
|
foreach my $cl_symbol (@$cl_symbols) { |
|
3549
|
1494
|
|
|
|
|
3047
|
$bitstring .= $cl_dict->{$cl_symbol}; |
|
3550
|
1494
|
100
|
|
|
|
4407
|
if ($cl_symbol == 16) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3551
|
17
|
|
|
|
|
44
|
$bitstring .= substr($offset_bits, 0, 2, ''); |
|
3552
|
|
|
|
|
|
|
} |
|
3553
|
|
|
|
|
|
|
elsif ($cl_symbol == 17) { |
|
3554
|
82
|
|
|
|
|
220
|
$bitstring .= substr($offset_bits, 0, 3, ''); |
|
3555
|
|
|
|
|
|
|
} |
|
3556
|
|
|
|
|
|
|
elsif ($cl_symbol == 18) { |
|
3557
|
171
|
|
|
|
|
454
|
$bitstring .= substr($offset_bits, 0, 7, ''); |
|
3558
|
|
|
|
|
|
|
} |
|
3559
|
|
|
|
|
|
|
} |
|
3560
|
|
|
|
|
|
|
|
|
3561
|
108
|
|
|
|
|
418
|
return $bitstring; |
|
3562
|
|
|
|
|
|
|
} |
|
3563
|
|
|
|
|
|
|
|
|
3564
|
54
|
|
|
54
|
|
122
|
sub _create_cl_dictionary (@cl_symbols) { |
|
|
54
|
|
|
|
|
279
|
|
|
|
54
|
|
|
|
|
107
|
|
|
3565
|
|
|
|
|
|
|
|
|
3566
|
54
|
|
|
|
|
139
|
my @keys; |
|
3567
|
54
|
|
|
|
|
200
|
my $freq = frequencies(\@cl_symbols); |
|
3568
|
|
|
|
|
|
|
|
|
3569
|
54
|
|
|
|
|
147
|
while (1) { |
|
3570
|
54
|
|
|
|
|
213
|
my ($cl_dict) = huffman_from_freq($freq); |
|
3571
|
|
|
|
|
|
|
|
|
3572
|
|
|
|
|
|
|
# The CL codes must have at most 7 bits |
|
3573
|
54
|
50
|
|
330
|
|
760
|
return $cl_dict if all { length($_) <= 7 } values %$cl_dict; |
|
|
330
|
|
|
|
|
1115
|
|
|
3574
|
|
|
|
|
|
|
|
|
3575
|
0
|
0
|
|
|
|
0
|
if (scalar(@keys) == 0) { |
|
3576
|
0
|
|
|
|
|
0
|
@keys = sort { $freq->{$b} <=> $freq->{$a} } keys %$freq; |
|
|
0
|
|
|
|
|
0
|
|
|
3577
|
|
|
|
|
|
|
} |
|
3578
|
|
|
|
|
|
|
|
|
3579
|
|
|
|
|
|
|
# Scale down the frequencies and try again |
|
3580
|
0
|
|
|
|
|
0
|
foreach my $k (@keys) { |
|
3581
|
0
|
0
|
|
|
|
0
|
if ($freq->{$k} > 1) { |
|
3582
|
0
|
|
|
|
|
0
|
$freq->{$k} >>= 1; |
|
3583
|
|
|
|
|
|
|
} |
|
3584
|
|
|
|
|
|
|
else { |
|
3585
|
0
|
|
|
|
|
0
|
last; |
|
3586
|
|
|
|
|
|
|
} |
|
3587
|
|
|
|
|
|
|
} |
|
3588
|
|
|
|
|
|
|
} |
|
3589
|
|
|
|
|
|
|
} |
|
3590
|
|
|
|
|
|
|
|
|
3591
|
54
|
|
|
54
|
1
|
125
|
sub deflate_create_block_type_2 ($literals, $distances, $lengths) { |
|
|
54
|
|
|
|
|
127
|
|
|
|
54
|
|
|
|
|
116
|
|
|
|
54
|
|
|
|
|
105
|
|
|
|
54
|
|
|
|
|
134
|
|
|
3592
|
|
|
|
|
|
|
|
|
3593
|
54
|
50
|
|
|
|
183
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
3594
|
54
|
|
|
|
|
123
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
3595
|
54
|
|
|
|
|
113
|
local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing |
|
3596
|
|
|
|
|
|
|
|
|
3597
|
54
|
|
|
|
|
123
|
state $deflate_tables = [make_deflate_tables()]; |
|
3598
|
54
|
|
|
|
|
205
|
my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = @$deflate_tables; |
|
3599
|
|
|
|
|
|
|
|
|
3600
|
54
|
|
|
|
|
298
|
my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15); |
|
3601
|
|
|
|
|
|
|
|
|
3602
|
54
|
|
|
|
|
118
|
my $bitstring = '01'; |
|
3603
|
|
|
|
|
|
|
|
|
3604
|
54
|
|
|
|
|
191
|
my @len_symbols; |
|
3605
|
|
|
|
|
|
|
my @dist_symbols; |
|
3606
|
54
|
|
|
|
|
126
|
my $offset_bits = ''; |
|
3607
|
|
|
|
|
|
|
|
|
3608
|
54
|
|
|
|
|
210
|
foreach my $k (0 .. $#$literals) { |
|
3609
|
|
|
|
|
|
|
|
|
3610
|
10267
|
100
|
|
|
|
22964
|
if ($lengths->[$k] == 0) { |
|
3611
|
6527
|
|
|
|
|
18297
|
push @len_symbols, $literals->[$k]; |
|
3612
|
6527
|
|
|
|
|
13028
|
next; |
|
3613
|
|
|
|
|
|
|
} |
|
3614
|
|
|
|
|
|
|
|
|
3615
|
3740
|
|
|
|
|
5502
|
my $len = $lengths->[$k]; |
|
3616
|
3740
|
|
|
|
|
5791
|
my $dist = $distances->[$k]; |
|
3617
|
|
|
|
|
|
|
|
|
3618
|
|
|
|
|
|
|
{ |
|
3619
|
3740
|
|
|
|
|
5318
|
my $len_idx = $LENGTH_INDICES->[$len]; |
|
3620
|
3740
|
|
|
|
|
4968
|
my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]}; |
|
|
3740
|
|
|
|
|
7385
|
|
|
3621
|
|
|
|
|
|
|
|
|
3622
|
3740
|
|
|
|
|
10700
|
push @len_symbols, [$len_idx + 256 - 1, $bits]; |
|
3623
|
3740
|
100
|
|
|
|
10058
|
$offset_bits .= int2bits_lsb($len - $min, $bits) if ($bits > 0); |
|
3624
|
|
|
|
|
|
|
} |
|
3625
|
|
|
|
|
|
|
|
|
3626
|
|
|
|
|
|
|
{ |
|
3627
|
3740
|
|
|
|
|
5081
|
my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS); |
|
|
3740
|
|
|
|
|
6127
|
|
|
|
3740
|
|
|
|
|
6516
|
|
|
3628
|
3740
|
|
|
|
|
5388
|
my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]}; |
|
|
3740
|
|
|
|
|
7209
|
|
|
3629
|
|
|
|
|
|
|
|
|
3630
|
3740
|
|
|
|
|
8771
|
push @dist_symbols, [$dist_idx - 1, $bits]; |
|
3631
|
3740
|
100
|
|
|
|
8934
|
$offset_bits .= int2bits_lsb($dist - $min, $bits) if ($bits > 0); |
|
3632
|
|
|
|
|
|
|
} |
|
3633
|
|
|
|
|
|
|
} |
|
3634
|
|
|
|
|
|
|
|
|
3635
|
54
|
|
|
|
|
173
|
push @len_symbols, 256; # end-of-block marker |
|
3636
|
|
|
|
|
|
|
|
|
3637
|
54
|
100
|
|
|
|
270
|
my ($dict) = huffman_from_symbols([map { ref($_) eq 'ARRAY' ? $_->[0] : $_ } @len_symbols]); |
|
|
10321
|
|
|
|
|
39175
|
|
|
3638
|
54
|
|
|
|
|
5006
|
my ($dist_dict) = huffman_from_symbols([map { $_->[0] } @dist_symbols]); |
|
|
3740
|
|
|
|
|
8189
|
|
|
3639
|
|
|
|
|
|
|
|
|
3640
|
54
|
|
|
|
|
1040
|
my ($LL_code_lengths, $LL_cl_len, $LL_offset_bits) = _code_length_encoding($dict); |
|
3641
|
54
|
|
|
|
|
190
|
my ($distance_code_lengths, $distance_cl_len, $distance_offset_bits) = _code_length_encoding($dist_dict); |
|
3642
|
|
|
|
|
|
|
|
|
3643
|
54
|
|
|
|
|
479
|
my $cl_dict = _create_cl_dictionary(@$LL_code_lengths, @$distance_code_lengths); |
|
3644
|
|
|
|
|
|
|
|
|
3645
|
54
|
|
|
|
|
150
|
my @CL_code_lenghts; |
|
3646
|
54
|
|
|
|
|
187
|
foreach my $symbol (0 .. 18) { |
|
3647
|
1026
|
100
|
|
|
|
2018
|
if (exists($cl_dict->{$symbol})) { |
|
3648
|
330
|
|
|
|
|
751
|
push @CL_code_lenghts, length($cl_dict->{$symbol}); |
|
3649
|
|
|
|
|
|
|
} |
|
3650
|
|
|
|
|
|
|
else { |
|
3651
|
696
|
|
|
|
|
1189
|
push @CL_code_lenghts, 0; |
|
3652
|
|
|
|
|
|
|
} |
|
3653
|
|
|
|
|
|
|
} |
|
3654
|
|
|
|
|
|
|
|
|
3655
|
|
|
|
|
|
|
# Put the CL codes in the required order |
|
3656
|
54
|
|
|
|
|
322
|
@CL_code_lenghts = @CL_code_lenghts[@CL_order]; |
|
3657
|
|
|
|
|
|
|
|
|
3658
|
54
|
|
66
|
|
|
381
|
while (scalar(@CL_code_lenghts) > 4 and $CL_code_lenghts[-1] == 0) { |
|
3659
|
90
|
|
|
|
|
332
|
pop @CL_code_lenghts; |
|
3660
|
|
|
|
|
|
|
} |
|
3661
|
|
|
|
|
|
|
|
|
3662
|
54
|
|
|
|
|
182
|
my $CL_code_lengths_bitstring = join('', map { int2bits_lsb($_, 3) } @CL_code_lenghts); |
|
|
936
|
|
|
|
|
1729
|
|
|
3663
|
|
|
|
|
|
|
|
|
3664
|
54
|
|
|
|
|
328
|
my $LL_code_lengths_bitstring = _cl_encoded_bitstring($cl_dict, $LL_code_lengths, $LL_offset_bits); |
|
3665
|
54
|
|
|
|
|
176
|
my $distance_code_lengths_bitstring = _cl_encoded_bitstring($cl_dict, $distance_code_lengths, $distance_offset_bits); |
|
3666
|
|
|
|
|
|
|
|
|
3667
|
|
|
|
|
|
|
# (5 bits) HLIT = (number of LL code entries present) - 257 |
|
3668
|
54
|
|
|
|
|
163
|
my $HLIT = $LL_cl_len - 257; |
|
3669
|
|
|
|
|
|
|
|
|
3670
|
|
|
|
|
|
|
# (5 bits) HDIST = (number of distance code entries present) - 1 |
|
3671
|
54
|
|
|
|
|
141
|
my $HDIST = $distance_cl_len - 1; |
|
3672
|
|
|
|
|
|
|
|
|
3673
|
|
|
|
|
|
|
# (4 bits) HCLEN = (number of CL code entries present) - 4 |
|
3674
|
54
|
|
|
|
|
136
|
my $HCLEN = scalar(@CL_code_lenghts) - 4; |
|
3675
|
|
|
|
|
|
|
|
|
3676
|
54
|
|
|
|
|
194
|
$bitstring .= int2bits_lsb($HLIT, 5); |
|
3677
|
54
|
|
|
|
|
3221
|
$bitstring .= int2bits_lsb($HDIST, 5); |
|
3678
|
54
|
|
|
|
|
152
|
$bitstring .= int2bits_lsb($HCLEN, 4); |
|
3679
|
|
|
|
|
|
|
|
|
3680
|
54
|
|
|
|
|
165
|
$bitstring .= $CL_code_lengths_bitstring; |
|
3681
|
54
|
|
|
|
|
122
|
$bitstring .= $LL_code_lengths_bitstring; |
|
3682
|
54
|
|
|
|
|
168
|
$bitstring .= $distance_code_lengths_bitstring; |
|
3683
|
|
|
|
|
|
|
|
|
3684
|
54
|
|
|
|
|
145
|
foreach my $symbol (@len_symbols) { |
|
3685
|
10321
|
100
|
|
|
|
31241
|
if (ref($symbol) eq 'ARRAY') { |
|
3686
|
|
|
|
|
|
|
|
|
3687
|
3740
|
|
|
|
|
8702
|
my ($len, $len_offset) = @$symbol; |
|
3688
|
3740
|
|
|
|
|
7730
|
$bitstring .= $dict->{$len}; |
|
3689
|
3740
|
100
|
|
|
|
9207
|
$bitstring .= substr($offset_bits, 0, $len_offset, '') if ($len_offset > 0); |
|
3690
|
|
|
|
|
|
|
|
|
3691
|
3740
|
|
|
|
|
7090
|
my ($dist, $dist_offset) = @{shift(@dist_symbols)}; |
|
|
3740
|
|
|
|
|
8523
|
|
|
3692
|
3740
|
|
|
|
|
9877
|
$bitstring .= $dist_dict->{$dist}; |
|
3693
|
3740
|
100
|
|
|
|
12386
|
$bitstring .= substr($offset_bits, 0, $dist_offset, '') if ($dist_offset > 0); |
|
3694
|
|
|
|
|
|
|
} |
|
3695
|
|
|
|
|
|
|
else { |
|
3696
|
6581
|
|
|
|
|
17932
|
$bitstring .= $dict->{$symbol}; |
|
3697
|
|
|
|
|
|
|
} |
|
3698
|
|
|
|
|
|
|
} |
|
3699
|
|
|
|
|
|
|
|
|
3700
|
54
|
|
|
|
|
17440
|
return $bitstring; |
|
3701
|
|
|
|
|
|
|
} |
|
3702
|
|
|
|
|
|
|
|
|
3703
|
54
|
|
|
54
|
1
|
172
|
sub deflate_create_block_type_1 ($literals, $distances, $lengths) { |
|
|
54
|
|
|
|
|
153
|
|
|
|
54
|
|
|
|
|
102
|
|
|
|
54
|
|
|
|
|
103
|
|
|
|
54
|
|
|
|
|
101
|
|
|
3704
|
|
|
|
|
|
|
|
|
3705
|
54
|
50
|
|
|
|
291
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
3706
|
54
|
|
|
|
|
124
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
3707
|
54
|
|
|
|
|
142
|
local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing |
|
3708
|
|
|
|
|
|
|
|
|
3709
|
54
|
|
|
|
|
135
|
state $deflate_tables = [make_deflate_tables()]; |
|
3710
|
54
|
|
|
|
|
211
|
my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = @$deflate_tables; |
|
3711
|
|
|
|
|
|
|
|
|
3712
|
54
|
|
|
|
|
131
|
state $dict; |
|
3713
|
54
|
|
|
|
|
125
|
state $dist_dict; |
|
3714
|
|
|
|
|
|
|
|
|
3715
|
54
|
100
|
|
|
|
193
|
if (!defined($dict)) { |
|
3716
|
|
|
|
|
|
|
|
|
3717
|
4
|
|
|
|
|
92
|
my @code_lengths = (0) x 288; |
|
3718
|
4
|
|
|
|
|
19
|
foreach my $i (0 .. 143) { |
|
3719
|
576
|
|
|
|
|
4893
|
$code_lengths[$i] = 8; |
|
3720
|
|
|
|
|
|
|
} |
|
3721
|
4
|
|
|
|
|
21
|
foreach my $i (144 .. 255) { |
|
3722
|
448
|
|
|
|
|
851
|
$code_lengths[$i] = 9; |
|
3723
|
|
|
|
|
|
|
} |
|
3724
|
4
|
|
|
|
|
21
|
foreach my $i (256 .. 279) { |
|
3725
|
96
|
|
|
|
|
181
|
$code_lengths[$i] = 7; |
|
3726
|
|
|
|
|
|
|
} |
|
3727
|
4
|
|
|
|
|
16
|
foreach my $i (280 .. 287) { |
|
3728
|
32
|
|
|
|
|
117
|
$code_lengths[$i] = 8; |
|
3729
|
|
|
|
|
|
|
} |
|
3730
|
|
|
|
|
|
|
|
|
3731
|
4
|
|
|
|
|
44
|
($dict) = huffman_from_code_lengths(\@code_lengths); |
|
3732
|
4
|
|
|
|
|
258
|
($dist_dict) = huffman_from_code_lengths([(5) x 32]); |
|
3733
|
|
|
|
|
|
|
} |
|
3734
|
|
|
|
|
|
|
|
|
3735
|
54
|
|
|
|
|
195
|
my $bitstring = '10'; |
|
3736
|
|
|
|
|
|
|
|
|
3737
|
54
|
|
|
|
|
301
|
foreach my $k (0 .. $#$literals) { |
|
3738
|
|
|
|
|
|
|
|
|
3739
|
10267
|
100
|
|
|
|
26506
|
if ($lengths->[$k] == 0) { |
|
3740
|
6527
|
|
|
|
|
21976
|
$bitstring .= $dict->{$literals->[$k]}; |
|
3741
|
6527
|
|
|
|
|
14047
|
next; |
|
3742
|
|
|
|
|
|
|
} |
|
3743
|
|
|
|
|
|
|
|
|
3744
|
3740
|
|
|
|
|
6651
|
my $len = $lengths->[$k]; |
|
3745
|
3740
|
|
|
|
|
7039
|
my $dist = $distances->[$k]; |
|
3746
|
|
|
|
|
|
|
|
|
3747
|
|
|
|
|
|
|
{ |
|
3748
|
3740
|
|
|
|
|
6039
|
my $len_idx = $LENGTH_INDICES->[$len]; |
|
3749
|
3740
|
|
|
|
|
5786
|
my ($min, $bits) = @{$LENGTH_SYMBOLS->[$len_idx]}; |
|
|
3740
|
|
|
|
|
8674
|
|
|
3750
|
|
|
|
|
|
|
|
|
3751
|
3740
|
|
|
|
|
9931
|
$bitstring .= $dict->{$len_idx + 256 - 1}; |
|
3752
|
3740
|
100
|
|
|
|
10461
|
$bitstring .= int2bits_lsb($len - $min, $bits) if ($bits > 0); |
|
3753
|
|
|
|
|
|
|
} |
|
3754
|
|
|
|
|
|
|
|
|
3755
|
|
|
|
|
|
|
{ |
|
3756
|
3740
|
|
|
|
|
5320
|
my $dist_idx = find_deflate_index($dist, $DISTANCE_SYMBOLS); |
|
|
3740
|
|
|
|
|
6803
|
|
|
|
3740
|
|
|
|
|
7872
|
|
|
3757
|
3740
|
|
|
|
|
6177
|
my ($min, $bits) = @{$DISTANCE_SYMBOLS->[$dist_idx]}; |
|
|
3740
|
|
|
|
|
8521
|
|
|
3758
|
|
|
|
|
|
|
|
|
3759
|
3740
|
|
|
|
|
10564
|
$bitstring .= $dist_dict->{$dist_idx - 1}; |
|
3760
|
3740
|
100
|
|
|
|
11381
|
$bitstring .= int2bits_lsb($dist - $min, $bits) if ($bits > 0); |
|
3761
|
|
|
|
|
|
|
} |
|
3762
|
|
|
|
|
|
|
} |
|
3763
|
|
|
|
|
|
|
|
|
3764
|
54
|
|
|
|
|
238
|
$bitstring .= $dict->{256}; # end-of-block symbol |
|
3765
|
|
|
|
|
|
|
|
|
3766
|
54
|
|
|
|
|
1687
|
return $bitstring; |
|
3767
|
|
|
|
|
|
|
} |
|
3768
|
|
|
|
|
|
|
|
|
3769
|
2
|
|
|
2
|
1
|
3775
|
sub deflate_create_block_type_0_header($chunk) { |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
4
|
|
|
3770
|
|
|
|
|
|
|
|
|
3771
|
2
|
|
|
|
|
7
|
my $chunk_len = length($chunk); |
|
3772
|
2
|
|
|
|
|
10
|
my $len = int2bits_lsb($chunk_len, 16); |
|
3773
|
2
|
|
|
|
|
10
|
my $nlen = int2bits_lsb((~$chunk_len) & 0xffff, 16); |
|
3774
|
|
|
|
|
|
|
|
|
3775
|
2
|
|
|
|
|
10
|
$len . $nlen; |
|
3776
|
|
|
|
|
|
|
} |
|
3777
|
|
|
|
|
|
|
|
|
3778
|
56
|
|
|
56
|
1
|
303900
|
sub gzip_compress ($in_fh, $lzss_encoding_sub = \&lzss_encode) { |
|
|
56
|
|
|
|
|
111
|
|
|
|
56
|
|
|
|
|
149
|
|
|
|
56
|
|
|
|
|
92
|
|
|
3779
|
|
|
|
|
|
|
|
|
3780
|
56
|
100
|
|
|
|
242
|
if (ref($in_fh) eq '') { |
|
3781
|
28
|
50
|
|
|
|
428
|
open(my $fh2, '<:raw', \$in_fh) or confess "error: $!"; |
|
3782
|
28
|
|
|
|
|
118
|
return __SUB__->($fh2, $lzss_encoding_sub); |
|
3783
|
|
|
|
|
|
|
} |
|
3784
|
|
|
|
|
|
|
|
|
3785
|
28
|
|
|
|
|
80
|
my $compressed = ''; |
|
3786
|
|
|
|
|
|
|
|
|
3787
|
28
|
|
|
|
|
191
|
open my $out_fh, '>:raw', \$compressed; |
|
3788
|
|
|
|
|
|
|
|
|
3789
|
28
|
50
|
|
|
|
112
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
3790
|
28
|
|
|
|
|
81
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
3791
|
28
|
|
|
|
|
55
|
local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing |
|
3792
|
|
|
|
|
|
|
|
|
3793
|
28
|
|
|
|
|
54
|
state $MAGIC = pack('C*', 0x1f, 0x8b); # magic MIME type |
|
3794
|
28
|
|
|
|
|
57
|
state $CM = chr(0x08); # 0x08 = DEFLATE |
|
3795
|
28
|
|
|
|
|
58
|
state $FLAGS = chr(0x00); # flags |
|
3796
|
28
|
|
|
|
|
75
|
state $MTIME = pack('C*', (0x00) x 4); # modification time |
|
3797
|
28
|
|
|
|
|
53
|
state $XFLAGS = chr(0x00); # extra flags |
|
3798
|
28
|
|
|
|
|
50
|
state $OS = chr(0x03); # 0x03 = Unix |
|
3799
|
|
|
|
|
|
|
|
|
3800
|
28
|
|
|
|
|
115
|
print $out_fh $MAGIC, $CM, $FLAGS, $MTIME, $XFLAGS, $OS; |
|
3801
|
|
|
|
|
|
|
|
|
3802
|
28
|
|
|
|
|
59
|
my $total_length = 0; |
|
3803
|
28
|
|
|
|
|
82
|
my $crc32 = 0; |
|
3804
|
|
|
|
|
|
|
|
|
3805
|
28
|
|
|
|
|
94
|
my $bitstring = ''; |
|
3806
|
|
|
|
|
|
|
|
|
3807
|
28
|
100
|
|
|
|
124
|
if (eof($in_fh)) { # empty file |
|
3808
|
2
|
|
|
|
|
5
|
$bitstring = '1' . '10' . '0000000'; |
|
3809
|
|
|
|
|
|
|
} |
|
3810
|
|
|
|
|
|
|
|
|
3811
|
28
|
|
|
|
|
70
|
state $CHUNK_SIZE = (1 << 15) - 1; |
|
3812
|
|
|
|
|
|
|
|
|
3813
|
28
|
|
|
|
|
195
|
while (read($in_fh, (my $chunk), $CHUNK_SIZE)) { |
|
3814
|
|
|
|
|
|
|
|
|
3815
|
28
|
|
|
|
|
115
|
$crc32 = crc32($chunk, $crc32); |
|
3816
|
28
|
|
|
|
|
146
|
$total_length += length($chunk); |
|
3817
|
28
|
100
|
|
|
|
235
|
$bitstring .= eof($in_fh) ? '1' : '0'; |
|
3818
|
|
|
|
|
|
|
|
|
3819
|
28
|
|
|
|
|
139
|
my ($literals, $distances, $lengths) = $lzss_encoding_sub->($chunk); |
|
3820
|
|
|
|
|
|
|
|
|
3821
|
28
|
|
|
|
|
213
|
my $bt1_bitstring = deflate_create_block_type_1($literals, $distances, $lengths); |
|
3822
|
|
|
|
|
|
|
|
|
3823
|
|
|
|
|
|
|
# When block type 1 is larger than the input, then we have random uncompressible data: use block type 0 |
|
3824
|
28
|
50
|
|
|
|
266
|
if ((length($bt1_bitstring) >> 3) > length($chunk) + 5) { |
|
3825
|
|
|
|
|
|
|
|
|
3826
|
0
|
0
|
|
|
|
0
|
$VERBOSE && say STDERR ":: Using block type: 0"; |
|
3827
|
|
|
|
|
|
|
|
|
3828
|
0
|
|
|
|
|
0
|
$bitstring .= '00'; |
|
3829
|
|
|
|
|
|
|
|
|
3830
|
0
|
|
|
|
|
0
|
print $out_fh pack('b*', $bitstring); # pads to a byte |
|
3831
|
0
|
|
|
|
|
0
|
print $out_fh pack('b*', deflate_create_block_type_0_header($chunk)); |
|
3832
|
0
|
|
|
|
|
0
|
print $out_fh $chunk; |
|
3833
|
|
|
|
|
|
|
|
|
3834
|
0
|
|
|
|
|
0
|
$bitstring = ''; |
|
3835
|
0
|
|
|
|
|
0
|
next; |
|
3836
|
|
|
|
|
|
|
} |
|
3837
|
|
|
|
|
|
|
|
|
3838
|
28
|
|
|
|
|
143
|
my $bt2_bitstring = deflate_create_block_type_2($literals, $distances, $lengths); |
|
3839
|
|
|
|
|
|
|
|
|
3840
|
|
|
|
|
|
|
# When block type 2 is larger than block type 1, then we may have very small data |
|
3841
|
28
|
100
|
|
|
|
247
|
if (length($bt2_bitstring) > length($bt1_bitstring)) { |
|
3842
|
22
|
50
|
|
|
|
78
|
$VERBOSE && say STDERR ":: Using block type: 1"; |
|
3843
|
22
|
|
|
|
|
82
|
$bitstring .= $bt1_bitstring; |
|
3844
|
|
|
|
|
|
|
} |
|
3845
|
|
|
|
|
|
|
else { |
|
3846
|
6
|
50
|
|
|
|
40
|
$VERBOSE && say STDERR ":: Using block type: 2"; |
|
3847
|
6
|
|
|
|
|
5270
|
$bitstring .= $bt2_bitstring; |
|
3848
|
|
|
|
|
|
|
} |
|
3849
|
|
|
|
|
|
|
|
|
3850
|
28
|
|
|
|
|
5324
|
print $out_fh pack('b*', substr($bitstring, 0, length($bitstring) - (length($bitstring) % 8), '')); |
|
3851
|
|
|
|
|
|
|
} |
|
3852
|
|
|
|
|
|
|
|
|
3853
|
28
|
100
|
|
|
|
161
|
if ($bitstring ne '') { |
|
3854
|
25
|
|
|
|
|
127
|
print $out_fh pack('b*', $bitstring); |
|
3855
|
|
|
|
|
|
|
} |
|
3856
|
|
|
|
|
|
|
|
|
3857
|
28
|
|
|
|
|
143
|
print $out_fh int2bytes_lsb($crc32, 4); |
|
3858
|
28
|
|
|
|
|
159
|
print $out_fh int2bytes_lsb($total_length, 4); |
|
3859
|
|
|
|
|
|
|
|
|
3860
|
28
|
|
|
|
|
709
|
return $compressed; |
|
3861
|
|
|
|
|
|
|
} |
|
3862
|
|
|
|
|
|
|
|
|
3863
|
|
|
|
|
|
|
################### |
|
3864
|
|
|
|
|
|
|
# GZIP DECOMPRESSOR |
|
3865
|
|
|
|
|
|
|
################### |
|
3866
|
|
|
|
|
|
|
|
|
3867
|
4
|
|
|
4
|
1
|
33
|
sub deflate_extract_block_type_0 ($in_fh, $buffer, $search_window) { |
|
|
4
|
|
|
|
|
13
|
|
|
|
4
|
|
|
|
|
12
|
|
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
8
|
|
|
3868
|
|
|
|
|
|
|
|
|
3869
|
4
|
50
|
|
|
|
17
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
3870
|
4
|
|
|
|
|
9
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
3871
|
4
|
|
|
|
|
8
|
local $LZ_MAX_DIST = 32768; # maximum allowed back-reference distance in LZ parsing |
|
3872
|
|
|
|
|
|
|
|
|
3873
|
4
|
|
|
|
|
10
|
$$buffer = ''; |
|
3874
|
|
|
|
|
|
|
|
|
3875
|
4
|
|
|
|
|
15
|
my $len = bytes2int_lsb($in_fh, 2); |
|
3876
|
4
|
|
|
|
|
40
|
my $nlen = bytes2int_lsb($in_fh, 2); |
|
3877
|
4
|
|
|
|
|
15
|
my $expected_nlen = (~$len) & 0xffff; |
|
3878
|
|
|
|
|
|
|
|
|
3879
|
4
|
50
|
|
|
|
19
|
if ($expected_nlen != $nlen) { |
|
3880
|
0
|
|
|
|
|
0
|
confess "[!] The ~length value is not correct: $nlen (actual) != $expected_nlen (expected)"; |
|
3881
|
|
|
|
|
|
|
} |
|
3882
|
|
|
|
|
|
|
else { |
|
3883
|
4
|
50
|
|
|
|
17
|
$VERBOSE && print STDERR ":: Chunk length: $len\n"; |
|
3884
|
|
|
|
|
|
|
} |
|
3885
|
|
|
|
|
|
|
|
|
3886
|
4
|
|
33
|
|
|
23
|
read($in_fh, (my $chunk), $len) // confess "Read error: $!"; |
|
3887
|
4
|
|
|
|
|
15
|
$$search_window .= $chunk; |
|
3888
|
|
|
|
|
|
|
|
|
3889
|
4
|
50
|
|
|
|
20
|
$$search_window = substr($$search_window, -$LZ_MAX_DIST) |
|
3890
|
|
|
|
|
|
|
if (length($$search_window) > 2 * $LZ_MAX_DIST); |
|
3891
|
|
|
|
|
|
|
|
|
3892
|
4
|
|
|
|
|
40
|
return $chunk; |
|
3893
|
|
|
|
|
|
|
} |
|
3894
|
|
|
|
|
|
|
|
|
3895
|
83
|
|
|
83
|
|
193
|
sub _deflate_decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window) { |
|
|
83
|
|
|
|
|
245
|
|
|
|
83
|
|
|
|
|
183
|
|
|
|
83
|
|
|
|
|
175
|
|
|
|
83
|
|
|
|
|
184
|
|
|
|
83
|
|
|
|
|
177
|
|
|
|
83
|
|
|
|
|
145
|
|
|
3896
|
|
|
|
|
|
|
|
|
3897
|
83
|
|
|
|
|
190
|
state $deflate_tables = [make_deflate_tables()]; |
|
3898
|
83
|
|
|
|
|
304
|
my ($DISTANCE_SYMBOLS, $LENGTH_SYMBOLS, $LENGTH_INDICES) = @$deflate_tables; |
|
3899
|
|
|
|
|
|
|
|
|
3900
|
83
|
|
|
|
|
238
|
my $data = ''; |
|
3901
|
83
|
|
|
|
|
194
|
my $code = ''; |
|
3902
|
|
|
|
|
|
|
|
|
3903
|
83
|
|
|
|
|
4764
|
my $max_ll_code_len = max(map { length($_) } keys %$rev_dict); |
|
|
21073
|
|
|
|
|
39542
|
|
|
3904
|
83
|
|
|
|
|
2229
|
my $max_dist_code_len = max(map { length($_) } keys %$dist_rev_dict); |
|
|
2398
|
|
|
|
|
3946
|
|
|
3905
|
|
|
|
|
|
|
|
|
3906
|
83
|
|
|
|
|
328
|
while (1) { |
|
3907
|
64701
|
|
|
|
|
119342
|
$code .= read_bit_lsb($in_fh, $buffer); |
|
3908
|
|
|
|
|
|
|
|
|
3909
|
64701
|
50
|
|
|
|
182637
|
if (length($code) > $max_ll_code_len) { |
|
3910
|
0
|
|
|
|
|
0
|
confess "[!] Something went wrong: length of LL code `$code` is > $max_ll_code_len."; |
|
3911
|
|
|
|
|
|
|
} |
|
3912
|
|
|
|
|
|
|
|
|
3913
|
64701
|
100
|
|
|
|
151967
|
if (exists($rev_dict->{$code})) { |
|
3914
|
|
|
|
|
|
|
|
|
3915
|
10617
|
|
|
|
|
24393
|
my $symbol = $rev_dict->{$code}; |
|
3916
|
|
|
|
|
|
|
|
|
3917
|
10617
|
100
|
|
|
|
23187
|
if ($symbol <= 255) { |
|
|
|
100
|
|
|
|
|
|
|
3918
|
6769
|
|
|
|
|
16293
|
$data .= chr($symbol); |
|
3919
|
6769
|
|
|
|
|
15589
|
$$search_window .= chr($symbol); |
|
3920
|
|
|
|
|
|
|
} |
|
3921
|
|
|
|
|
|
|
elsif ($symbol == 256) { # end-of-block marker |
|
3922
|
83
|
|
|
|
|
163
|
$code = ''; |
|
3923
|
83
|
|
|
|
|
230
|
last; |
|
3924
|
|
|
|
|
|
|
} |
|
3925
|
|
|
|
|
|
|
else { # LZSS decoding |
|
3926
|
3765
|
|
|
|
|
5770
|
my ($length, $LL_bits) = @{$LENGTH_SYMBOLS->[$symbol - 256 + 1]}; |
|
|
3765
|
|
|
|
|
10894
|
|
|
3927
|
3765
|
100
|
|
|
|
11600
|
$length += bits2int_lsb($in_fh, $LL_bits, $buffer) if ($LL_bits > 0); |
|
3928
|
|
|
|
|
|
|
|
|
3929
|
3765
|
|
|
|
|
6728
|
my $dist_code = ''; |
|
3930
|
|
|
|
|
|
|
|
|
3931
|
3765
|
|
|
|
|
5814
|
while (1) { |
|
3932
|
16809
|
|
|
|
|
31557
|
$dist_code .= read_bit_lsb($in_fh, $buffer); |
|
3933
|
|
|
|
|
|
|
|
|
3934
|
16809
|
50
|
|
|
|
35502
|
if (length($dist_code) > $max_dist_code_len) { |
|
3935
|
0
|
|
|
|
|
0
|
confess "[!] Something went wrong: length of distance code `$dist_code` is > $max_dist_code_len."; |
|
3936
|
|
|
|
|
|
|
} |
|
3937
|
|
|
|
|
|
|
|
|
3938
|
16809
|
100
|
|
|
|
41821
|
if (exists($dist_rev_dict->{$dist_code})) { |
|
3939
|
3765
|
|
|
|
|
7849
|
last; |
|
3940
|
|
|
|
|
|
|
} |
|
3941
|
|
|
|
|
|
|
} |
|
3942
|
|
|
|
|
|
|
|
|
3943
|
3765
|
|
|
|
|
7187
|
my ($dist, $dist_bits) = @{$DISTANCE_SYMBOLS->[$dist_rev_dict->{$dist_code} + 1]}; |
|
|
3765
|
|
|
|
|
20853
|
|
|
3944
|
3765
|
100
|
|
|
|
13065
|
$dist += bits2int_lsb($in_fh, $dist_bits, $buffer) if ($dist_bits > 0); |
|
3945
|
|
|
|
|
|
|
|
|
3946
|
3765
|
100
|
|
|
|
12998
|
if ($dist == 1) { |
|
|
|
100
|
|
|
|
|
|
|
3947
|
35
|
|
|
|
|
244
|
$$search_window .= substr($$search_window, -1) x $length; |
|
3948
|
|
|
|
|
|
|
} |
|
3949
|
|
|
|
|
|
|
elsif ($dist >= $length) { # non-overlapping matches |
|
3950
|
3585
|
|
|
|
|
15596
|
$$search_window .= substr($$search_window, length($$search_window) - $dist, $length); |
|
3951
|
|
|
|
|
|
|
} |
|
3952
|
|
|
|
|
|
|
else { # overlapping matches |
|
3953
|
145
|
|
|
|
|
446
|
foreach my $i (1 .. $length) { |
|
3954
|
2718
|
|
|
|
|
7207
|
$$search_window .= substr($$search_window, length($$search_window) - $dist, 1); |
|
3955
|
|
|
|
|
|
|
} |
|
3956
|
|
|
|
|
|
|
} |
|
3957
|
|
|
|
|
|
|
|
|
3958
|
3765
|
|
|
|
|
13716
|
$data .= substr($$search_window, -$length); |
|
3959
|
|
|
|
|
|
|
} |
|
3960
|
|
|
|
|
|
|
|
|
3961
|
10534
|
|
|
|
|
23248
|
$code = ''; |
|
3962
|
|
|
|
|
|
|
} |
|
3963
|
|
|
|
|
|
|
} |
|
3964
|
|
|
|
|
|
|
|
|
3965
|
83
|
50
|
|
|
|
270
|
if ($code ne '') { |
|
3966
|
0
|
|
|
|
|
0
|
confess "[!] Something went wrong: code `$code` is not empty!"; |
|
3967
|
|
|
|
|
|
|
} |
|
3968
|
|
|
|
|
|
|
|
|
3969
|
83
|
50
|
|
|
|
381
|
$$search_window = substr($$search_window, -$LZ_MAX_DIST) |
|
3970
|
|
|
|
|
|
|
if (length($$search_window) > 2 * $LZ_MAX_DIST); |
|
3971
|
|
|
|
|
|
|
|
|
3972
|
83
|
|
|
|
|
1993
|
return $data; |
|
3973
|
|
|
|
|
|
|
} |
|
3974
|
|
|
|
|
|
|
|
|
3975
|
71
|
|
|
71
|
1
|
839
|
sub deflate_extract_block_type_1 ($in_fh, $buffer, $search_window) { |
|
|
71
|
|
|
|
|
159
|
|
|
|
71
|
|
|
|
|
115
|
|
|
|
71
|
|
|
|
|
132
|
|
|
|
71
|
|
|
|
|
142
|
|
|
3976
|
|
|
|
|
|
|
|
|
3977
|
71
|
50
|
|
|
|
225
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
3978
|
71
|
|
|
|
|
169
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
3979
|
71
|
|
|
|
|
145
|
local $LZ_MAX_DIST = 32768; # maximum allowed back-reference distance in LZ parsing |
|
3980
|
|
|
|
|
|
|
|
|
3981
|
71
|
|
|
|
|
119
|
state $rev_dict; |
|
3982
|
71
|
|
|
|
|
148
|
state $dist_rev_dict; |
|
3983
|
|
|
|
|
|
|
|
|
3984
|
71
|
100
|
|
|
|
313
|
if (!defined($rev_dict)) { |
|
3985
|
|
|
|
|
|
|
|
|
3986
|
1
|
|
|
|
|
11
|
my @code_lengths = (0) x 288; |
|
3987
|
1
|
|
|
|
|
2
|
foreach my $i (0 .. 143) { |
|
3988
|
144
|
|
|
|
|
149
|
$code_lengths[$i] = 8; |
|
3989
|
|
|
|
|
|
|
} |
|
3990
|
1
|
|
|
|
|
2
|
foreach my $i (144 .. 255) { |
|
3991
|
112
|
|
|
|
|
118
|
$code_lengths[$i] = 9; |
|
3992
|
|
|
|
|
|
|
} |
|
3993
|
1
|
|
|
|
|
2
|
foreach my $i (256 .. 279) { |
|
3994
|
24
|
|
|
|
|
27
|
$code_lengths[$i] = 7; |
|
3995
|
|
|
|
|
|
|
} |
|
3996
|
1
|
|
|
|
|
2
|
foreach my $i (280 .. 287) { |
|
3997
|
8
|
|
|
|
|
10
|
$code_lengths[$i] = 8; |
|
3998
|
|
|
|
|
|
|
} |
|
3999
|
|
|
|
|
|
|
|
|
4000
|
1
|
|
|
|
|
3
|
(undef, $rev_dict) = huffman_from_code_lengths(\@code_lengths); |
|
4001
|
1
|
|
|
|
|
25
|
(undef, $dist_rev_dict) = huffman_from_code_lengths([(5) x 32]); |
|
4002
|
|
|
|
|
|
|
} |
|
4003
|
|
|
|
|
|
|
|
|
4004
|
71
|
|
|
|
|
368
|
_deflate_decode_huffman($in_fh, $buffer, $rev_dict, $dist_rev_dict, $search_window); |
|
4005
|
|
|
|
|
|
|
} |
|
4006
|
|
|
|
|
|
|
|
|
4007
|
24
|
|
|
24
|
|
56
|
sub _decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $size) { |
|
|
24
|
|
|
|
|
63
|
|
|
|
24
|
|
|
|
|
44
|
|
|
|
24
|
|
|
|
|
46
|
|
|
|
24
|
|
|
|
|
46
|
|
|
|
24
|
|
|
|
|
37
|
|
|
4008
|
|
|
|
|
|
|
|
|
4009
|
24
|
|
|
|
|
43
|
my @lengths; |
|
4010
|
24
|
|
|
|
|
50
|
my $code = ''; |
|
4011
|
|
|
|
|
|
|
|
|
4012
|
24
|
|
|
|
|
45
|
while (1) { |
|
4013
|
4309
|
|
|
|
|
13297
|
$code .= read_bit_lsb($in_fh, $buffer); |
|
4014
|
|
|
|
|
|
|
|
|
4015
|
4309
|
50
|
|
|
|
9796
|
if (length($code) > 7) { |
|
4016
|
0
|
|
|
|
|
0
|
confess "[!] Something went wrong: length of CL code `$code` is > 7."; |
|
4017
|
|
|
|
|
|
|
} |
|
4018
|
|
|
|
|
|
|
|
|
4019
|
4309
|
100
|
|
|
|
10127
|
if (exists($CL_rev_dict->{$code})) { |
|
4020
|
1422
|
|
|
|
|
2777
|
my $CL_symbol = $CL_rev_dict->{$code}; |
|
4021
|
|
|
|
|
|
|
|
|
4022
|
1422
|
100
|
|
|
|
2978
|
if ($CL_symbol <= 15) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4023
|
1340
|
|
|
|
|
2794
|
push @lengths, $CL_symbol; |
|
4024
|
|
|
|
|
|
|
} |
|
4025
|
|
|
|
|
|
|
elsif ($CL_symbol == 16) { |
|
4026
|
17
|
|
|
|
|
82
|
push @lengths, ($lengths[-1]) x (3 + bits2int_lsb($in_fh, 2, $buffer)); |
|
4027
|
|
|
|
|
|
|
} |
|
4028
|
|
|
|
|
|
|
elsif ($CL_symbol == 17) { |
|
4029
|
42
|
|
|
|
|
121
|
push @lengths, (0) x (3 + bits2int_lsb($in_fh, 3, $buffer)); |
|
4030
|
|
|
|
|
|
|
} |
|
4031
|
|
|
|
|
|
|
elsif ($CL_symbol == 18) { |
|
4032
|
23
|
|
|
|
|
80
|
push @lengths, (0) x (11 + bits2int_lsb($in_fh, 7, $buffer)); |
|
4033
|
|
|
|
|
|
|
} |
|
4034
|
|
|
|
|
|
|
else { |
|
4035
|
0
|
|
|
|
|
0
|
confess "Unknown CL symbol: $CL_symbol"; |
|
4036
|
|
|
|
|
|
|
} |
|
4037
|
|
|
|
|
|
|
|
|
4038
|
1422
|
|
|
|
|
2271
|
$code = ''; |
|
4039
|
1422
|
100
|
|
|
|
4525
|
last if (scalar(@lengths) >= $size); |
|
4040
|
|
|
|
|
|
|
} |
|
4041
|
|
|
|
|
|
|
} |
|
4042
|
|
|
|
|
|
|
|
|
4043
|
24
|
50
|
|
|
|
85
|
if (scalar(@lengths) != $size) { |
|
4044
|
0
|
|
|
|
|
0
|
confess "Something went wrong: size $size (expected) != ", scalar(@lengths); |
|
4045
|
|
|
|
|
|
|
} |
|
4046
|
|
|
|
|
|
|
|
|
4047
|
24
|
50
|
|
|
|
70
|
if ($code ne '') { |
|
4048
|
0
|
|
|
|
|
0
|
confess "Something went wrong: code `$code` is not empty!"; |
|
4049
|
|
|
|
|
|
|
} |
|
4050
|
|
|
|
|
|
|
|
|
4051
|
24
|
|
|
|
|
1232
|
return @lengths; |
|
4052
|
|
|
|
|
|
|
} |
|
4053
|
|
|
|
|
|
|
|
|
4054
|
12
|
|
|
12
|
1
|
1282
|
sub deflate_extract_block_type_2 ($in_fh, $buffer, $search_window) { |
|
|
12
|
|
|
|
|
25
|
|
|
|
12
|
|
|
|
|
27
|
|
|
|
12
|
|
|
|
|
21
|
|
|
|
12
|
|
|
|
|
28
|
|
|
4055
|
|
|
|
|
|
|
|
|
4056
|
12
|
50
|
|
|
|
76
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
4057
|
12
|
|
|
|
|
32
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
4058
|
12
|
|
|
|
|
32
|
local $LZ_MAX_DIST = 32768; # maximum allowed back-reference distance in LZ parsing |
|
4059
|
|
|
|
|
|
|
|
|
4060
|
|
|
|
|
|
|
# (5 bits) HLIT = (number of LL code entries present) - 257 |
|
4061
|
12
|
|
|
|
|
38
|
my $HLIT = bits2int_lsb($in_fh, 5, $buffer) + 257; |
|
4062
|
|
|
|
|
|
|
|
|
4063
|
|
|
|
|
|
|
# (5 bits) HDIST = (number of distance code entries present) - 1 |
|
4064
|
12
|
|
|
|
|
59
|
my $HDIST = bits2int_lsb($in_fh, 5, $buffer) + 1; |
|
4065
|
|
|
|
|
|
|
|
|
4066
|
|
|
|
|
|
|
# (4 bits) HCLEN = (number of CL code entries present) - 4 |
|
4067
|
12
|
|
|
|
|
36
|
my $HCLEN = bits2int_lsb($in_fh, 4, $buffer) + 4; |
|
4068
|
|
|
|
|
|
|
|
|
4069
|
12
|
50
|
|
|
|
43
|
$VERBOSE && say STDERR ":: Number of LL codes: $HLIT"; |
|
4070
|
12
|
50
|
|
|
|
38
|
$VERBOSE && say STDERR ":: Number of dist codes: $HDIST"; |
|
4071
|
12
|
50
|
|
|
|
48
|
$VERBOSE && say STDERR ":: Number of CL codes: $HCLEN"; |
|
4072
|
|
|
|
|
|
|
|
|
4073
|
12
|
|
|
|
|
65
|
my @CL_code_lenghts = (0) x 19; |
|
4074
|
12
|
|
|
|
|
68
|
my @CL_order = (16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15); |
|
4075
|
|
|
|
|
|
|
|
|
4076
|
12
|
|
|
|
|
46
|
foreach my $i (0 .. $HCLEN - 1) { |
|
4077
|
186
|
|
|
|
|
314
|
$CL_code_lenghts[$CL_order[$i]] = bits2int_lsb($in_fh, 3, $buffer); |
|
4078
|
|
|
|
|
|
|
} |
|
4079
|
|
|
|
|
|
|
|
|
4080
|
12
|
50
|
|
|
|
42
|
$VERBOSE && say STDERR ":: CL code lengths: @CL_code_lenghts"; |
|
4081
|
|
|
|
|
|
|
|
|
4082
|
12
|
|
|
|
|
52
|
my (undef, $CL_rev_dict) = huffman_from_code_lengths(\@CL_code_lenghts); |
|
4083
|
|
|
|
|
|
|
|
|
4084
|
12
|
|
|
|
|
87
|
my @LL_CL_lengths = _decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HLIT); |
|
4085
|
12
|
|
|
|
|
73
|
my @dist_CL_lengths = _decode_CL_lengths($in_fh, $buffer, $CL_rev_dict, $HDIST); |
|
4086
|
|
|
|
|
|
|
|
|
4087
|
12
|
|
|
|
|
63
|
my (undef, $LL_rev_dict) = huffman_from_code_lengths(\@LL_CL_lengths); |
|
4088
|
12
|
|
|
|
|
199
|
my (undef, $dist_rev_dict) = huffman_from_code_lengths(\@dist_CL_lengths); |
|
4089
|
|
|
|
|
|
|
|
|
4090
|
12
|
|
|
|
|
110
|
_deflate_decode_huffman($in_fh, $buffer, $LL_rev_dict, $dist_rev_dict, $search_window); |
|
4091
|
|
|
|
|
|
|
} |
|
4092
|
|
|
|
|
|
|
|
|
4093
|
84
|
|
|
84
|
1
|
240
|
sub deflate_extract_next_block ($in_fh, $buffer, $search_window) { |
|
|
84
|
|
|
|
|
146
|
|
|
|
84
|
|
|
|
|
199
|
|
|
|
84
|
|
|
|
|
148
|
|
|
|
84
|
|
|
|
|
211
|
|
|
4094
|
|
|
|
|
|
|
|
|
4095
|
84
|
50
|
|
|
|
382
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
4096
|
84
|
|
|
|
|
212
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
4097
|
84
|
|
|
|
|
198
|
local $LZ_MAX_DIST = 32768; # maximum allowed back-reference distance in LZ parsing |
|
4098
|
|
|
|
|
|
|
|
|
4099
|
84
|
|
|
|
|
410
|
my $block_type = bits2int_lsb($in_fh, 2, $buffer); |
|
4100
|
|
|
|
|
|
|
|
|
4101
|
84
|
|
|
|
|
200
|
my $chunk = ''; |
|
4102
|
|
|
|
|
|
|
|
|
4103
|
84
|
100
|
|
|
|
393
|
if ($block_type == 0) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
4104
|
3
|
50
|
|
|
|
16
|
$VERBOSE && say STDERR "\n:: Extracting block of type 0"; |
|
4105
|
3
|
|
|
|
|
19
|
$chunk = deflate_extract_block_type_0($in_fh, $buffer, $search_window); |
|
4106
|
|
|
|
|
|
|
} |
|
4107
|
|
|
|
|
|
|
elsif ($block_type == 1) { |
|
4108
|
70
|
50
|
|
|
|
203
|
$VERBOSE && say STDERR "\n:: Extracting block of type 1"; |
|
4109
|
70
|
|
|
|
|
301
|
$chunk = deflate_extract_block_type_1($in_fh, $buffer, $search_window); |
|
4110
|
|
|
|
|
|
|
} |
|
4111
|
|
|
|
|
|
|
elsif ($block_type == 2) { |
|
4112
|
11
|
50
|
|
|
|
63
|
$VERBOSE && say STDERR "\n:: Extracting block of type 2"; |
|
4113
|
11
|
|
|
|
|
74
|
$chunk = deflate_extract_block_type_2($in_fh, $buffer, $search_window); |
|
4114
|
|
|
|
|
|
|
} |
|
4115
|
|
|
|
|
|
|
else { |
|
4116
|
0
|
|
|
|
|
0
|
confess "[!] Unknown block of type: $block_type"; |
|
4117
|
|
|
|
|
|
|
} |
|
4118
|
|
|
|
|
|
|
|
|
4119
|
84
|
|
|
|
|
404
|
return $chunk; |
|
4120
|
|
|
|
|
|
|
} |
|
4121
|
|
|
|
|
|
|
|
|
4122
|
95
|
|
|
95
|
1
|
200
|
sub gzip_decompress ($in_fh) { |
|
|
95
|
|
|
|
|
179
|
|
|
|
95
|
|
|
|
|
147
|
|
|
4123
|
|
|
|
|
|
|
|
|
4124
|
95
|
100
|
|
|
|
309
|
if (ref($in_fh) eq '') { |
|
4125
|
46
|
50
|
|
|
|
856
|
open(my $fh2, '<:raw', \$in_fh) or confess "error: $!"; |
|
4126
|
46
|
|
|
|
|
189
|
return __SUB__->($fh2); |
|
4127
|
|
|
|
|
|
|
} |
|
4128
|
|
|
|
|
|
|
|
|
4129
|
49
|
|
|
|
|
122
|
my $decompressed = ''; |
|
4130
|
|
|
|
|
|
|
|
|
4131
|
49
|
|
|
|
|
370
|
open my $out_fh, '>:raw', \$decompressed; |
|
4132
|
|
|
|
|
|
|
|
|
4133
|
49
|
50
|
|
|
|
190
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
4134
|
49
|
|
|
|
|
121
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
4135
|
49
|
|
|
|
|
137
|
local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing |
|
4136
|
|
|
|
|
|
|
|
|
4137
|
49
|
|
33
|
|
|
536
|
my $MAGIC = (getc($in_fh) // confess "error") . (getc($in_fh) // confess "error"); |
|
|
|
|
33
|
|
|
|
|
|
4138
|
|
|
|
|
|
|
|
|
4139
|
49
|
50
|
|
|
|
229
|
if ($MAGIC ne pack('C*', 0x1f, 0x8b)) { |
|
4140
|
0
|
|
|
|
|
0
|
confess "Not a valid GZIP container!"; |
|
4141
|
|
|
|
|
|
|
} |
|
4142
|
|
|
|
|
|
|
|
|
4143
|
49
|
|
33
|
|
|
298
|
my $CM = getc($in_fh) // confess "error"; # 0x08 = DEFLATE |
|
4144
|
49
|
|
33
|
|
|
258
|
my $FLAGS = ord(getc($in_fh) // confess "error"); # flags |
|
4145
|
49
|
|
33
|
|
|
236
|
my $MTIME = join('', map { getc($in_fh) // confess "error" } 1 .. 4); # modification time |
|
|
196
|
|
|
|
|
1041
|
|
|
4146
|
49
|
|
33
|
|
|
399
|
my $XFLAGS = getc($in_fh) // confess "error"; # extra flags |
|
4147
|
49
|
|
33
|
|
|
229
|
my $OS = getc($in_fh) // confess "error"; # 0x03 = Unix |
|
4148
|
|
|
|
|
|
|
|
|
4149
|
49
|
50
|
|
|
|
225
|
if ($CM ne chr(0x08)) { |
|
4150
|
0
|
|
|
|
|
0
|
confess "Only DEFLATE compression method is supported (0x08)! Got: 0x", sprintf('%02x', ord($CM)); |
|
4151
|
|
|
|
|
|
|
} |
|
4152
|
|
|
|
|
|
|
|
|
4153
|
|
|
|
|
|
|
# Reference: |
|
4154
|
|
|
|
|
|
|
# https://web.archive.org/web/20240221024029/https://forensics.wiki/gzip/ |
|
4155
|
|
|
|
|
|
|
|
|
4156
|
49
|
|
|
|
|
124
|
my $has_filename = 0; |
|
4157
|
49
|
|
|
|
|
105
|
my $has_comment = 0; |
|
4158
|
49
|
|
|
|
|
86
|
my $has_header_checksum = 0; |
|
4159
|
49
|
|
|
|
|
115
|
my $has_extra_fields = 0; |
|
4160
|
|
|
|
|
|
|
|
|
4161
|
49
|
100
|
|
|
|
225
|
if ($FLAGS & 0x08) { |
|
4162
|
3
|
|
|
|
|
11
|
$has_filename = 1; |
|
4163
|
|
|
|
|
|
|
} |
|
4164
|
|
|
|
|
|
|
|
|
4165
|
49
|
100
|
|
|
|
207
|
if ($FLAGS & 0x10) { |
|
4166
|
2
|
|
|
|
|
7
|
$has_comment = 1; |
|
4167
|
|
|
|
|
|
|
} |
|
4168
|
|
|
|
|
|
|
|
|
4169
|
49
|
50
|
|
|
|
193
|
if ($FLAGS & 0x02) { |
|
4170
|
0
|
|
|
|
|
0
|
$has_header_checksum = 1; |
|
4171
|
|
|
|
|
|
|
} |
|
4172
|
|
|
|
|
|
|
|
|
4173
|
49
|
50
|
|
|
|
217
|
if ($FLAGS & 0x04) { |
|
4174
|
0
|
|
|
|
|
0
|
$has_extra_fields = 1; |
|
4175
|
|
|
|
|
|
|
} |
|
4176
|
|
|
|
|
|
|
|
|
4177
|
49
|
50
|
|
|
|
174
|
if ($has_extra_fields) { |
|
4178
|
0
|
|
|
|
|
0
|
my $size = bytes2int_lsb($in_fh, 2); |
|
4179
|
0
|
|
0
|
|
|
0
|
read($in_fh, (my $extra_field_data), $size) // confess "can't read extra field data: $!"; |
|
4180
|
0
|
0
|
|
|
|
0
|
$VERBOSE && say STDERR ":: Extra field data: $extra_field_data"; |
|
4181
|
|
|
|
|
|
|
} |
|
4182
|
|
|
|
|
|
|
|
|
4183
|
49
|
100
|
|
|
|
148
|
if ($has_filename) { |
|
4184
|
3
|
|
|
|
|
15
|
my $filename = read_null_terminated($in_fh); # filename |
|
4185
|
3
|
50
|
|
|
|
18
|
$VERBOSE && say STDERR ":: Filename: $filename"; |
|
4186
|
|
|
|
|
|
|
} |
|
4187
|
|
|
|
|
|
|
|
|
4188
|
49
|
100
|
|
|
|
136
|
if ($has_comment) { |
|
4189
|
2
|
|
|
|
|
9
|
my $comment = read_null_terminated($in_fh); # comment |
|
4190
|
2
|
50
|
|
|
|
11
|
$VERBOSE && say STDERR ":: Comment: $comment"; |
|
4191
|
|
|
|
|
|
|
} |
|
4192
|
|
|
|
|
|
|
|
|
4193
|
|
|
|
|
|
|
# TODO: verify the header checksum |
|
4194
|
49
|
50
|
|
|
|
166
|
if ($has_header_checksum) { |
|
4195
|
0
|
|
|
|
|
0
|
my $header_checksum = bytes2int_lsb($in_fh, 2); |
|
4196
|
0
|
0
|
|
|
|
0
|
$VERBOSE && say STDERR ":: Header checksum: $header_checksum"; |
|
4197
|
|
|
|
|
|
|
} |
|
4198
|
|
|
|
|
|
|
|
|
4199
|
49
|
|
|
|
|
124
|
my $crc32 = 0; |
|
4200
|
49
|
|
|
|
|
117
|
my $actual_length = 0; |
|
4201
|
49
|
|
|
|
|
98
|
my $buffer = ''; |
|
4202
|
49
|
|
|
|
|
89
|
my $search_window = ''; |
|
4203
|
|
|
|
|
|
|
|
|
4204
|
49
|
|
|
|
|
93
|
while (1) { |
|
4205
|
|
|
|
|
|
|
|
|
4206
|
51
|
|
|
|
|
203
|
my $is_last = read_bit_lsb($in_fh, \$buffer); |
|
4207
|
51
|
|
|
|
|
225
|
my $chunk = deflate_extract_next_block($in_fh, \$buffer, \$search_window); |
|
4208
|
|
|
|
|
|
|
|
|
4209
|
51
|
|
|
|
|
438
|
print $out_fh $chunk; |
|
4210
|
51
|
|
|
|
|
224
|
$crc32 = crc32($chunk, $crc32); |
|
4211
|
51
|
|
|
|
|
199
|
$actual_length += length($chunk); |
|
4212
|
|
|
|
|
|
|
|
|
4213
|
51
|
100
|
|
|
|
247
|
last if $is_last; |
|
4214
|
|
|
|
|
|
|
} |
|
4215
|
|
|
|
|
|
|
|
|
4216
|
49
|
|
|
|
|
409
|
$buffer = ''; # discard any padding bits |
|
4217
|
|
|
|
|
|
|
|
|
4218
|
49
|
|
|
|
|
193
|
my $stored_crc32 = bits2int_lsb($in_fh, 32, \$buffer); |
|
4219
|
49
|
|
|
|
|
168
|
my $actual_crc32 = $crc32; |
|
4220
|
|
|
|
|
|
|
|
|
4221
|
49
|
50
|
|
|
|
214
|
if ($stored_crc32 != $actual_crc32) { |
|
4222
|
0
|
|
|
|
|
0
|
confess "[!] The CRC32 does not match: $actual_crc32 (actual) != $stored_crc32 (stored)"; |
|
4223
|
|
|
|
|
|
|
} |
|
4224
|
|
|
|
|
|
|
else { |
|
4225
|
49
|
50
|
|
|
|
188
|
$VERBOSE && print STDERR ":: CRC32 value: $actual_crc32\n"; |
|
4226
|
|
|
|
|
|
|
} |
|
4227
|
|
|
|
|
|
|
|
|
4228
|
49
|
|
|
|
|
159
|
my $stored_length = bits2int_lsb($in_fh, 32, \$buffer); |
|
4229
|
|
|
|
|
|
|
|
|
4230
|
49
|
50
|
|
|
|
216
|
if ($stored_length != $actual_length) { |
|
4231
|
0
|
|
|
|
|
0
|
confess "[!] The length does not match: $actual_length (actual) != $stored_length (stored)"; |
|
4232
|
|
|
|
|
|
|
} |
|
4233
|
|
|
|
|
|
|
else { |
|
4234
|
49
|
50
|
|
|
|
179
|
$VERBOSE && print STDERR ":: Total length: $actual_length\n"; |
|
4235
|
|
|
|
|
|
|
} |
|
4236
|
|
|
|
|
|
|
|
|
4237
|
49
|
100
|
|
|
|
215
|
if (eof($in_fh)) { |
|
4238
|
46
|
50
|
|
|
|
137
|
$VERBOSE && print STDERR "\n:: Reached the end of the file.\n"; |
|
4239
|
|
|
|
|
|
|
} |
|
4240
|
|
|
|
|
|
|
else { |
|
4241
|
3
|
50
|
|
|
|
16
|
$VERBOSE && print STDERR "\n:: There is something else in the container! Trying to recurse!\n\n"; |
|
4242
|
3
|
|
|
|
|
24
|
return ($decompressed . __SUB__->($in_fh)); |
|
4243
|
|
|
|
|
|
|
} |
|
4244
|
|
|
|
|
|
|
|
|
4245
|
46
|
|
|
|
|
1426
|
return $decompressed; |
|
4246
|
|
|
|
|
|
|
} |
|
4247
|
|
|
|
|
|
|
|
|
4248
|
|
|
|
|
|
|
############################### |
|
4249
|
|
|
|
|
|
|
# ZLIB compressor |
|
4250
|
|
|
|
|
|
|
############################### |
|
4251
|
|
|
|
|
|
|
|
|
4252
|
52
|
|
|
52
|
1
|
264353
|
sub zlib_compress ($in_fh, $lzss_encoding_sub = \&lzss_encode) { |
|
|
52
|
|
|
|
|
151
|
|
|
|
52
|
|
|
|
|
176
|
|
|
|
52
|
|
|
|
|
101
|
|
|
4253
|
|
|
|
|
|
|
|
|
4254
|
52
|
100
|
|
|
|
202
|
if (ref($in_fh) eq '') { |
|
4255
|
26
|
50
|
|
|
|
413
|
open(my $fh2, '<:raw', \$in_fh) or confess "error: $!"; |
|
4256
|
26
|
|
|
|
|
99
|
return __SUB__->($fh2, $lzss_encoding_sub); |
|
4257
|
|
|
|
|
|
|
} |
|
4258
|
|
|
|
|
|
|
|
|
4259
|
26
|
|
|
|
|
91
|
my $compressed = ''; |
|
4260
|
|
|
|
|
|
|
|
|
4261
|
26
|
|
|
|
|
218
|
open my $out_fh, '>:raw', \$compressed; |
|
4262
|
|
|
|
|
|
|
|
|
4263
|
26
|
50
|
|
|
|
106
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
4264
|
26
|
|
|
|
|
65
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
4265
|
26
|
|
|
|
|
65
|
local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsing |
|
4266
|
|
|
|
|
|
|
|
|
4267
|
26
|
|
|
|
|
67
|
my $CMF = (7 << 4) | 8; |
|
4268
|
26
|
|
|
|
|
57
|
my $FLG = 2 << 6; |
|
4269
|
|
|
|
|
|
|
|
|
4270
|
26
|
|
|
|
|
133
|
while (($CMF * 256 + $FLG) % 31 != 0) { |
|
4271
|
728
|
|
|
|
|
1518
|
++$FLG; |
|
4272
|
|
|
|
|
|
|
} |
|
4273
|
|
|
|
|
|
|
|
|
4274
|
26
|
|
|
|
|
63
|
my $bitstring = ''; |
|
4275
|
26
|
|
|
|
|
50
|
my $adler32 = 1; |
|
4276
|
|
|
|
|
|
|
|
|
4277
|
26
|
|
|
|
|
98
|
print $out_fh chr($CMF); |
|
4278
|
26
|
|
|
|
|
83
|
print $out_fh chr($FLG); |
|
4279
|
|
|
|
|
|
|
|
|
4280
|
26
|
100
|
|
|
|
106
|
if (eof($in_fh)) { # empty file |
|
4281
|
2
|
|
|
|
|
5
|
$bitstring = '1' . '10' . '0000000'; |
|
4282
|
|
|
|
|
|
|
} |
|
4283
|
|
|
|
|
|
|
|
|
4284
|
26
|
|
|
|
|
55
|
state $CHUNK_SIZE = (1 << 15) - 1; |
|
4285
|
|
|
|
|
|
|
|
|
4286
|
26
|
|
|
|
|
202
|
while (read($in_fh, (my $chunk), $CHUNK_SIZE)) { |
|
4287
|
|
|
|
|
|
|
|
|
4288
|
24
|
|
|
|
|
102
|
$adler32 = adler32($chunk, $adler32); |
|
4289
|
24
|
50
|
|
|
|
141
|
$bitstring .= eof($in_fh) ? '1' : '0'; |
|
4290
|
|
|
|
|
|
|
|
|
4291
|
24
|
|
|
|
|
121
|
my ($literals, $distances, $lengths) = $lzss_encoding_sub->($chunk); |
|
4292
|
|
|
|
|
|
|
|
|
4293
|
24
|
|
|
|
|
134
|
my $bt1_bitstring = deflate_create_block_type_1($literals, $distances, $lengths); |
|
4294
|
|
|
|
|
|
|
|
|
4295
|
|
|
|
|
|
|
# When block type 1 is larger than the input, then we have random uncompressible data: use block type 0 |
|
4296
|
24
|
50
|
|
|
|
246
|
if ((length($bt1_bitstring) >> 3) > length($chunk) + 5) { |
|
4297
|
|
|
|
|
|
|
|
|
4298
|
0
|
0
|
|
|
|
0
|
$VERBOSE && say STDERR ":: Using block type: 0"; |
|
4299
|
|
|
|
|
|
|
|
|
4300
|
0
|
|
|
|
|
0
|
$bitstring .= '00'; |
|
4301
|
|
|
|
|
|
|
|
|
4302
|
0
|
|
|
|
|
0
|
print $out_fh pack('b*', $bitstring); # pads to a byte |
|
4303
|
0
|
|
|
|
|
0
|
print $out_fh pack('b*', deflate_create_block_type_0_header($chunk)); |
|
4304
|
0
|
|
|
|
|
0
|
print $out_fh $chunk; |
|
4305
|
|
|
|
|
|
|
|
|
4306
|
0
|
|
|
|
|
0
|
$bitstring = ''; |
|
4307
|
0
|
|
|
|
|
0
|
next; |
|
4308
|
|
|
|
|
|
|
} |
|
4309
|
|
|
|
|
|
|
|
|
4310
|
24
|
|
|
|
|
110
|
my $bt2_bitstring = deflate_create_block_type_2($literals, $distances, $lengths); |
|
4311
|
|
|
|
|
|
|
|
|
4312
|
|
|
|
|
|
|
# When block type 2 is larger than block type 1, then we may have very small data |
|
4313
|
24
|
100
|
|
|
|
252
|
if (length($bt2_bitstring) > length($bt1_bitstring)) { |
|
4314
|
23
|
50
|
|
|
|
99
|
$VERBOSE && say STDERR ":: Using block type: 1"; |
|
4315
|
23
|
|
|
|
|
93
|
$bitstring .= $bt1_bitstring; |
|
4316
|
|
|
|
|
|
|
} |
|
4317
|
|
|
|
|
|
|
else { |
|
4318
|
1
|
50
|
|
|
|
5
|
$VERBOSE && say STDERR ":: Using block type: 2"; |
|
4319
|
1
|
|
|
|
|
30
|
$bitstring .= $bt2_bitstring; |
|
4320
|
|
|
|
|
|
|
} |
|
4321
|
|
|
|
|
|
|
|
|
4322
|
24
|
|
|
|
|
687
|
print $out_fh pack('b*', substr($bitstring, 0, length($bitstring) - (length($bitstring) % 8), '')); |
|
4323
|
|
|
|
|
|
|
} |
|
4324
|
|
|
|
|
|
|
|
|
4325
|
26
|
100
|
|
|
|
143
|
if ($bitstring ne '') { |
|
4326
|
24
|
|
|
|
|
111
|
print $out_fh pack('b*', $bitstring); |
|
4327
|
|
|
|
|
|
|
} |
|
4328
|
|
|
|
|
|
|
|
|
4329
|
26
|
|
|
|
|
118
|
print $out_fh int2bytes($adler32, 4); |
|
4330
|
|
|
|
|
|
|
|
|
4331
|
26
|
|
|
|
|
432
|
return $compressed; |
|
4332
|
|
|
|
|
|
|
} |
|
4333
|
|
|
|
|
|
|
|
|
4334
|
|
|
|
|
|
|
############################### |
|
4335
|
|
|
|
|
|
|
# ZLIB decompressor |
|
4336
|
|
|
|
|
|
|
############################### |
|
4337
|
|
|
|
|
|
|
|
|
4338
|
59
|
|
|
59
|
1
|
124
|
sub zlib_decompress($in_fh) { |
|
|
59
|
|
|
|
|
152
|
|
|
|
59
|
|
|
|
|
104
|
|
|
4339
|
|
|
|
|
|
|
|
|
4340
|
59
|
100
|
|
|
|
191
|
if (ref($in_fh) eq '') { |
|
4341
|
29
|
50
|
|
|
|
398
|
open(my $fh2, '<:raw', \$in_fh) or confess "error: $!"; |
|
4342
|
29
|
|
|
|
|
155
|
return __SUB__->($fh2); |
|
4343
|
|
|
|
|
|
|
} |
|
4344
|
|
|
|
|
|
|
|
|
4345
|
30
|
|
|
|
|
91
|
my $decompressed = ''; |
|
4346
|
|
|
|
|
|
|
|
|
4347
|
30
|
|
|
|
|
240
|
open my $out_fh, '>:raw', \$decompressed; |
|
4348
|
|
|
|
|
|
|
|
|
4349
|
30
|
50
|
|
|
|
129
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); # minimum match length in LZ parsing |
|
4350
|
30
|
|
|
|
|
67
|
local $LZ_MAX_LEN = 258; # maximum match length in LZ parsing |
|
4351
|
30
|
|
|
|
|
89
|
local $LZ_MAX_DIST = (1 << 15) - 1; # maximum allowed back-reference distance in LZ parsin |
|
4352
|
|
|
|
|
|
|
|
|
4353
|
30
|
|
|
|
|
66
|
my $adler32 = 1; |
|
4354
|
|
|
|
|
|
|
|
|
4355
|
30
|
|
|
|
|
148
|
my $CMF = ord(getc($in_fh)); |
|
4356
|
30
|
|
|
|
|
128
|
my $FLG = ord(getc($in_fh)); |
|
4357
|
|
|
|
|
|
|
|
|
4358
|
30
|
50
|
|
|
|
190
|
if (($CMF * 256 + $FLG) % 31 != 0) { |
|
4359
|
0
|
|
|
|
|
0
|
confess "Invalid header checksum!\n"; |
|
4360
|
|
|
|
|
|
|
} |
|
4361
|
|
|
|
|
|
|
|
|
4362
|
30
|
|
|
|
|
103
|
my $CINFO = $CMF >> 4; |
|
4363
|
|
|
|
|
|
|
|
|
4364
|
30
|
50
|
|
|
|
126
|
if ($CINFO > 7) { |
|
4365
|
0
|
|
|
|
|
0
|
confess "Values of CINFO above 7 are not supported!\n"; |
|
4366
|
|
|
|
|
|
|
} |
|
4367
|
|
|
|
|
|
|
|
|
4368
|
30
|
|
|
|
|
115
|
my $method = $CMF & 0b1111; |
|
4369
|
|
|
|
|
|
|
|
|
4370
|
30
|
50
|
|
|
|
111
|
if ($method != 8) { |
|
4371
|
0
|
|
|
|
|
0
|
confess "Only method 8 (DEFLATE) is supported!\n"; |
|
4372
|
|
|
|
|
|
|
} |
|
4373
|
|
|
|
|
|
|
|
|
4374
|
30
|
|
|
|
|
92
|
my $buffer = ''; |
|
4375
|
30
|
|
|
|
|
65
|
my $search_window = ''; |
|
4376
|
|
|
|
|
|
|
|
|
4377
|
30
|
|
|
|
|
62
|
while (1) { |
|
4378
|
|
|
|
|
|
|
|
|
4379
|
30
|
|
|
|
|
124
|
my $is_last = read_bit_lsb($in_fh, \$buffer); |
|
4380
|
30
|
|
|
|
|
178
|
my $chunk = deflate_extract_next_block($in_fh, \$buffer, \$search_window); |
|
4381
|
|
|
|
|
|
|
|
|
4382
|
30
|
|
|
|
|
143
|
print $out_fh $chunk; |
|
4383
|
30
|
|
|
|
|
135
|
$adler32 = adler32($chunk, $adler32); |
|
4384
|
|
|
|
|
|
|
|
|
4385
|
30
|
50
|
|
|
|
183
|
last if $is_last; |
|
4386
|
|
|
|
|
|
|
} |
|
4387
|
|
|
|
|
|
|
|
|
4388
|
30
|
|
|
|
|
120
|
my $stored_adler32 = bytes2int($in_fh, 4); |
|
4389
|
|
|
|
|
|
|
|
|
4390
|
30
|
50
|
|
|
|
487
|
if ($adler32 != $stored_adler32) { |
|
4391
|
0
|
|
|
|
|
0
|
confess "Adler32 checksum does not match: $adler32 (actual) != $stored_adler32 (stored)\n"; |
|
4392
|
|
|
|
|
|
|
} |
|
4393
|
|
|
|
|
|
|
|
|
4394
|
30
|
100
|
|
|
|
130
|
if (eof($in_fh)) { |
|
4395
|
29
|
50
|
|
|
|
95
|
$VERBOSE && print STDERR "\n:: Reached the end of the file.\n"; |
|
4396
|
|
|
|
|
|
|
} |
|
4397
|
|
|
|
|
|
|
else { |
|
4398
|
1
|
50
|
|
|
|
6
|
$VERBOSE && print STDERR "\n:: There is something else in the container! Trying to recurse!\n\n"; |
|
4399
|
1
|
|
|
|
|
11
|
return ($decompressed . __SUB__->($in_fh)); |
|
4400
|
|
|
|
|
|
|
} |
|
4401
|
|
|
|
|
|
|
|
|
4402
|
29
|
|
|
|
|
609
|
return $decompressed; |
|
4403
|
|
|
|
|
|
|
} |
|
4404
|
|
|
|
|
|
|
|
|
4405
|
|
|
|
|
|
|
############################### |
|
4406
|
|
|
|
|
|
|
# LZ4 compressor |
|
4407
|
|
|
|
|
|
|
############################### |
|
4408
|
|
|
|
|
|
|
|
|
4409
|
56
|
|
|
56
|
1
|
365924
|
sub lz4_compress($fh, $lzss_encoding_sub = \&lzss_encode) { |
|
|
56
|
|
|
|
|
131
|
|
|
|
56
|
|
|
|
|
130
|
|
|
|
56
|
|
|
|
|
145
|
|
|
4410
|
|
|
|
|
|
|
|
|
4411
|
56
|
100
|
|
|
|
249
|
if (ref($fh) eq '') { |
|
4412
|
28
|
50
|
|
|
|
631
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
4413
|
28
|
|
|
|
|
177
|
return __SUB__->($fh2, $lzss_encoding_sub); |
|
4414
|
|
|
|
|
|
|
} |
|
4415
|
|
|
|
|
|
|
|
|
4416
|
28
|
|
|
|
|
70
|
my $compressed = ''; |
|
4417
|
|
|
|
|
|
|
|
|
4418
|
28
|
|
|
|
|
114
|
$compressed .= int2bytes_lsb(0x184D2204, 4); # LZ4 magic number |
|
4419
|
|
|
|
|
|
|
|
|
4420
|
28
|
|
|
|
|
91
|
my $fd = ''; # frame description |
|
4421
|
28
|
|
|
|
|
64
|
$fd .= chr(0b01_10_00_00); # flags (FLG) |
|
4422
|
28
|
|
|
|
|
47
|
$fd .= chr(0b0_111_0000); # block description (BD) |
|
4423
|
|
|
|
|
|
|
|
|
4424
|
28
|
|
|
|
|
76
|
$compressed .= $fd; |
|
4425
|
28
|
|
|
|
|
95
|
$compressed .= chr(115); # header checksum |
|
4426
|
|
|
|
|
|
|
|
|
4427
|
28
|
|
|
|
|
94
|
state $CHUNK_SIZE = 1 << 17; |
|
4428
|
|
|
|
|
|
|
|
|
4429
|
28
|
|
|
|
|
273
|
while (read($fh, (my $chunk), $CHUNK_SIZE)) { |
|
4430
|
|
|
|
|
|
|
|
|
4431
|
26
|
|
|
|
|
66
|
my ($literals, $distances, $lengths) = do { |
|
4432
|
26
|
50
|
|
|
|
104
|
local $LZ_MIN_LEN = 4 if ($LZ_MIN_LEN < 4); |
|
4433
|
26
|
|
|
|
|
66
|
local $LZ_MAX_LEN = ~0; |
|
4434
|
26
|
|
|
|
|
48
|
local $LZ_MAX_DIST = (1 << 16) - 1; |
|
4435
|
26
|
|
|
|
|
127
|
$lzss_encoding_sub->(substr($chunk, 0, -5)); |
|
4436
|
|
|
|
|
|
|
}; |
|
4437
|
|
|
|
|
|
|
|
|
4438
|
|
|
|
|
|
|
# The last 5 bytes of each block must be literals |
|
4439
|
|
|
|
|
|
|
# https://github.com/lz4/lz4/issues/1495 |
|
4440
|
26
|
|
|
|
|
275
|
push @$literals, unpack('C*', substr($chunk, -5)); |
|
4441
|
|
|
|
|
|
|
|
|
4442
|
26
|
|
|
|
|
78
|
my $literals_end = $#{$literals}; |
|
|
26
|
|
|
|
|
96
|
|
|
4443
|
|
|
|
|
|
|
|
|
4444
|
26
|
|
|
|
|
68
|
my $block = ''; |
|
4445
|
|
|
|
|
|
|
|
|
4446
|
26
|
|
|
|
|
103
|
for (my $i = 0 ; $i <= $literals_end ; ++$i) { |
|
4447
|
|
|
|
|
|
|
|
|
4448
|
3594
|
|
|
|
|
5996
|
my @uncompressed; |
|
4449
|
3594
|
|
100
|
|
|
18140
|
while ($i <= $literals_end and defined($literals->[$i])) { |
|
4450
|
5577
|
|
|
|
|
14030
|
push @uncompressed, $literals->[$i]; |
|
4451
|
5577
|
|
|
|
|
27735
|
++$i; |
|
4452
|
|
|
|
|
|
|
} |
|
4453
|
|
|
|
|
|
|
|
|
4454
|
3594
|
|
|
|
|
9090
|
my $literals_string = pack('C*', @uncompressed); |
|
4455
|
3594
|
|
|
|
|
6670
|
my $literals_length = scalar(@uncompressed); |
|
4456
|
|
|
|
|
|
|
|
|
4457
|
3594
|
100
|
|
|
|
9154
|
my $match_len = $lengths->[$i] ? ($lengths->[$i] - 4) : 0; |
|
4458
|
|
|
|
|
|
|
|
|
4459
|
3594
|
100
|
|
|
|
11381
|
$block .= chr((($literals_length >= 15 ? 15 : $literals_length) << 4) | ($match_len >= 15 ? 15 : $match_len)); |
|
|
|
100
|
|
|
|
|
|
|
4460
|
|
|
|
|
|
|
|
|
4461
|
3594
|
|
|
|
|
5959
|
$literals_length -= 15; |
|
4462
|
3594
|
|
|
|
|
5834
|
$match_len -= 15; |
|
4463
|
|
|
|
|
|
|
|
|
4464
|
3594
|
|
|
|
|
8757
|
while ($literals_length >= 0) { |
|
4465
|
72
|
50
|
|
|
|
268
|
$block .= ($literals_length >= 255 ? "\xff" : chr($literals_length)); |
|
4466
|
72
|
|
|
|
|
236
|
$literals_length -= 255; |
|
4467
|
|
|
|
|
|
|
} |
|
4468
|
|
|
|
|
|
|
|
|
4469
|
3594
|
|
|
|
|
9287
|
$block .= $literals_string; |
|
4470
|
|
|
|
|
|
|
|
|
4471
|
3594
|
|
100
|
|
|
9337
|
my $dist = $distances->[$i] // last; |
|
4472
|
3568
|
|
|
|
|
12009
|
$block .= pack('b*', scalar reverse sprintf('%016b', $dist)); |
|
4473
|
|
|
|
|
|
|
|
|
4474
|
3568
|
|
|
|
|
13594
|
while ($match_len >= 0) { |
|
4475
|
995
|
100
|
|
|
|
2790
|
$block .= ($match_len >= 255 ? "\xff" : chr($match_len)); |
|
4476
|
995
|
|
|
|
|
4217
|
$match_len -= 255; |
|
4477
|
|
|
|
|
|
|
} |
|
4478
|
|
|
|
|
|
|
} |
|
4479
|
|
|
|
|
|
|
|
|
4480
|
26
|
50
|
|
|
|
94
|
if ($block ne '') { |
|
4481
|
26
|
|
|
|
|
138
|
$compressed .= int2bytes_lsb(length($block), 4); |
|
4482
|
26
|
|
|
|
|
3455
|
$compressed .= $block; |
|
4483
|
|
|
|
|
|
|
} |
|
4484
|
|
|
|
|
|
|
} |
|
4485
|
|
|
|
|
|
|
|
|
4486
|
28
|
|
|
|
|
99
|
$compressed .= int2bytes_lsb(0x00000000, 4); # EndMark |
|
4487
|
28
|
|
|
|
|
610
|
return $compressed; |
|
4488
|
|
|
|
|
|
|
} |
|
4489
|
|
|
|
|
|
|
|
|
4490
|
|
|
|
|
|
|
############################### |
|
4491
|
|
|
|
|
|
|
# LZ4 decompressor |
|
4492
|
|
|
|
|
|
|
############################### |
|
4493
|
|
|
|
|
|
|
|
|
4494
|
94
|
|
|
94
|
1
|
208
|
sub lz4_decompress($fh) { |
|
|
94
|
|
|
|
|
193
|
|
|
|
94
|
|
|
|
|
213
|
|
|
4495
|
|
|
|
|
|
|
|
|
4496
|
94
|
100
|
|
|
|
348
|
if (ref($fh) eq '') { |
|
4497
|
47
|
50
|
|
|
|
682
|
open(my $fh2, '<:raw', \$fh) or confess "error: $!"; |
|
4498
|
47
|
|
|
|
|
184
|
return __SUB__->($fh2); |
|
4499
|
|
|
|
|
|
|
} |
|
4500
|
|
|
|
|
|
|
|
|
4501
|
47
|
|
|
|
|
135
|
my $decompressed = ''; |
|
4502
|
|
|
|
|
|
|
|
|
4503
|
47
|
|
|
|
|
192
|
while (!eof($fh)) { |
|
4504
|
|
|
|
|
|
|
|
|
4505
|
51
|
50
|
|
|
|
172
|
bytes2int_lsb($fh, 4) == 0x184D2204 or confess "Incorrect LZ4 Frame magic number"; |
|
4506
|
|
|
|
|
|
|
|
|
4507
|
51
|
|
|
|
|
232
|
my $FLG = ord(getc($fh)); |
|
4508
|
51
|
|
|
|
|
219
|
my $BD = ord(getc($fh)); |
|
4509
|
|
|
|
|
|
|
|
|
4510
|
51
|
|
|
|
|
198
|
my $version = $FLG & 0b11_00_00_00; |
|
4511
|
51
|
|
|
|
|
156
|
my $B_indep = $FLG & 0b00_10_00_00; |
|
4512
|
51
|
|
|
|
|
128
|
my $B_checksum = $FLG & 0b00_01_00_00; |
|
4513
|
51
|
|
|
|
|
125
|
my $C_size = $FLG & 0b00_00_10_00; |
|
4514
|
51
|
|
|
|
|
168
|
my $C_checksum = $FLG & 0b00_00_01_00; |
|
4515
|
51
|
|
|
|
|
143
|
my $DictID = $FLG & 0b00_00_00_01; |
|
4516
|
|
|
|
|
|
|
|
|
4517
|
51
|
|
|
|
|
139
|
my $Block_MaxSize = $BD & 0b0_111_0000; |
|
4518
|
|
|
|
|
|
|
|
|
4519
|
51
|
50
|
|
|
|
176
|
$VERBOSE && say STDERR "Maximum block size: $Block_MaxSize"; |
|
4520
|
|
|
|
|
|
|
|
|
4521
|
51
|
50
|
|
|
|
195
|
if ($version != 0b01_00_00_00) { |
|
4522
|
0
|
|
|
|
|
0
|
confess "Error: Invalid version number"; |
|
4523
|
|
|
|
|
|
|
} |
|
4524
|
|
|
|
|
|
|
|
|
4525
|
51
|
50
|
|
|
|
202
|
if ($C_size) { |
|
4526
|
0
|
|
|
|
|
0
|
my $content_size = bytes2int_lsb($fh, 8); |
|
4527
|
0
|
0
|
|
|
|
0
|
$VERBOSE && say STDERR "Content size: ", $content_size; |
|
4528
|
|
|
|
|
|
|
} |
|
4529
|
|
|
|
|
|
|
|
|
4530
|
51
|
50
|
|
|
|
168
|
if ($DictID) { |
|
4531
|
0
|
|
|
|
|
0
|
my $dict_id = bytes2int_lsb($fh, 4); |
|
4532
|
0
|
0
|
|
|
|
0
|
$VERBOSE && say STDERR "Dictionary ID: ", $dict_id; |
|
4533
|
|
|
|
|
|
|
} |
|
4534
|
|
|
|
|
|
|
|
|
4535
|
51
|
|
|
|
|
170
|
my $header_checksum = ord(getc($fh)); |
|
4536
|
|
|
|
|
|
|
|
|
4537
|
|
|
|
|
|
|
# TODO: compute and verify the header checksum |
|
4538
|
51
|
50
|
|
|
|
138
|
$VERBOSE && say STDERR "Header checksum: ", $header_checksum; |
|
4539
|
|
|
|
|
|
|
|
|
4540
|
51
|
|
|
|
|
109
|
my $decoded = ''; |
|
4541
|
|
|
|
|
|
|
|
|
4542
|
51
|
|
|
|
|
170
|
while (!eof($fh)) { |
|
4543
|
|
|
|
|
|
|
|
|
4544
|
98
|
|
|
|
|
268
|
my $block_size = bytes2int_lsb($fh, 4); |
|
4545
|
|
|
|
|
|
|
|
|
4546
|
98
|
100
|
|
|
|
410
|
if ($block_size == 0x00000000) { # signifies an EndMark |
|
4547
|
51
|
50
|
|
|
|
143
|
$VERBOSE && say STDERR "Block size == 0"; |
|
4548
|
51
|
|
|
|
|
138
|
last; |
|
4549
|
|
|
|
|
|
|
} |
|
4550
|
|
|
|
|
|
|
|
|
4551
|
47
|
50
|
|
|
|
133
|
$VERBOSE && say STDERR "Block size: $block_size"; |
|
4552
|
|
|
|
|
|
|
|
|
4553
|
47
|
100
|
|
|
|
180
|
if ($block_size >> 31) { |
|
4554
|
3
|
50
|
|
|
|
16
|
$VERBOSE && say STDERR "Highest bit set: ", $block_size; |
|
4555
|
3
|
|
|
|
|
8
|
$block_size &= ((1 << 31) - 1); |
|
4556
|
3
|
50
|
|
|
|
12
|
$VERBOSE && say STDERR "Block size: ", $block_size; |
|
4557
|
3
|
|
|
|
|
7
|
my $uncompressed = ''; |
|
4558
|
3
|
|
|
|
|
14
|
read($fh, $uncompressed, $block_size); |
|
4559
|
3
|
|
|
|
|
11
|
$decoded .= $uncompressed; |
|
4560
|
|
|
|
|
|
|
} |
|
4561
|
|
|
|
|
|
|
else { |
|
4562
|
|
|
|
|
|
|
|
|
4563
|
44
|
|
|
|
|
121
|
my $compressed = ''; |
|
4564
|
44
|
|
|
|
|
605
|
read($fh, $compressed, $block_size); |
|
4565
|
|
|
|
|
|
|
|
|
4566
|
44
|
|
|
|
|
173
|
while ($compressed ne '') { |
|
4567
|
3636
|
|
|
|
|
11117
|
my $len_byte = ord(substr($compressed, 0, 1, '')); |
|
4568
|
|
|
|
|
|
|
|
|
4569
|
3636
|
|
|
|
|
9270
|
my $literals_length = $len_byte >> 4; |
|
4570
|
3636
|
|
|
|
|
9238
|
my $match_len = $len_byte & 0b1111; |
|
4571
|
|
|
|
|
|
|
|
|
4572
|
|
|
|
|
|
|
## say STDERR "Literal: ", $literals_length; |
|
4573
|
|
|
|
|
|
|
## say STDERR "Match len: ", $match_len; |
|
4574
|
|
|
|
|
|
|
|
|
4575
|
3636
|
100
|
|
|
|
10353
|
if ($literals_length == 15) { |
|
4576
|
72
|
|
|
|
|
171
|
while (1) { |
|
4577
|
72
|
|
|
|
|
243
|
my $byte_len = ord(substr($compressed, 0, 1, '')); |
|
4578
|
72
|
|
|
|
|
274
|
$literals_length += $byte_len; |
|
4579
|
72
|
50
|
|
|
|
374
|
last if $byte_len != 255; |
|
4580
|
|
|
|
|
|
|
} |
|
4581
|
|
|
|
|
|
|
} |
|
4582
|
|
|
|
|
|
|
|
|
4583
|
|
|
|
|
|
|
## say STDERR "Total literals length: ", $literals_length; |
|
4584
|
|
|
|
|
|
|
|
|
4585
|
3636
|
|
|
|
|
6853
|
my $literals = ''; |
|
4586
|
|
|
|
|
|
|
|
|
4587
|
3636
|
100
|
|
|
|
9583
|
if ($literals_length > 0) { |
|
4588
|
1140
|
|
|
|
|
3137
|
$literals = substr($compressed, 0, $literals_length, ''); |
|
4589
|
|
|
|
|
|
|
} |
|
4590
|
|
|
|
|
|
|
|
|
4591
|
3636
|
100
|
|
|
|
10272
|
if ($compressed eq '') { # end of block |
|
4592
|
44
|
|
|
|
|
115
|
$decoded .= $literals; |
|
4593
|
44
|
|
|
|
|
187
|
last; |
|
4594
|
|
|
|
|
|
|
} |
|
4595
|
|
|
|
|
|
|
|
|
4596
|
3592
|
|
|
|
|
15916
|
my $offset = oct('0b' . reverse unpack('b16', substr($compressed, 0, 2, ''))); |
|
4597
|
|
|
|
|
|
|
|
|
4598
|
3592
|
50
|
|
|
|
11362
|
if ($offset == 0) { |
|
4599
|
0
|
|
|
|
|
0
|
confess "Corrupted block"; |
|
4600
|
|
|
|
|
|
|
} |
|
4601
|
|
|
|
|
|
|
|
|
4602
|
|
|
|
|
|
|
## say STDERR "Offset: $offset"; |
|
4603
|
|
|
|
|
|
|
|
|
4604
|
3592
|
100
|
|
|
|
9882
|
if ($match_len == 15) { |
|
4605
|
983
|
|
|
|
|
1888
|
while (1) { |
|
4606
|
995
|
|
|
|
|
2887
|
my $byte_len = ord(substr($compressed, 0, 1, '')); |
|
4607
|
995
|
|
|
|
|
2288
|
$match_len += $byte_len; |
|
4608
|
995
|
100
|
|
|
|
3724
|
last if $byte_len != 255; |
|
4609
|
|
|
|
|
|
|
} |
|
4610
|
|
|
|
|
|
|
} |
|
4611
|
|
|
|
|
|
|
|
|
4612
|
3592
|
|
|
|
|
8382
|
$decoded .= $literals; |
|
4613
|
3592
|
|
|
|
|
7631
|
$match_len += 4; |
|
4614
|
|
|
|
|
|
|
|
|
4615
|
|
|
|
|
|
|
## say STDERR "Total match len: $match_len\n"; |
|
4616
|
|
|
|
|
|
|
|
|
4617
|
3592
|
100
|
|
|
|
9857
|
if ($offset >= $match_len) { # non-overlapping matches |
|
|
|
100
|
|
|
|
|
|
|
4618
|
3447
|
|
|
|
|
22976
|
$decoded .= substr($decoded, length($decoded) - $offset, $match_len); |
|
4619
|
|
|
|
|
|
|
} |
|
4620
|
|
|
|
|
|
|
elsif ($offset == 1) { |
|
4621
|
25
|
|
|
|
|
194
|
$decoded .= substr($decoded, -1) x $match_len; |
|
4622
|
|
|
|
|
|
|
} |
|
4623
|
|
|
|
|
|
|
else { # overlapping matches |
|
4624
|
120
|
|
|
|
|
510
|
foreach my $i (1 .. $match_len) { |
|
4625
|
2438
|
|
|
|
|
9103
|
$decoded .= substr($decoded, length($decoded) - $offset, 1); |
|
4626
|
|
|
|
|
|
|
} |
|
4627
|
|
|
|
|
|
|
} |
|
4628
|
|
|
|
|
|
|
} |
|
4629
|
|
|
|
|
|
|
} |
|
4630
|
|
|
|
|
|
|
|
|
4631
|
47
|
100
|
|
|
|
169
|
if ($B_checksum) { |
|
4632
|
5
|
|
|
|
|
20
|
my $content_checksum = bytes2int_lsb($fh, 4); |
|
4633
|
5
|
50
|
|
|
|
26
|
$VERBOSE && say STDERR "Block checksum: $content_checksum"; |
|
4634
|
|
|
|
|
|
|
} |
|
4635
|
|
|
|
|
|
|
|
|
4636
|
47
|
50
|
|
|
|
151
|
if ($B_indep) { # blocks are independent of each other |
|
|
|
0
|
|
|
|
|
|
|
4637
|
47
|
|
|
|
|
197
|
$decompressed .= $decoded; |
|
4638
|
47
|
|
|
|
|
270
|
$decoded = ''; |
|
4639
|
|
|
|
|
|
|
} |
|
4640
|
|
|
|
|
|
|
elsif (length($decoded) > 2**16) { # blocks are dependent |
|
4641
|
0
|
|
|
|
|
0
|
$decompressed .= substr($decoded, 0, -(2**16), ''); |
|
4642
|
|
|
|
|
|
|
} |
|
4643
|
|
|
|
|
|
|
} |
|
4644
|
|
|
|
|
|
|
|
|
4645
|
|
|
|
|
|
|
# TODO: compute and verify checksum |
|
4646
|
51
|
100
|
|
|
|
160
|
if ($C_checksum) { |
|
4647
|
12
|
|
|
|
|
33
|
my $content_checksum = bytes2int_lsb($fh, 4); |
|
4648
|
12
|
50
|
|
|
|
49
|
$VERBOSE && say STDERR "Content checksum: $content_checksum"; |
|
4649
|
|
|
|
|
|
|
} |
|
4650
|
|
|
|
|
|
|
|
|
4651
|
51
|
|
|
|
|
424
|
$decompressed .= $decoded; |
|
4652
|
|
|
|
|
|
|
} |
|
4653
|
|
|
|
|
|
|
|
|
4654
|
47
|
|
|
|
|
608
|
return $decompressed; |
|
4655
|
|
|
|
|
|
|
} |
|
4656
|
|
|
|
|
|
|
|
|
4657
|
|
|
|
|
|
|
1; |
|
4658
|
|
|
|
|
|
|
|
|
4659
|
|
|
|
|
|
|
__END__ |