File Coverage

blib/lib/Gzip/BinarySearch.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package Gzip::BinarySearch;
2
3	9	9	291045	use strict;
	9		24
	9		291
4	9	9	47	use warnings;
	9		18
	9		465
5	9	9	46	use Carp;
	9		20
	9		931
6	9	9	52	use List::Util qw(min max);
	9		16
	9		1958
7	9	9	15683	use Gzip::RandomAccess;
	0
	0
8
9				our $VERSION = '0.91';
10
11				BEGIN {
12				use Exporter;
13				our @ISA = qw(Exporter);
14				our @EXPORT_OK = qw(fs_column tsv_column);
15				}
16
17				# When binary searching, assume this is the biggest line we'll find.
18				# We'll try this first, then try a bigger decompression if it fails.
19				my $DEFAULT_EST_LINE_LENGTH = 512;
20
21				# When looking for adjacent identical lines (find_all) extract this
22				# many bytes at a time.
23				my $DEFAULT_SURROUNDING_LINES_BLOCKSIZE = 4096;
24
25				my @GRA_ALLOWED_ARGS = qw(file index_file index_span cleanup);
26				my %ALLOWED_ARGS = map { $_ => 1 } (
27				@GRA_ALLOWED_ARGS, qw(key_func cmp_func),
28				qw(est_line_length surrounding_lines_blocksize),
29				);
30
31				sub new {
32				my ($class, %args) = @_;
33				my $gzip = $class->_build_gzip(%args);
34				for my $key (keys %args) {
35				$ALLOWED_ARGS{$key} or croak "Invalid argument '$key'";
36				}
37
38				my $key_func = $args{key_func} \|\| fs_column(qr/\s+/, 1);
39				my $cmp_func = $args{cmp_func} \|\| sub { $main::a cmp $main::b };
40
41				my $est_line_length = $args{est_line_length}
42				\|\| $DEFAULT_EST_LINE_LENGTH;
43
44				my $surrounding_lines_blocksize = $args{surrounding_lines_blocksize}
45				\|\| $DEFAULT_SURROUNDING_LINES_BLOCKSIZE;
46
47				bless {
48				gzip => $gzip,
49				max_offset => $gzip->uncompressed_size - 1,
50				est_line_length => $est_line_length,
51				surrounding_lines_blocksize => $surrounding_lines_blocksize,
52				key_func => $key_func,
53				cmp_func => $cmp_func,
54				}, $class;
55				}
56
57				sub find {
58				my ($self, $key) = @_;
59
60				my ($line, $mid) = $self->_find($key);
61				return $line if defined $line;
62				return;
63				}
64
65				sub find_all {
66				my ($self, $key) = @_;
67
68				my ($line, $mid) = $self->_find($key);
69				if (defined $line) {
70				return $self->_search_surrounding_lines($key, $line, $mid);
71				}
72				return;
73				}
74
75				sub gzip { shift->{gzip} }
76				sub est_line_length { shift->{est_line_length} }
77				sub surrounding_lines_blocksize { shift->{surrounding_lines_blocksize} }
78
79				# Convenience functions
80				sub fs_column {
81				my ($field_sep, $column_number) = @_;
82				croak "Invalid column number, should be 1-based"
83				if $column_number < 1;
84
85				if (!ref $field_sep && $field_sep eq ' ') {
86				# Force Perl to match a space, not \s+
87				$field_sep = qr/ /;
88				}
89
90				return sub {
91				chomp;
92				my @f = split $field_sep, $_, -1;
93				return $f[$column_number - 1];
94				}
95				}
96
97				sub tsv_column {
98				my $column_number = shift;
99				return fs_column("\t", $column_number);
100				}
101
102				# Given a key, binary search for the first line encountered with this key,
103				# and return both the line and its offset in the uncompressed data.
104				# Return nothing if the key was not found.
105				sub _find {
106				my ($self, $key) = @_;
107				my ($low, $high, $mid) = (0, $self->{max_offset}, undef);
108
109				while ($low <= $high) {
110				$mid = int(($low + $high) / 2);
111				my $line = $self->_get_line_at($mid);
112				my $line_key = $self->_get_key($line);
113				my $cmp = $self->_compare_keys($line_key, $key);
114
115				if ($cmp < 0) {
116				$low = $mid + 1;
117				}
118				elsif ($cmp > 0) {
119				$high = $mid - 1;
120				}
121				else {
122				return ($line, $mid);
123				}
124				}
125
126				# Key not found
127				return;
128				}
129
130				# Given an offset, return the entire line 'around' that offset.
131				# This includes any trailing newline character.
132				# If the offset is a newline, it will be counted as part of the preceding line.
133				sub _get_line_at {
134				my ($self, $offset) = @_;
135				my $L = $self->{est_line_length};
136
137				my $seek_start = max($offset - $L, 0); # global offset
138				my $seek_length = $L * 2;
139				my $midpoint = $offset - $seek_start; # relative to extracted $block, not global
140
141				my $line;
142				while (!defined $line) {
143				my $block = $self->{gzip}->extract($seek_start, $seek_length);
144
145				my $prev_nl = rindex($block, "\n", $midpoint - 1); # -1 if no match
146				my $next_nl = index($block, "\n", $midpoint);
147
148				if ($prev_nl == -1 && $seek_start > 0) {
149				my $extension = min($L, $seek_start);
150				$seek_start -= $extension;
151				$seek_length += $extension;
152				$midpoint += $extension;
153				}
154				elsif ($next_nl == -1 && $seek_start + $seek_length < $self->{max_offset}) {
155				$seek_length += $L;
156				}
157				else {
158				$next_nl = length($block) if $next_nl == -1; # no EOF newline - use last char
159				$line = substr($block, $prev_nl + 1, $next_nl - $prev_nl);
160				}
161				}
162
163				return $line;
164				}
165
166				sub _get_key {
167				my ($self, $line) = @_;
168				local *::_ = \$line;
169				return $self->{key_func}->();
170				}
171
172				sub _compare_keys {
173				my ($self, $key1, $key2) = @_;
174				local (::a, ::b) = (\$key1, \$key2);
175				return $self->{cmp_func}->();
176
177				}
178
179				# Given a line and a seekpoint in the uncompressed data, look forwards and backwards
180				# for adjacent lines with the same key. Return matching lines in same order as file.
181				sub _search_surrounding_lines {
182				my ($self, $query, $mid_line, $mid) = @_;
183				# Problem description:
184				# when searching for 'zits', the algorithm expands, then shrinks line-by-line
185				# at the end it trims the string to "zings\nzits\n..." and for some reason the
186				# zits offset isn't used to trim it further.
187				# you'll notice the _compare_keys thing which means $result_start is never set
188				# if we're shrinking (?)
189
190				my $block_start = max($mid - length($mid_line) + 1, 0);
191				my $block_end = min($mid + length($mid_line) - 1, $self->{max_offset});
192				my $block = $self->{gzip}->extract($block_start, $block_end - $block_start);
193
194				# offsets for the result range, relative to block_start
195				my ($result_start, $result_end) = (0, $block_end - $block_start);
196
197				my $shrinking = 0;
198				while (1) {
199				my ($line, $offset) = $self->_first_line($block, $result_start, $result_end, $block_start == 0);
200				if (!defined $line \|\| $self->_compare_keys($self->_get_key($line), $query) == 0) {
201				last if $shrinking;
202				last if $block_start == 0; # can't expand further
203				# Expand left
204				my $ex = min($self->{surrounding_lines_blocksize}, $block_start);
205				$block_start -= $ex;
206				$result_end += $ex;
207				my $chunk = $self->{gzip}->extract($block_start, $ex);
208				$block = $chunk . $block;
209				}
210				else {
211				# Shrink
212				$result_start = $offset + length($line);
213				$shrinking = 1;
214				}
215				}
216				my ($line, $offset) = $self->_first_line($block, $result_start, $result_end, $block_start == 0);
217				$result_start = $offset;
218
219				$shrinking = 0;
220				while (1) {
221				my ($line, $offset) = $self->_last_line($block, $result_start, $result_end, $result_end == $self->{max_offset});
222				if (!defined $line \|\| $self->_compare_keys($self->_get_key($line), $query) == 0) {
223				last if $shrinking;
224				last if $block_end == $self->{max_offset};
225				# Expand right
226				my $ex = min($self->{surrounding_lines_blocksize}, $self->{max_offset} - $block_end);
227				my $chunk = $self->{gzip}->extract($block_end, $ex);
228				$block_end += $ex;
229				$result_end += $ex;
230				$block .= $chunk;
231				}
232				else {
233				$result_end = $offset - 1;
234				$shrinking = 1;
235				}
236				}
237
238				my $result = substr($block, $result_start, $result_end - $result_start);
239				return map { "$_\n" } split /\n/, $result;
240				}
241
242				# Get the first complete line from the block, limiting to a range from
243				# $start to $end (inclusive). If $allow_incomplete, we instead return
244				# all data up to the first \n.
245				# Also return the offset of the start of this line, relative to the
246				# entire text block.
247				# Return nothing if we need more data (left/right) to get a full line.
248				sub _first_line {
249				my ($self, $block, $start, $end, $allow_incomplete) = @_;
250				my $nl = index($block, "\n", $start);
251				$nl = -1 if $nl > $end;
252				if ($allow_incomplete) {
253				$nl = $end if $nl == -1;
254				return (substr($block, $start, $nl - $start + 1), $start);
255				}
256				else {
257				return if $nl == -1; # more data needed
258				my $nl2 = index($block, "\n", $nl + 1);
259				die if $nl2 < $start && $nl2 != -1;
260				return if $nl2 == -1 \|\| $nl2 > $end;
261				return (substr($block, $nl + 1, $nl2 - $nl), $nl + 1);
262				}
263				}
264
265				# Same deal as above, but for the last line. $allow_incomplete
266				# returns all data from the last \n onwards, and ignores a
267				# trailing newline at the end of the range.
268				sub _last_line {
269				my ($self, $block, $start, $end, $allow_incomplete) = @_;
270				my $nl = rindex($block, "\n", $end);
271				$nl = -1 if $nl < $start;
272				if ($allow_incomplete) {
273				$nl = rindex($block, "\n", $nl - 1) if $nl == $end; # ignore trailing newline
274				$nl = $start - 1 if $nl == -1; # pretend there's one
275				return (substr($block, $nl + 1, $end - $nl), $nl + 1);
276				}
277				else {
278				return if $nl == -1; # more data needed
279				my $nl2 = rindex($block, "\n", $nl - 1);
280				die if $nl2 > $end && $nl2 != -1;
281				return if $nl2 == -1 \|\| $nl2 < $start;
282				return (substr($block, $nl2 + 1, $nl - $nl2), $nl2 + 1);
283				}
284				}
285
286				# Build and return the Gzip::RandomAccess object.
287				sub _build_gzip {
288				my ($class, %args) = @_;
289
290				my %gra_args;
291				for my $key (@GRA_ALLOWED_ARGS) {
292				$gra_args{$key} = $args{$key} if exists $args{$key};
293				}
294
295				return Gzip::RandomAccess->new(%gra_args);
296				}
297
298				__END__