| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Gzip::BinarySearch; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 9 |  |  | 9 |  | 291045 | use strict; | 
|  | 9 |  |  |  |  | 24 |  | 
|  | 9 |  |  |  |  | 291 |  | 
| 4 | 9 |  |  | 9 |  | 47 | use warnings; | 
|  | 9 |  |  |  |  | 18 |  | 
|  | 9 |  |  |  |  | 465 |  | 
| 5 | 9 |  |  | 9 |  | 46 | use Carp; | 
|  | 9 |  |  |  |  | 20 |  | 
|  | 9 |  |  |  |  | 931 |  | 
| 6 | 9 |  |  | 9 |  | 52 | use List::Util qw(min max); | 
|  | 9 |  |  |  |  | 16 |  | 
|  | 9 |  |  |  |  | 1958 |  | 
| 7 | 9 |  |  | 9 |  | 15683 | use Gzip::RandomAccess; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  |  | 
our $VERSION = '0.91';

# Export machinery. Nothing is exported by default; the two key-function
# builders can be imported on request.
BEGIN {
    require Exporter;
    our @ISA       = ('Exporter');
    our @EXPORT_OK = ('fs_column', 'tsv_column');
}

# Initial guess at the longest line in the file. _get_line_at() starts
# with a window based on this and widens it when the guess is too small.
my $DEFAULT_EST_LINE_LENGTH = 512;

# Number of bytes fetched per step when find_all() widens its search for
# neighbouring lines that share the key.
my $DEFAULT_SURROUNDING_LINES_BLOCKSIZE = 4096;

# Constructor options that are handed on to Gzip::RandomAccess.
my @GRA_ALLOWED_ARGS = ('file', 'index_file', 'index_span', 'cleanup');

# Every option new() accepts: the pass-through options above plus the
# ones consumed by this class.
my %ALLOWED_ARGS;
$ALLOWED_ARGS{$_} = 1 for (
    @GRA_ALLOWED_ARGS,
    'key_func', 'cmp_func',
    'est_line_length', 'surrounding_lines_blocksize',
);
| 30 |  |  |  |  |  |  |  | 
# Constructor.
#
# Accepts a flat list of named options. The options listed in
# @GRA_ALLOWED_ARGS are forwarded to Gzip::RandomAccess (via _build_gzip);
# the remaining ones configure the search itself:
#
#   key_func                     code ref that extracts the key from the line
#                                held in $_ (default: first whitespace-
#                                separated column)
#   cmp_func                     code ref comparing $main::a with $main::b
#                                (default: string comparison)
#   est_line_length              initial guess at the longest line
#   surrounding_lines_blocksize  bytes fetched per step by find_all()
#
# False values for any of these options (undef, 0, '') select the default.
# Croaks with "Invalid argument '...'" on an unrecognised option name.
sub new {
    my ($class, %args) = @_;

    # Validate option names before building the gzip object, so a typo is
    # reported without first opening or indexing the file. Keys are sorted
    # so the same bad call always reports the same option.
    for my $key (sort keys %args) {
        $ALLOWED_ARGS{$key} or croak "Invalid argument '$key'";
    }

    my $gzip = $class->_build_gzip(%args);

    my $key_func = $args{key_func} || fs_column(qr/\s+/, 1);
    my $cmp_func = $args{cmp_func} || sub { $main::a cmp $main::b };

    my $est_line_length = $args{est_line_length}
        || $DEFAULT_EST_LINE_LENGTH;

    my $surrounding_lines_blocksize = $args{surrounding_lines_blocksize}
        || $DEFAULT_SURROUNDING_LINES_BLOCKSIZE;

    return bless {
        gzip => $gzip,
        # Offset of the last byte of the uncompressed data.
        max_offset => $gzip->uncompressed_size - 1,
        est_line_length => $est_line_length,
        surrounding_lines_blocksize => $surrounding_lines_blocksize,
        key_func => $key_func,
        cmp_func => $cmp_func,
    }, $class;
}
| 56 |  |  |  |  |  |  |  | 
# Look up $key and return one line carrying it (whichever line the binary
# search lands on). Returns nothing (empty list / undef) when no line has
# that key.
sub find {
    my ($self, $key) = @_;

    # _find also reports the probe offset; only the line is of interest here.
    my ($found) = $self->_find($key);

    return unless defined $found;
    return $found;
}
| 64 |  |  |  |  |  |  |  | 
# Look up $key and return every adjacent line that carries it, in file
# order. Returns nothing when no line has that key.
sub find_all {
    my ($self, $key) = @_;

    my ($hit, $probe_offset) = $self->_find($key);
    return unless defined $hit;

    # One matching line is known; widen the result to its matching
    # neighbours.
    return $self->_search_surrounding_lines($key, $hit, $probe_offset);
}
| 74 |  |  |  |  |  |  |  | 
# Read-only accessors for values stored on the object by new().

# The object stored under 'gzip' (built by _build_gzip).
sub gzip {
    my ($self) = @_;
    return $self->{gzip};
}

# The configured estimate of the longest line, in bytes.
sub est_line_length {
    my ($self) = @_;
    return $self->{est_line_length};
}

# The number of bytes fetched per step while collecting adjacent matches.
sub surrounding_lines_blocksize {
    my ($self) = @_;
    return $self->{surrounding_lines_blocksize};
}
| 78 |  |  |  |  |  |  |  | 
# Convenience functions

# fs_column($field_sep, $column_number)
#
# Build a key function that splits the line held in $_ on $field_sep and
# returns the field at the 1-based $column_number (undef when the line has
# fewer fields).
#
# $field_sep is handed to split(), so a plain string other than a single
# space is interpreted by split() as a pattern. A single-space string is
# converted to a regex so that it matches exactly one space instead of
# triggering split()'s special "split on runs of whitespace" mode.
#
# Croaks if $column_number is undefined or less than 1.
sub fs_column {
    my ($field_sep, $column_number) = @_;
    croak "Invalid column number, should be 1-based"
        if !defined $column_number || $column_number < 1;

    if (!ref $field_sep && $field_sep eq ' ') {
        # Force Perl to match a space, not \s+
        $field_sep = qr/ /;
    }

    return sub {
        # Work on a copy: chomping $_ in place would modify the caller's
        # variable, and would die if $_ is aliased to a read-only value.
        my $line = $_;
        chomp $line;

        # Limit -1 keeps trailing empty fields, so column positions are
        # stable even when the last columns are empty.
        my @fields = split $field_sep, $line, -1;
        return $fields[$column_number - 1];
    };
}
| 96 |  |  |  |  |  |  |  | 
# tsv_column($column_number)
#
# Shorthand for fs_column() with a tab character as the field separator.
sub tsv_column {
    my ($column_number) = @_;
    return fs_column("\t", $column_number);
}
| 101 |  |  |  |  |  |  |  | 
# Binary search the uncompressed data, by byte offset, for a line whose key
# compares equal to $key.
#
# Returns ($line, $offset): the first matching line the search lands on and
# the probe offset that fell inside it (not necessarily the start of the
# line). Returns nothing if no line has the key.
sub _find {
    my ($self, $key) = @_;

    my $lo = 0;
    my $hi = $self->{max_offset};

    until ($lo > $hi) {
        my $probe = int(($lo + $hi) / 2);
        my $line  = $self->_get_line_at($probe);
        my $order = $self->_compare_keys($self->_get_key($line), $key);

        # Line sorts before the key: the match, if any, lies to the right.
        if ($order < 0) {
            $lo = $probe + 1;
            next;
        }

        # Line sorts after the key: the match, if any, lies to the left.
        if ($order > 0) {
            $hi = $probe - 1;
            next;
        }

        return ($line, $probe);
    }

    # Key not found
    return;
}
| 129 |  |  |  |  |  |  |  | 
# Given an offset into the uncompressed data, return the entire line
# 'around' that offset, including its trailing newline character if it
# has one.
# If the offset is a newline, it is counted as part of the preceding line.
#
# A window of est_line_length bytes either side of the offset is extracted
# first; it is widened by est_line_length at a time until it contains the
# newline before the line (or reaches the start of the data) and the
# newline ending the line (or reaches the end of the data).
sub _get_line_at {
    my ($self, $offset) = @_;
    my $est = $self->{est_line_length};

    my $seek_start  = max($offset - $est, 0);    # global offset
    my $seek_length = $est * 2;
    my $midpoint    = $offset - $seek_start;     # relative to $block, not global

    while (1) {
        my $block = $self->{gzip}->extract($seek_start, $seek_length);

        # Offset 0 has nothing before it; say so explicitly rather than
        # handing rindex() a negative position.
        my $prev_nl = $midpoint > 0
            ? rindex($block, "\n", $midpoint - 1)    # -1 if no match
            : -1;
        my $next_nl = index($block, "\n", $midpoint);

        if ($prev_nl == -1 && $seek_start > 0) {
            # Start of line not in the window yet: widen to the left.
            my $extension = min($est, $seek_start);
            $seek_start  -= $extension;
            $seek_length += $extension;
            $midpoint    += $extension;
        }
        elsif ($next_nl == -1 && $seek_start + $seek_length <= $self->{max_offset}) {
            # End of line not in the window yet: widen to the right. The
            # window covers offsets up to $seek_start + $seek_length - 1,
            # so '<=' is needed to also pick up the byte at max_offset.
            $seek_length += $est;
        }
        else {
            $next_nl = length($block) if $next_nl == -1;  # no EOF newline - use last char
            return substr($block, $prev_nl + 1, $next_nl - $prev_nl);
        }
    }
}
| 165 |  |  |  |  |  |  |  | 
# Run the configured key function on $line and return whatever it returns.
# The key function receives the line in $_ (no arguments); $_ is aliased to
# this sub's private copy, so anything the key function does to $_ is not
# visible to the caller.
sub _get_key {
    my ($self, $line) = @_;
    my $extract_key = $self->{key_func};

    # A single-element foreach aliases the global $_ to $line for the
    # duration of the call and restores it afterwards.
    for ($line) {
        return $extract_key->();
    }
    return;
}
| 171 |  |  |  |  |  |  |  | 
# Compare two keys with the configured comparison function and return its
# result. The comparator is called without arguments; the keys are made
# available to it as $main::a and $main::b for the duration of the call.
sub _compare_keys {
    my ($self, $left, $right) = @_;
    my $comparator = $self->{cmp_func};

    local *::a = \$left;
    local *::b = \$right;
    return $comparator->();
}
| 178 |  |  |  |  |  |  |  | 
# Given a matching line ($mid_line) and an offset ($mid) that falls inside
# it, collect that line together with all adjacent lines, before and after,
# whose key also compares equal to $query. Matching lines are returned in
# file order, each terminated with "\n".
#
# The search keeps a window of uncompressed data in $block, covering global
# offsets [$win_start, $win_start + length($block)). The window is widened
# by up to surrounding_lines_blocksize bytes at a time whenever a line
# boundary that is needed lies outside it.
#
# This replaces an earlier expand-then-shrink implementation. When its
# block did not begin at offset 0, the left-hand shrink step skipped the
# line it had just advanced to, so a matching line could be dropped or the
# trim could run past the match entirely.
sub _search_surrounding_lines {
    my ($self, $query, $mid_line, $mid) = @_;

    my $gzip = $self->{gzip};
    my $size = $self->{max_offset} + 1;    # total number of uncompressed bytes
    my $step = $self->{surrounding_lines_blocksize};

    my $is_match = sub {
        my ($line) = @_;
        return $self->_compare_keys($self->_get_key($line), $query) == 0;
    };

    # Start with a window that should already hold all of $mid_line plus
    # the newline before it, wherever in the line $mid happens to fall.
    my $span      = max(length($mid_line), 1);
    my $win_start = max($mid - $span, 0);
    my $win_end   = min($mid + $span, $size);
    my $block     = $gzip->extract($win_start, $win_end - $win_start);

    # Each helper widens the window on one side and returns the number of
    # bytes added; 0 means that side cannot grow any further.
    my $grow_left = sub {
        my $extra = min($step, $win_start);
        return 0 if $extra <= 0;
        my $chunk = $gzip->extract($win_start - $extra, $extra);
        return 0 unless defined $chunk && length $chunk;
        $win_start -= $extra;
        $block = $chunk . $block;
        return $extra;
    };
    my $grow_right = sub {
        my $at    = $win_start + length $block;
        my $extra = min($step, $size - $at);
        return 0 if $extra <= 0;
        my $chunk = $gzip->extract($at, $extra);
        return 0 unless defined $chunk && length $chunk;
        $block .= $chunk;
        return length $chunk;
    };

    # Step 1: find the boundaries of the line containing $mid.
    # Both values are indexes into $block:
    #   $first - first character of the earliest matching line found so far
    #   $next  - first character after the latest matching line found so far
    my ($first, $next);
    while (1) {
        my $pos     = $mid - $win_start;
        my $prev_nl = $pos > 0 ? rindex($block, "\n", $pos - 1) : -1;
        next if $prev_nl == -1 && $win_start > 0 && $grow_left->();

        my $nl = index($block, "\n", $pos);
        if ($nl == -1) {
            next if $grow_right->();
            $nl = length($block) - 1;    # final line without a newline
        }

        ($first, $next) = ($prev_nl + 1, $nl + 1);
        last;
    }

    # Step 2: walk backwards one line at a time while the keys still match.
    while ($first > 0) {
        # $block[$first - 1] is the newline ending the previous line; look
        # for the newline before that to find where the previous line starts.
        my $prev_nl = $first >= 2 ? rindex($block, "\n", $first - 2) : -1;
        if ($prev_nl == -1) {
            my $added = $grow_left->();
            if ($added) {
                $first += $added;
                $next  += $added;
                next;
            }
            # Nothing more to the left: the previous line starts the data.
        }

        my $start = $prev_nl + 1;
        last unless $is_match->(substr($block, $start, $first - $start));
        $first = $start;
    }

    # Step 3: walk forwards one line at a time while the keys still match.
    while (1) {
        if ($next >= length $block) {
            $grow_right->() or last;     # end of data reached
            next;
        }

        my $nl = index($block, "\n", $next);
        if ($nl == -1) {
            next if $grow_right->();
            $nl = length($block) - 1;    # final line without a newline
        }

        last unless $is_match->(substr($block, $next, $nl - $next + 1));
        $next = $nl + 1;
    }

    my $result = substr($block, $first, $next - $first);
    return map { "$_\n" } split /\n/, $result;
}
| 241 |  |  |  |  |  |  |  | 
# Return the first complete line of $block within the index range
# $start .. $end (inclusive), together with the index in $block at which
# that line begins.
#
# Normally whatever precedes the first newline in the range is treated as
# the tail of a partial line and skipped, so the line returned is the one
# between the first and second newlines. With $allow_incomplete the range
# is assumed to begin on a line boundary: the text from $start up to and
# including the first newline (or up to $end if there is none) is returned.
#
# Returns nothing if more data (left/right) is needed to get a full line.
sub _first_line {
    my ($self, $block, $start, $end, $allow_incomplete) = @_;

    my $first_nl = index($block, "\n", $start);
    $first_nl = -1 if $first_nl > $end;    # newline outside the range

    if ($allow_incomplete) {
        my $stop = $first_nl == -1 ? $end : $first_nl;
        return (substr($block, $start, $stop - $start + 1), $start);
    }

    return if $first_nl == -1;    # more data needed

    my $second_nl = index($block, "\n", $first_nl + 1);
    die if $second_nl != -1 && $second_nl < $start;    # sanity check
    return if $second_nl == -1 || $second_nl > $end;

    my $line_start = $first_nl + 1;
    return (substr($block, $line_start, $second_nl - $first_nl), $line_start);
}
| 264 |  |  |  |  |  |  |  | 
# Counterpart of _first_line for the end of the range: return the last
# complete line of $block within $start .. $end (inclusive) and the index
# in $block at which it begins.
#
# Normally whatever follows the last newline in the range is treated as the
# head of a partial line and skipped. With $allow_incomplete the range is
# assumed to end on a line boundary: everything after the last newline is
# returned, and a newline sitting exactly at $end is taken as the
# terminator of the last line rather than as the start of an empty one.
#
# Returns nothing if more data is needed to get a full line.
sub _last_line {
    my ($self, $block, $start, $end, $allow_incomplete) = @_;

    my $last_nl = rindex($block, "\n", $end);
    $last_nl = -1 if $last_nl < $start;    # newline outside the range

    if ($allow_incomplete) {
        if ($last_nl == $end) {
            # ignore trailing newline
            $last_nl = rindex($block, "\n", $last_nl - 1);
        }
        if ($last_nl == -1) {
            # pretend there's one just before the range
            $last_nl = $start - 1;
        }
        my $line_start = $last_nl + 1;
        return (substr($block, $line_start, $end - $last_nl), $line_start);
    }

    return if $last_nl == -1;    # more data needed

    my $before_nl = rindex($block, "\n", $last_nl - 1);
    die if $before_nl != -1 && $before_nl > $end;    # sanity check
    return if $before_nl == -1 || $before_nl < $start;

    my $line_start = $before_nl + 1;
    return (substr($block, $line_start, $last_nl - $before_nl), $line_start);
}
| 285 |  |  |  |  |  |  |  | 
# Build and return the Gzip::RandomAccess object.
# Only the options named in @GRA_ALLOWED_ARGS are forwarded, and only those
# the caller actually supplied; everything else in %args is ignored here.
sub _build_gzip {
    my ($class, %args) = @_;

    my %gra_args =
        map  { $_ => $args{$_} }
        grep { exists $args{$_} } @GRA_ALLOWED_ARGS;

    return Gzip::RandomAccess->new(%gra_args);
}
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | __END__ |