File Coverage

blib/lib/WARC/Record/Logical/Heuristics.pm
Criterion Covered Total %
statement 216 216 100.0
branch 72 72 100.0
condition 24 24 100.0
subroutine 14 14 100.0
pod 2 2 100.0
total 328 328 100.0


line stmt bran cond sub pod time code
1             package WARC::Record::Logical::Heuristics; # -*- CPerl -*-
2              
3 2     2   69458 use strict;
  2         12  
  2         53  
4 2     2   9 use warnings;
  2         4  
  2         74  
5              
6             our @ISA = qw();
7              
8 2     2   415 use WARC; *WARC::Record::Logical::Heuristics::VERSION = \$WARC::VERSION;
  2         5  
  2         70  
9              
10 2     2   10 use Carp;
  2         4  
  2         92  
11 2     2   10 use File::Spec;
  2         4  
  2         4559  
12              
13             =head1 NAME
14              
15             WARC::Record::Logical::Heuristics - heuristics for locating record segments
16              
17             =head1 SYNOPSIS
18              
19             use WARC::Record::Logical::Heuristics;
20              
21             =head1 DESCRIPTION
22              
23             This is an internal module that provides functions for locating record
24             segments when the needed information is not available from an index.
25              
26             These mostly assume that IIPC WARC guidelines have been followed, as
27             otherwise there simply is no efficient solution.
28              
29             Implementations vary, however, with some using only an incrementing serial
30             number and a constant timestamp from the initiation of the crawl job, while
31             the guidelines and specification envision a timestamp reflecting the first
32             write to that specific file rather than the start of the crawl. Constant
33             timestamps are checked first, since the search is simpler.
34              
35             =over
36              
37             =item $WARC::Record::Logical::Heuristics::Patience
38              
39             This variable sets a threshold used to limit the reach of an unproductive
40             search. This module tracks the "effort" expended (I/O performed) during a
41             search and abandons the search if the threshold is exceeded. Finding
42             results dynamically (and temporarily) increases this threshold during a
43             search, such that this really sets how far the search will go between
44             results before giving up and concluding that there are no more results.
45              
46             The search will reach farther if either the WARC files are not compressed,
47             or the "sl" GZIP extension documented in L is used.
48             Decompressing record data to find the next record is considerable effort
49             for larger records, but is not counted for very small records that the
50             system is likely to already have cached after the header has been read.
51              
52             =cut
53              
54             # These provide a simple mechanism to limit the scope of a search that is
55             # not producing results. Both are localized in the top-level calls.
56              
57             our $Patience = 10000; # How much effort to put into a search?
58             our $Effort = 0; # How much have we done so far during this search?
59              
60             # Most I/O incurs "effort", represented by incrementing $Effort, while
61             # partial success (finding an interesting record) increases "patience",
62             # represented by incrementing $Patience. The search stops when either
63             # there are no more places to look or $Effort exceeds $Patience.
64              
65             =item %WARC::Record::Logical::Heuristics::Effort
66              
67             This internal hash indicates how costly certain operations should be
68             considered. The keys and their meanings are subject to change at whim, but
69             this is available for quick tuning if needed. Generally, the better
70             solution is to index your data rather than spend time tuning heuristics.
71              
72             =cut
73              
74             our %Effort =
75             (# read_record:
76             # effort incurred to read a record header, regardless of compression
77             read_record => 5,
78              
79             # gzread_data_per_tick:
80             # number of bytes to read while advancing past a compressed record to
81             # incur one effort point; effort incurred rounds down, even to zero
82             #
83             # this value is a shot-in-the-dark estimate that gunzipping 320 KiB is
84             # equivalent to the open/seek/read process for loading a record header
85             gzread_data_per_tick => 64 * 1024,
86              
87             # readdir_files_per_tick:
88             # number of file names to read and check while scanning a directory for
89             # to incur one effort point; effort incurred rounds down, even to zero
90             #
91             # this value is a shot-in-the-dark estimate that reading/matching 1600
92             # file names is equivalent to loading a record header; this estimate
93             # may be high or low depending on the number of axes used in the search
94             readdir_files_per_tick => 320,
95             );
96              
97             # Internal functions:
98              
99             ## @axes = _split_digit_spans( $filename )
100             ##
101             ## Extract possible sequence numbers from $filename and return list of
102             ## array references [PREFIX, NUMBER, SUFFIX] where NUMBER is a field that
103             ## can be adjusted to find "nearby" files if NUMBER turns out to actually
104             ## be a sequence number. Finds numerous false matches in normal use, but
105             ## broad searches cost only time while excessive narrowing causes failure.
106             ##
107             ## Does not perform I/O; does not increment $Effort.
108              
109             sub _split_digit_spans ($) {
110 38     38   6464 my $name = shift;
111 38         74 my @axes = ();
112              
113             # Split on zero-width boundaries between digits and non-digits.
114 38         1061 my @pieces = split /(?=[0-9])(?<=[^0-9])|(?=[^0-9])(?<=[0-9])/, $name;
115             # The @pieces array now contains alternating spans of digits and non-digits.
116              
117 38         124 for (my $i = 0; $i < @pieces; $i++) {
118 524 100 100     1818 next unless ($pieces[$i] =~ /^[0-9]+$/ && length($pieces[$i]) < 9);
119             # More than 8 digits is probably not a sequence number and may be
120             # beyond the range of an integer anyway. Use indexes instead of
121             # heuristics if you need to work with a billion WARC files.
122 237         1339 push @axes, [join('', @pieces[0..($i-1)]),
123             $pieces[$i], join('', @pieces[($i+1)..$#pieces])];
124             }
125              
126 38         200 return @axes;
127             }
128              
129             ## @found = _find_nearby_files( $direction, @axes )
130             ##
131             ## Locate existing files that appear to be part of a contiguous sequence
132             ## along an axis in @axes. The $direction argument is either +1 to search
133             ## for higher numbers or -1 to search for lower numbers. A direction
134             ## value with a magnitude greater than 1 results in skipping possibilities
135             ## during the search.
136             ##
137             ## Returns a list of array references reflecting the files along each axis
138             ## from the argument list but omitting axes on which no files were found.
139             ##
140             ## Performs only directory lookups, which have highly unpredictable costs
141             ## and are usually cached by the system; does not increment $Effort.
142              
143             sub _find_nearby_files ($@) {
144 20     20   3034 my $direction = shift;
145 20         44 my @found = ();
146              
147 20         44 foreach my $axis (@_) {
148 180         299 my @files = ();
149 180         367 my $i = $axis->[1] + $direction; my $file;
  180         227  
150 180         2559 while (-f ($file = join '', ($axis->[0],
151             sprintf('%0*d', length $axis->[1], $i),
152             $axis->[2])))
153 42         133 { push @files, $file; $i += $direction }
  42         600  
154 180 100       655 push @found, \@files if scalar @files;
155             }
156              
157 20         64 return @found;
158             }
159              
160             ## @found = _scan_directory_for_axes( $dirname, @axes )
161             ##
162             ## Locate existing files that may appear to be part of a sequence along an
163             ## axis in @axes, using wildcards for long digit spans.
164             ##
165             ## The $dirname argument specifies the name of a directory to search and
166             ## all @axes are interpreted relative to $dirname. This differs from
167             ## _find_nearby_files where each axis specifies full absolute filenames.
168             ## For this function, the axes are strictly filenames with no directory.
169             ##
170             ## Returns a list of array references reflecting the files along each axis
171             ## from the argument list but omitting axes on which no files were found.
172             ##
173             ## Performs directory reads; increments $Effort to count file names read.
174              
175             sub _scan_directory_for_axes ($@) {
176 16     16   1217 my $dirname = shift;
177 16         25 my $read_count = 0;
178              
179             my @re = map {
180 16         38 my $pre = quotemeta $_->[0]; my $post = quotemeta $_->[2];
  60         150  
  60         108  
181 60         164 $pre =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
  14         59  
182 60         156 $post =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
  2         16  
183 60         92 my $midlen = length $_->[1]; qr/^$pre[0-9]{$midlen}$post/ } @_;
  60         993  
184              
185 16         31 my $filename;
186 16         22 my @found = ();
187 16 100       961 opendir my $dir, $dirname or croak "$dirname: $!";
188 15         567 while (defined ($filename = readdir $dir)) {
189 435         853 foreach (0 .. $#re) {
190 1740 100       4569 if ($filename =~ $re[$_])
191 148         196 { push @{$found[$_]}, $filename }
  148         350  
192             }
193 435         1123 $read_count++;
194             }
195 15         235 closedir $dir;
196              
197 15         108 $Effort += int($read_count / $Effort{readdir_files_per_tick});
198 15         42 return grep {scalar @$_} @found;
  60         213  
199             }
200              
201             ## @similar = _find_similar_files( $seed )
202             ##
203             ## Locate existing files that may appear to be part of a sequence involving
204             ## any digit span in $seed, using wildcards for long digit spans and
205             ## searching only the directory containing $seed.
206             ##
207             ## Returns a list of array references, each containing two array references
208             ## for files sorting before and after $seed, reflecting the files along
209             ## each axis derived from $seed on which files other than $seed were found.
210             ##
211             ## Uses _scan_directory_for_axes; does not perform I/O directly.
212              
213             sub _find_similar_files ($) {
214 12     12   6342 my $seedfile = shift;
215              
216 12         59 my $fs_volname; my $dirname; my $filename;
  12         0  
217 12         140 ($fs_volname, $dirname, $filename) = File::Spec->splitpath($seedfile);
218              
219 12         90 my @found = _scan_directory_for_axes
220             (File::Spec->catpath($fs_volname, $dirname, ''),
221             _split_digit_spans $filename);
222 12         48 my @similar = ();
223 12         29 foreach my $axis_files (@found) {
224 48         74 my @before = (); my @after = ();
  48         62  
225 48         69 foreach my $fname (@$axis_files) {
226 114 100       270 if ($fname lt $filename) {
    100          
227 32         178 push @before, File::Spec->catpath($fs_volname, $dirname, $fname);
228             } elsif ($fname gt $filename) {
229 34         176 push @after, File::Spec->catpath($fs_volname, $dirname, $fname);
230             }
231             }
232 18         43 push @similar, [[sort {$a cmp $b} @before],
233 48 100       182 [sort {$a cmp $b} @after]] if @before + @after;
  28         52  
234             }
235              
236 12         59 return @similar;
237             }
238              
239             ## ($checkpoint, @records) =
240             ## _scan_volume( $volume, $start, $end, [$field, $value]... )
241             ##
242             ## Search $volume for segment records where any $field matches $value
243             ## starting at offset $start and ending at or after offset $end. If $end
244             ## is an undefined value, searches until end-of-file.
245             ##
246             ## Only returns records that have a 'WARC-Segment-Number' header.
247             ##
248             ## The returned $checkpoint is the last record examined, regardless of
249             ## header values, and provides a valid offset for resuming a search.
250              
251             sub _scan_volume ($$$@) {
252 97     97   5086 my $volume = shift;
253 97         149 my $start = shift;
254 97         136 my $end = shift;
255              
256 97         233 my $record = $volume->record_at($start);
257 97         401 my @records = ();
258              
259 97   100     396 while ($record && (!defined $end || $record->offset <= $end)) {
      100        
260 885         2203 $Effort += $Effort{read_record};
261             next unless (defined $record->field('WARC-Segment-Number')
262 885 100 100     1817 && grep {defined $record->field($_->[0])} @_);
  327         772  
263 224 100       477 push @records, $record if grep {defined $record->field($_->[0])
  286 100       633  
264             && $record->field($_->[0]) eq $_->[1]} @_;
265             } continue { $Effort += int($record->field('Content-Length')
266             / $Effort{gzread_data_per_tick})
267             if (defined $record->{compression}
268 885 100 100     2053 && !defined $record->{sl_packed_size});
269 885         1783 $record = $record->next }
270              
271 97         390 return $record, @records;
272             }
273              
274             =item ( $first_segment, @clues ) = find_first_segment( $record )
275              
276             Attempt to locate the first segment of the logical record suggested by the
277             given record without using indexes. Croaks if given a record that does not
278             appear to have been written using WARC segmentation. Returns a
279             C object for the first record and a list of other objects
280             that may be useful for locating continuation records. Returns undef in the
281             first slot if no clear first segment was found, but can still return other
282             records encountered during the search even if the search was ultimately
283             unsuccessful.
284              
285             =cut
286              
287             ## Each "clue" can be a WARC::Record, or a hint in the form of [key => value].
288             ##
289             ## The hint keys currently are:
290             ##
291             ## tail => $record
292             ## last record examined in initial volume
293             ## (a good starting point to search for more segments)
294             ##
295             ## files_on_axes => [$filename, ...]...
296             ## array of arrays from _find_nearby_files
297             ## files_from_dir => [[$filename...], [$filename...]]...
298             ## array of arrays from _find_similar_files
299             ## Note that the filenames are set to undef in these hints as the
300             ## corresponding WARC volumes are scanned, with any relevant records
301             ## added directly to the clue list as they are found.
302              
303             sub find_first_segment {
304 11     11 1 899 local $Patience = $Patience;
305 11         21 local $Effort = 0;
306              
307 11         20 my $initial = shift;
308              
309 11 100       37 croak 'searching for segments for unsegmented record'
310             unless defined $initial->field('WARC-Segment-Number');
311              
312 10         28 my $origin_id = $initial->field('WARC-Segment-Origin-ID');
313 10         23 my @clues = (); my $point; my @records;
  10         18  
314              
315             # First we search the volume containing the initial record, since
316             # multiple WARC files may have been concatenated together after writing.
317 10         29 ($point, @records) = _scan_volume $initial->volume, 0, $initial->offset,
318             [WARC_Segment_Origin_ID => $origin_id], [WARC_Record_ID => $origin_id];
319             # ... @records will always include $initial ...
320 10         40 push @clues, @records, [tail => $point];
321              
322 10         23 foreach my $record (@records) {
323 10 100       28 return $record, @clues if $record->field('WARC-Record-ID') eq $origin_id;
324             }
325 7         18 $Patience += $Effort * ((scalar @records) - 1);
326 7 100       27 return undef, @clues if $Effort > $Patience;
327              
328             # If we get this far, the first segment must be in another volume.
329             {
330 6         18 my @simple_axes = _split_digit_spans $initial->volume->filename;
331 6         18 my @nearby = _find_nearby_files -1, @simple_axes;
332              
333             # A simple sequence number may be in use; we can check these volumes
334             # before reading the directory to handle varying timestamps.
335 6 100       24 push @clues, [files_on_axes => @nearby] if scalar @nearby;
336 6         29 foreach my $axis_files (reverse @nearby) {
337             # Work backwards on the assumption that sequence numbers are nearer
338             # to the end of the filename. (Correct for Wget and Wpull.)
339 4         23 foreach my $name (@$axis_files) {
340 6         11 my $previousEffort = $Effort;
341 6         18 my $volume = mount WARC::Volume ($name);
342 6         121 (undef, @records) = _scan_volume $volume, 0, undef,
343             [WARC_Segment_Origin_ID => $origin_id],
344             [WARC_Record_ID => $origin_id];
345 6         18 push @clues, @records; $name = undef;
  6         10  
346 6         13 foreach my $record (@records) {
347 3 100       11 return $record, @clues
348             if $record->field('WARC-Record-ID') eq $origin_id;
349             }
350 5         13 $Patience += ($Effort - $previousEffort) * scalar @records;
351 5 100       26 return undef, @clues if $Effort > $Patience;
352             }
353             }
354             }
355              
356             # If we get this far, the first segment is in another volume and multiple
357             # numbers must change to find that other volume. Assume that timestamps
358             # are in use in the file names, confounding the simple sequence search.
359             {
360 6         14 my @nearby = _find_similar_files $initial->volume->filename;
  4         10  
  4         17  
361              
362 4 100       16 push @clues, [files_from_dir => @nearby] if scalar @nearby;
363             # Work forwards on the assumption that sequence numbers are nearer to
364             # the beginning of the filename. (Correct in Internet Archive samples.)
365 4         14 foreach my $fname ((map {reverse @{$_->[0]}} @nearby),
  8         11  
  8         19  
366             # work backwards within the "before" list on each axis
367             # ... and forwards within the "after" list on each axis
368 8         12 (map {@{$_->[1]}} @nearby)) {
  8         17  
369 9         16 my $previousEffort = $Effort;
370 9         33 my $volume = mount WARC::Volume ($fname);
371 9         167 (undef, @records) = _scan_volume $volume, 0, undef,
372             [WARC_Segment_Origin_ID => $origin_id],
373             [WARC_Record_ID => $origin_id];
374 9         26 push @clues, @records; $fname = undef;
  9         17  
375 9         21 foreach my $record (@records) {
376 4 100       12 return $record, @clues
377             if $record->field('WARC-Record-ID') eq $origin_id;
378             }
379 7         18 $Patience += ($Effort - $previousEffort) * scalar @records;
380 7 100       25 return undef, @clues if $Effort > $Patience;
381             }
382             }
383              
384             # If we get this far, we have run out of places to look and the user will
385             # need to build an index instead of relying on heuristics.
386 1         7 return undef, @clues;
387             }
388              
389             =item ( @segments ) = find_continuation( $first_segment, @clues )
390              
391             Attempt to locate the continuation segments of a logical record without
392             using indexes. Uses the clues returned from C to aid
393             in the search and returns a list of continuation records found that appear
394             to be part of the same logical record as the given first segment.
395              
396             =cut
397              
398             sub _add_segments (\$\@\@) {
399 78     78   117 my $total_segment_count_ref = shift;
400 78         109 my $have_segments_ref = shift;
401 78         100 my $new_segments_ref = shift;
402              
403 78         177 foreach (@$new_segments_ref) {
404 38         87 $have_segments_ref->[$_->field('WARC-Segment-Number')]++;
405 38 100       107 $$total_segment_count_ref = $_->field('WARC-Segment-Number')
406             if defined $_->field('WARC-Segment-Total-Length');
407             }
408             }
409             sub _have_all_segments_p ($@) {
410 86     86   135 my $total_segment_count = shift;
411              
412             # We cannot have all segments if we have not seen the last segment yet.
413 86 100       380 return 0 unless defined $total_segment_count;
414              
415             # We have seen the last segment, do we have all of the others?
416 24 100       59 for (my $i = 2; $i < $total_segment_count; $i++) { return 0 unless $_[$i] }
  59         187  
417             # Start the search at 2 because offsets 0 and 1 are not used here.
418              
419 9         83 return 1;
420             }
421              
422             sub find_continuation {
423 16     16 1 6355 local $Patience = $Patience;
424 16         33 local $Effort = 0;
425              
426 16         27 my $first_segment = shift; my $origin_id = $first_segment->id;
  16         51  
427              
428             # First we unpack the clues and check if all segments were found while
429             # searching for the first segment.
430 16         37 my @segments = (); my @nearby_volume_files = ();
  16         26  
431 16         27 my $have_tail = 0; my $point = undef;
  16         26  
432 16         24 my @similar_volume_files_before = (); my @similar_volume_files_after = ();
  16         30  
433 16         39 foreach my $clue (@_) {
434 35 100       107 if (UNIVERSAL::isa($clue, 'WARC::Record')) {
    100          
435 21 100       58 push @segments, $clue unless $clue == $first_segment;
436             } elsif (ref $clue eq 'ARRAY') {
437 13         30 my $tag = shift @$clue;
438 13 100       40 if ($tag eq 'tail') {
    100          
    100          
439 7         11 $have_tail = 1;
440 7         14 $point = shift @$clue;
441             } elsif ($tag eq 'files_on_axes') {
442 3         9 push @nearby_volume_files, map {[grep defined, @$_]} @$clue;
  6         19  
443             } elsif ($tag eq 'files_from_dir') {
444 2         8 foreach (@$clue) {
445 6         8 push @similar_volume_files_before, [grep defined, @{$_->[0]}];
  6         19  
446 6         10 push @similar_volume_files_after, [grep defined, @{$_->[1]}];
  6         17  
447             }
448 1         25 } else { die "unrecognized hint tag: $tag" }
449 1         10 } else { die "unrecognized clue" }
450 33         76 $clue = undef;
451             }
452 14         36 @similar_volume_files_before = grep {scalar @$_} @similar_volume_files_before;
  6         12  
453 14         25 @similar_volume_files_after = grep {scalar @$_} @similar_volume_files_after;
  6         13  
454              
455 14         17 my @have_segments = (); my $total_segment_count = undef;
  14         23  
456 14         44 _add_segments $total_segment_count, @have_segments, @segments;
457              
458 14 100       37 return @segments if _have_all_segments_p $total_segment_count, @have_segments;
459              
460             # If we get to here, at least one segment was not found while searching
461             # for the first segment, so we will need to search too.
462 12         24 my @records = ();
463              
464             # Pick up where find_first_segment left off...
465 12 100       35 if ($point) {
    100          
466 4         13 (undef, @records) = _scan_volume $point->volume, $point->offset, undef,
467             [WARC_Segment_Origin_ID => $origin_id];
468 4         17 _add_segments $total_segment_count, @have_segments, @records;
469 4         9 push @segments, @records;
470             } elsif (!$have_tail) {
471             # The search may have begun with the first segment directly; ensure
472             # that we scan the entire volume containing the first segment later.
473 7         23 push @nearby_volume_files, [$first_segment->volume->filename];
474             }
475 12         137 $Patience += $Effort * scalar @records;
476             return @segments
477 12 100 100     26 if (_have_all_segments_p $total_segment_count, @have_segments
478             or $Effort > $Patience);
479              
480             # Search for more volumes in a simple sequence...
481             {
482 10         23 my @simple_axes = _split_digit_spans $first_segment->volume->filename;
483 10         28 my @nearby = _find_nearby_files 1, @simple_axes;
484              
485             # Were more volumes found in the simple sequence search now or previously?
486 10         33 foreach my $axis_files ((reverse @nearby),
487             (reverse @nearby_volume_files)) {
488             # Work backwards on the assumption that sequence numbers are nearer
489             # to the end of the filename. (Correct for Wget and Wpull.)
490 17         34 foreach my $name (@$axis_files) {
491 27         46 my $previousEffort = $Effort;
492 27         423 my $volume = mount WARC::Volume ($name);
493 27         507 (undef, @records) = _scan_volume $volume, 0, undef,
494             [WARC_Segment_Origin_ID => $origin_id];
495 27         93 _add_segments $total_segment_count, @have_segments, @records;
496 27         50 push @segments, @records;
497 27         57 $Patience += ($Effort - $previousEffort) * scalar @records;
498             return @segments
499 27 100 100     61 if (_have_all_segments_p $total_segment_count, @have_segments
500             or $Effort > $Patience);
501             }
502             }
503             }
504              
505             # Search for more volumes by directory scan...
506             {
507 10 100       21 unless (@similar_volume_files_before + @similar_volume_files_after) {
  6         18  
  6         18  
508             # Unlike the simple sequence search, the directory scan finds files
509             # in both directions from the starting point on all axes, but it may
510             # not have been needed to find the first segment. Do it now if not.
511 5         18 my @nearby = _find_similar_files $first_segment->volume->filename;
512 5         11 foreach (@nearby) {
513 13         19 push @similar_volume_files_before, $_->[0];
514 13         25 push @similar_volume_files_after, $_->[1];
515             }
516             }
517             # Any interesting records in volumes before the volume containing the
518             # initial record were probably found while locating the first segment.
519 6         13 foreach my $axis_files (@similar_volume_files_after,
520             reverse @similar_volume_files_before) {
521             # Work forwards on the assumption that sequence numbers are nearer to
522             # the beginning of the filename. (Correct in Internet Archive samples.)
523 23         42 foreach my $fname (@$axis_files) {
524 33         52 my $previousEffort = $Effort;
525 33         96 my $volume = mount WARC::Volume ($fname);
526 33         574 (undef, @records) = _scan_volume $volume, 0, undef,
527             [WARC_Segment_Origin_ID => $origin_id];
528 33         116 _add_segments $total_segment_count, @have_segments, @records;
529 33         64 push @segments, @records;
530 33         64 $Patience += ($Effort - $previousEffort) * scalar @records;
531             return @segments
532 33 100 100     69 if (_have_all_segments_p $total_segment_count, @have_segments
533             or $Effort > $Patience);
534             }
535             }
536             }
537              
538             # If we get to here, we have run out of places to look and the user will
539             # need to build an index instead of relying on heuristics.
540 2         16 return @segments;
541             }
542              
543             =back
544              
545             =cut
546              
547             1;
548             __END__