File Coverage

blib/lib/WARC/Record/Logical/Heuristics.pm

Criterion	Covered	Total	%
statement	216	216	100.0
branch	72	72	100.0
condition	24	24	100.0
subroutine	14	14	100.0
pod	2	2	100.0
total	328	328	100.0

line	stmt	bran	cond	sub	pod	time	code
1							package WARC::Record::Logical::Heuristics; # -- CPerl --
2
3	2			2		69458	use strict;
	2					12
	2					53
4	2			2		9	use warnings;
	2					4
	2					74
5
6							our @ISA = qw();
7
8	2			2		415	use WARC; *WARC::Record::Logical::Heuristics::VERSION = \$WARC::VERSION;
	2					5
	2					70
9
10	2			2		10	use Carp;
	2					4
	2					92
11	2			2		10	use File::Spec;
	2					4
	2					4559
12
13							=head1 NAME
14
15							WARC::Record::Logical::Heuristics - heuristics for locating record segments
16
17							=head1 SYNOPSIS
18
19							use WARC::Record::Logical::Heuristics;
20
21							=head1 DESCRIPTION
22
23							This is an internal module that provides functions for locating record
24							segments when the needed information is not available from an index.
25
26							These mostly assume that IIPC WARC guidelines have been followed, as
27							otherwise there simply is no efficient solution.
28
29							Implementations vary, however, with some using only an incrementing serial
30							number and a constant timestamp from the initiation of the crawl job, while
31							the guidelines and specification envision a timestamp reflecting the first
32							write to that specific file rather than the start of the crawl. Constant
33							timestamps are checked first, since the search is simpler.
34
35							=over
36
37							=item $WARC::Record::Logical::Heuristics::Patience
38
39							This variable sets a threshold used to limit the reach of an unproductive
40							search. This module tracks the "effort" expended (I/O performed) during a
41							search and abandons the search if the threshold is exceeded. Finding
42							results dynamically (and temporarily) increases this threshold during a
43							search, such that this really sets how far the search will go between
44							results before giving up and concluding that there are no more results.
45
46							The search will reach farther if either the WARC files are not compressed,
47							or the "sl" GZIP extension documented in L is used.
48							Decompressing record data to find the next record is considerable effort
49							for larger records, but is not counted for very small records that the
50							system is likely to already have cached after the header has been read.
51
52							=cut
53
54							# These provide a simple mechanism to limit the scope of a search that is
55							# not producing results. Both are localized in the top-level calls.
56
57							our $Patience = 10000; # How much effort to put into a search?
58							our $Effort = 0; # How much have we done so far during this search?
59
60							# Most I/O incurs "effort", represented by incrementing $Effort, while
61							# partial success (finding an interesting record) increases "patience",
62							# represented by incrementing $Patience. The search stops when either
63							# there are no more places to look or $Effort exceeds $Patience.
64
65							=item %WARC::Record::Logical::Heuristics::Effort
66
67							This internal hash indicates how costly certain operations should be
68							considered. The keys and their meanings are subject to change at whim, but
69							this is available for quick tuning if needed. Generally, the better
70							solution is to index your data rather than spend time tuning heuristics.
71
72							=cut
73
74							our %Effort =
75							(# read_record:
76							# effort incurred to read a record header, regardless of compression
77							read_record => 5,
78
79							# gzread_data_per_tick:
80							# number of bytes to read while advancing past a compressed record to
81							# incur one effort point; effort incurred rounds down, even to zero
82							#
83							# this value is a shot-in-the-dark estimate that gunzipping 320 KiB is
84							# equivalent to the open/seek/read process for loading a record header
85							gzread_data_per_tick => 64 * 1024,
86
87							# readdir_files_per_tick:
88							# number of file names to read and check while scanning a directory for
89							# to incur one effort point; effort incurred rounds down, even to zero
90							#
91							# this value is a shot-in-the-dark estimate that reading/matching 1600
92							# file names is equivalent to loading a record header; this estimate
93							# may be high or low depending on the number of axes used in the search
94							readdir_files_per_tick => 320,
95							);
96
97							# Internal functions:
98
99							## @axes = _split_digit_spans( $filename )
100							##
101							## Extract possible sequence numbers from $filename and return list of
102							## array references [PREFIX, NUMBER, SUFFIX] where NUMBER is a field that
103							## can be adjusted to find "nearby" files if NUMBER turns out to actually
104							## be a sequence number. Finds numerous false matches in normal use, but
105							## broad searches cost only time while excessive narrowing causes failure.
106							##
107							## Does not perform I/O; does not increment $Effort.
108
109							sub _split_digit_spans ($) {
110	38			38		6464	my $name = shift;
111	38					74	my @axes = ();
112
113							# Split on zero-width boundaries between digits and non-digits.
114	38					1061	my @pieces = split /(?=[0-9])(?<=[^0-9])\|(?=[^0-9])(?<=[0-9])/, $name;
115							# The @pieces array now contains alternating spans of digits and non-digits.
116
117	38					124	for (my $i = 0; $i < @pieces; $i++) {
118	524	100	100			1818	next unless ($pieces[$i] =~ /^[0-9]+$/ && length($pieces[$i]) < 9);
119							# More than 8 digits is probably not a sequence number and may be
120							# beyond the range of an integer anyway. Use indexes instead of
121							# heuristics if you need to work with a billion WARC files.
122	237					1339	push @axes, [join('', @pieces[0..($i-1)]),
123							$pieces[$i], join('', @pieces[($i+1)..$#pieces])];
124							}
125
126	38					200	return @axes;
127							}
128
129							## @found = _find_nearby_files( $direction, @axes )
130							##
131							## Locate existing files that appear to be part of a contiguous sequence
132							## along an axis in @axes. The $direction argument is either +1 to search
133							## for higher numbers or -1 to search for lower numbers. A direction
134							## value with a magnitude greater than 1 results in skipping possibilities
135							## during the search.
136							##
137							## Returns a list of array references reflecting the files along each axis
138							## from the argument list but omitting axes on which no files were found.
139							##
140							## Performs only directory lookups, which have highly unpredictable costs
141							## and are usually cached by the system; does not increment $Effort.
142
143							sub _find_nearby_files ($@) {
144	20			20		3034	my $direction = shift;
145	20					44	my @found = ();
146
147	20					44	foreach my $axis (@_) {
148	180					299	my @files = ();
149	180					367	my $i = $axis->[1] + $direction; my $file;
	180					227
150	180					2559	while (-f ($file = join '', ($axis->[0],
151							sprintf('%0*d', length $axis->[1], $i),
152							$axis->[2])))
153	42					133	{ push @files, $file; $i += $direction }
	42					600
154	180	100				655	push @found, \@files if scalar @files;
155							}
156
157	20					64	return @found;
158							}
159
160							## @found = _scan_directory_for_axes( $dirname, @axes )
161							##
162							## Locate existing files that may appear to be part of a sequence along an
163							## axis in @axes, using wildcards for long digit spans.
164							##
165							## The $dirname argument specifies the name of a directory to search and
166							## all @axes are interpreted relative to $dirname. This differs from
167							## _find_nearby_files where each axis specifies full absolute filenames.
168							## For this function, the axes are strictly filenames with no directory.
169							##
170							## Returns a list of array references reflecting the files along each axis
171							## from the argument list but omitting axes on which no files were found.
172							##
173							## Performs directory reads; increments $Effort to count file names read.
174
175							sub _scan_directory_for_axes ($@) {
176	16			16		1217	my $dirname = shift;
177	16					25	my $read_count = 0;
178
179							my @re = map {
180	16					38	my $pre = quotemeta $_->[0]; my $post = quotemeta $_->[2];
	60					150
	60					108
181	60					164	$pre =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
	14					59
182	60					156	$post =~ s/(?<=[^0-9])([0-9]{9,})(?=[^0-9])/'[0-9]{'.(length $1).'}'/eg;
	2					16
183	60					92	my $midlen = length $_->[1]; qr/^$pre[0-9]{$midlen}$post/ } @_;
	60					993
184
185	16					31	my $filename;
186	16					22	my @found = ();
187	16	100				961	opendir my $dir, $dirname or croak "$dirname: $!";
188	15					567	while (defined ($filename = readdir $dir)) {
189	435					853	foreach (0 .. $#re) {
190	1740	100				4569	if ($filename =~ $re[$_])
191	148					196	{ push @{$found[$_]}, $filename }
	148					350
192							}
193	435					1123	$read_count++;
194							}
195	15					235	closedir $dir;
196
197	15					108	$Effort += int($read_count / $Effort{readdir_files_per_tick});
198	15					42	return grep {scalar @$_} @found;
	60					213
199							}
200
201							## @similar = _find_similar_files( $seed )
202							##
203							## Locate existing files that may appear to be part of a sequence involving
204							## any digit span in $seed, using wildcards for long digit spans and
205							## searching only the directory containing $seed.
206							##
207							## Returns a list of array references, each containing two array references
208							## for files sorting before and after $seed, reflecting the files along
209							## each axis derived from $seed on which files other than $seed were found.
210							##
211							## Uses _scan_directory_for_axes; does not perform I/O directly.
212
213							sub _find_similar_files ($) {
214	12			12		6342	my $seedfile = shift;
215
216	12					59	my $fs_volname; my $dirname; my $filename;
	12					0
217	12					140	($fs_volname, $dirname, $filename) = File::Spec->splitpath($seedfile);
218
219	12					90	my @found = _scan_directory_for_axes
220							(File::Spec->catpath($fs_volname, $dirname, ''),
221							_split_digit_spans $filename);
222	12					48	my @similar = ();
223	12					29	foreach my $axis_files (@found) {
224	48					74	my @before = (); my @after = ();
	48					62
225	48					69	foreach my $fname (@$axis_files) {
226	114	100				270	if ($fname lt $filename) {
		100
227	32					178	push @before, File::Spec->catpath($fs_volname, $dirname, $fname);
228							} elsif ($fname gt $filename) {
229	34					176	push @after, File::Spec->catpath($fs_volname, $dirname, $fname);
230							}
231							}
232	18					43	push @similar, [[sort {$a cmp $b} @before],
233	48	100				182	[sort {$a cmp $b} @after]] if @before + @after;
	28					52
234							}
235
236	12					59	return @similar;
237							}
238
239							## ($checkpoint, @records) =
240							## _scan_volume( $volume, $start, $end, [$field, $value]... )
241							##
242							## Search $volume for segment records where any $field matches $value
243							## starting at offset $start and ending at or after offset $end. If $end
244							## is an undefined value, searches until end-of-file.
245							##
246							## Only returns records that have a 'WARC-Segment-Number' header.
247							##
248							## The returned $checkpoint is the last record examined, regardless of
249							## header values, and provides a valid offset for resuming a search.
250
251							sub _scan_volume ($$$@) {
252	97			97		5086	my $volume = shift;
253	97					149	my $start = shift;
254	97					136	my $end = shift;
255
256	97					233	my $record = $volume->record_at($start);
257	97					401	my @records = ();
258
259	97		100			396	while ($record && (!defined $end \|\| $record->offset <= $end)) {
			100
260	885					2203	$Effort += $Effort{read_record};
261							next unless (defined $record->field('WARC-Segment-Number')
262	885	100	100			1817	&& grep {defined $record->field($_->[0])} @_);
	327					772
263	224	100				477	push @records, $record if grep {defined $record->field($_->[0])
	286	100				633
264							&& $record->field($_->[0]) eq $_->[1]} @_;
265							} continue { $Effort += int($record->field('Content-Length')
266							/ $Effort{gzread_data_per_tick})
267							if (defined $record->{compression}
268	885	100	100			2053	&& !defined $record->{sl_packed_size});
269	885					1783	$record = $record->next }
270
271	97					390	return $record, @records;
272							}
273
274							=item ( $first_segment, @clues ) = find_first_segment( $record )
275
276							Attempt to locate the first segment of the logical record suggested by the
277							given record without using indexes. Croaks if given a record that does not
278							appear to have been written using WARC segmentation. Returns a
279							C object for the first record and a list of other objects
280							that may be useful for locating continuation records. Returns undef in the
281							first slot if no clear first segment was found, but can still return other
282							records encountered during the search even if the search was ultimately
283							unsuccessful.
284
285							=cut
286
287							## Each "clue" can be a WARC::Record, or a hint in the form of [key => value].
288							##
289							## The hint keys currently are:
290							##
291							## tail => $record
292							## last record examined in initial volume
293							## (a good starting point to search for more segments)
294							##
295							## files_on_axes => [$filename, ...]...
296							## array of arrays from _find_nearby_files
297							## files_from_dir => [[$filename...], [$filename...]]...
298							## array of arrays from _find_similar_files
299							## Note that the filenames are set to undef in these hints as the
300							## corresponding WARC volumes are scanned, with any relevant records
301							## added directly to the clue list as they are found.
302
303							sub find_first_segment {
304	11			11	1	899	local $Patience = $Patience;
305	11					21	local $Effort = 0;
306
307	11					20	my $initial = shift;
308
309	11	100				37	croak 'searching for segments for unsegmented record'
310							unless defined $initial->field('WARC-Segment-Number');
311
312	10					28	my $origin_id = $initial->field('WARC-Segment-Origin-ID');
313	10					23	my @clues = (); my $point; my @records;
	10					18
314
315							# First we search the volume containing the initial record, since
316							# multiple WARC files may have been concatenated together after writing.
317	10					29	($point, @records) = _scan_volume $initial->volume, 0, $initial->offset,
318							[WARC_Segment_Origin_ID => $origin_id], [WARC_Record_ID => $origin_id];
319							# ... @records will always include $initial ...
320	10					40	push @clues, @records, [tail => $point];
321
322	10					23	foreach my $record (@records) {
323	10	100				28	return $record, @clues if $record->field('WARC-Record-ID') eq $origin_id;
324							}
325	7					18	$Patience += $Effort * ((scalar @records) - 1);
326	7	100				27	return undef, @clues if $Effort > $Patience;
327
328							# If we get this far, the first segment must be in another volume.
329							{
330	6					18	my @simple_axes = _split_digit_spans $initial->volume->filename;
331	6					18	my @nearby = _find_nearby_files -1, @simple_axes;
332
333							# A simple sequence number may be in use; we can check these volumes
334							# before reading the directory to handle varying timestamps.
335	6	100				24	push @clues, [files_on_axes => @nearby] if scalar @nearby;
336	6					29	foreach my $axis_files (reverse @nearby) {
337							# Work backwards on the assumption that sequence numbers are nearer
338							# to the end of the filename. (Correct for Wget and Wpull.)
339	4					23	foreach my $name (@$axis_files) {
340	6					11	my $previousEffort = $Effort;
341	6					18	my $volume = mount WARC::Volume ($name);
342	6					121	(undef, @records) = _scan_volume $volume, 0, undef,
343							[WARC_Segment_Origin_ID => $origin_id],
344							[WARC_Record_ID => $origin_id];
345	6					18	push @clues, @records; $name = undef;
	6					10
346	6					13	foreach my $record (@records) {
347	3	100				11	return $record, @clues
348							if $record->field('WARC-Record-ID') eq $origin_id;
349							}
350	5					13	$Patience += ($Effort - $previousEffort) * scalar @records;
351	5	100				26	return undef, @clues if $Effort > $Patience;
352							}
353							}
354							}
355
356							# If we get this far, the first segment is in another volume and multiple
357							# numbers must change to find that other volume. Assume that timestamps
358							# are in use in the file names, confounding the simple sequence search.
359							{
360	6					14	my @nearby = _find_similar_files $initial->volume->filename;
	4					10
	4					17
361
362	4	100				16	push @clues, [files_from_dir => @nearby] if scalar @nearby;
363							# Work forwards on the assumption that sequence numbers are nearer to
364							# the beginning of the filename. (Correct in Internet Archive samples.)
365	4					14	foreach my $fname ((map {reverse @{$_->[0]}} @nearby),
	8					11
	8					19
366							# work backwards within the "before" list on each axis
367							# ... and forwards within the "after" list on each axis
368	8					12	(map {@{$_->[1]}} @nearby)) {
	8					17
369	9					16	my $previousEffort = $Effort;
370	9					33	my $volume = mount WARC::Volume ($fname);
371	9					167	(undef, @records) = _scan_volume $volume, 0, undef,
372							[WARC_Segment_Origin_ID => $origin_id],
373							[WARC_Record_ID => $origin_id];
374	9					26	push @clues, @records; $fname = undef;
	9					17
375	9					21	foreach my $record (@records) {
376	4	100				12	return $record, @clues
377							if $record->field('WARC-Record-ID') eq $origin_id;
378							}
379	7					18	$Patience += ($Effort - $previousEffort) * scalar @records;
380	7	100				25	return undef, @clues if $Effort > $Patience;
381							}
382							}
383
384							# If we get this far, we have run out of places to look and the user will
385							# need to build an index instead of relying on heuristics.
386	1					7	return undef, @clues;
387							}
388
389							=item ( @segments ) = find_continuation( $first_segment, @clues )
390
391							Attempt to locate the continuation segments of a logical record without
392							using indexes. Uses the clues returned from C to aid
393							in the search and returns a list of continuation records found that appear
394							to be part of the same logical record as the given first segment.
395
396							=cut
397
398							sub _add_segments (\$\@\@) {
399	78			78		117	my $total_segment_count_ref = shift;
400	78					109	my $have_segments_ref = shift;
401	78					100	my $new_segments_ref = shift;
402
403	78					177	foreach (@$new_segments_ref) {
404	38					87	$have_segments_ref->[$_->field('WARC-Segment-Number')]++;
405	38	100				107	$$total_segment_count_ref = $_->field('WARC-Segment-Number')
406							if defined $_->field('WARC-Segment-Total-Length');
407							}
408							}
409							sub _have_all_segments_p ($@) {
410	86			86		135	my $total_segment_count = shift;
411
412							# We cannot have all segments if we have not seen the last segment yet.
413	86	100				380	return 0 unless defined $total_segment_count;
414
415							# We have seen the last segment, do we have all of the others?
416	24	100				59	for (my $i = 2; $i < $total_segment_count; $i++) { return 0 unless $_[$i] }
	59					187
417							# Start the search at 2 because offsets 0 and 1 are not used here.
418
419	9					83	return 1;
420							}
421
422							sub find_continuation {
423	16			16	1	6355	local $Patience = $Patience;
424	16					33	local $Effort = 0;
425
426	16					27	my $first_segment = shift; my $origin_id = $first_segment->id;
	16					51
427
428							# First we unpack the clues and check if all segments were found while
429							# searching for the first segment.
430	16					37	my @segments = (); my @nearby_volume_files = ();
	16					26
431	16					27	my $have_tail = 0; my $point = undef;
	16					26
432	16					24	my @similar_volume_files_before = (); my @similar_volume_files_after = ();
	16					30
433	16					39	foreach my $clue (@_) {
434	35	100				107	if (UNIVERSAL::isa($clue, 'WARC::Record')) {
		100
435	21	100				58	push @segments, $clue unless $clue == $first_segment;
436							} elsif (ref $clue eq 'ARRAY') {
437	13					30	my $tag = shift @$clue;
438	13	100				40	if ($tag eq 'tail') {
		100
		100
439	7					11	$have_tail = 1;
440	7					14	$point = shift @$clue;
441							} elsif ($tag eq 'files_on_axes') {
442	3					9	push @nearby_volume_files, map {[grep defined, @$_]} @$clue;
	6					19
443							} elsif ($tag eq 'files_from_dir') {
444	2					8	foreach (@$clue) {
445	6					8	push @similar_volume_files_before, [grep defined, @{$_->[0]}];
	6					19
446	6					10	push @similar_volume_files_after, [grep defined, @{$_->[1]}];
	6					17
447							}
448	1					25	} else { die "unrecognized hint tag: $tag" }
449	1					10	} else { die "unrecognized clue" }
450	33					76	$clue = undef;
451							}
452	14					36	@similar_volume_files_before = grep {scalar @$_} @similar_volume_files_before;
	6					12
453	14					25	@similar_volume_files_after = grep {scalar @$_} @similar_volume_files_after;
	6					13
454
455	14					17	my @have_segments = (); my $total_segment_count = undef;
	14					23
456	14					44	_add_segments $total_segment_count, @have_segments, @segments;
457
458	14	100				37	return @segments if _have_all_segments_p $total_segment_count, @have_segments;
459
460							# If we get to here, at least one segment was not found while searching
461							# for the first segment, so we will need to search too.
462	12					24	my @records = ();
463
464							# Pick up where find_first_segment left off...
465	12	100				35	if ($point) {
		100
466	4					13	(undef, @records) = _scan_volume $point->volume, $point->offset, undef,
467							[WARC_Segment_Origin_ID => $origin_id];
468	4					17	_add_segments $total_segment_count, @have_segments, @records;
469	4					9	push @segments, @records;
470							} elsif (!$have_tail) {
471							# The search may have begun with the first segment directly; ensure
472							# that we scan the entire volume containing the first segment later.
473	7					23	push @nearby_volume_files, [$first_segment->volume->filename];
474							}
475	12					137	$Patience += $Effort * scalar @records;
476							return @segments
477	12	100	100			26	if (_have_all_segments_p $total_segment_count, @have_segments
478							or $Effort > $Patience);
479
480							# Search for more volumes in a simple sequence...
481							{
482	10					23	my @simple_axes = _split_digit_spans $first_segment->volume->filename;
483	10					28	my @nearby = _find_nearby_files 1, @simple_axes;
484
485							# Were more volumes found in the simple sequence search now or previously?
486	10					33	foreach my $axis_files ((reverse @nearby),
487							(reverse @nearby_volume_files)) {
488							# Work backwards on the assumption that sequence numbers are nearer
489							# to the end of the filename. (Correct for Wget and Wpull.)
490	17					34	foreach my $name (@$axis_files) {
491	27					46	my $previousEffort = $Effort;
492	27					423	my $volume = mount WARC::Volume ($name);
493	27					507	(undef, @records) = _scan_volume $volume, 0, undef,
494							[WARC_Segment_Origin_ID => $origin_id];
495	27					93	_add_segments $total_segment_count, @have_segments, @records;
496	27					50	push @segments, @records;
497	27					57	$Patience += ($Effort - $previousEffort) * scalar @records;
498							return @segments
499	27	100	100			61	if (_have_all_segments_p $total_segment_count, @have_segments
500							or $Effort > $Patience);
501							}
502							}
503							}
504
505							# Search for more volumes by directory scan...
506							{
507	10	100				21	unless (@similar_volume_files_before + @similar_volume_files_after) {
	6					18
	6					18
508							# Unlike the simple sequence search, the directory scan finds files
509							# in both directions from the starting point on all axes, but it may
510							# not have been needed to find the first segment. Do it now if not.
511	5					18	my @nearby = _find_similar_files $first_segment->volume->filename;
512	5					11	foreach (@nearby) {
513	13					19	push @similar_volume_files_before, $_->[0];
514	13					25	push @similar_volume_files_after, $_->[1];
515							}
516							}
517							# Any interesting records in volumes before the volume containing the
518							# initial record were probably found while locating the first segment.
519	6					13	foreach my $axis_files (@similar_volume_files_after,
520							reverse @similar_volume_files_before) {
521							# Work forwards on the assumption that sequence numbers are nearer to
522							# the beginning of the filename. (Correct in Internet Archive samples.)
523	23					42	foreach my $fname (@$axis_files) {
524	33					52	my $previousEffort = $Effort;
525	33					96	my $volume = mount WARC::Volume ($fname);
526	33					574	(undef, @records) = _scan_volume $volume, 0, undef,
527							[WARC_Segment_Origin_ID => $origin_id];
528	33					116	_add_segments $total_segment_count, @have_segments, @records;
529	33					64	push @segments, @records;
530	33					64	$Patience += ($Effort - $previousEffort) * scalar @records;
531							return @segments
532	33	100	100			69	if (_have_all_segments_p $total_segment_count, @have_segments
533							or $Effort > $Patience);
534							}
535							}
536							}
537
538							# If we get to here, we have run out of places to look and the user will
539							# need to build an index instead of relying on heuristics.
540	2					16	return @segments;
541							}
542
543							=back
544
545							=cut
546
547							1;
548							__END__