File Coverage

blib/lib/WARC/Index/Entry.pm
Criterion Covered Total %
statement 66 66 100.0
branch 36 36 100.0
condition 6 6 100.0
subroutine 13 13 100.0
pod 7 7 100.0
total 128 128 100.0


line stmt bran cond sub pod time code
1             package WARC::Index::Entry; # -*- CPerl -*-
2              
3 27     27   70399 use strict;
  27         64  
  27         770  
4 27     27   138 use warnings;
  27         51  
  27         1558  
5              
6             our @ISA = qw();
7              
8             require WARC; *WARC::Index::Entry::VERSION = \$WARC::VERSION;
9              
10 27     27   171 use Carp;
  27         68  
  27         24647  
11              
12             require WARC::Record::Stub;
13              
14             =head1 NAME
15              
16             WARC::Index::Entry - abstract base class for WARC::Index entries
17              
18             =head1 SYNOPSIS
19              
20             use WARC; # or ...
21             use WARC::Index;
22              
23             # WARC::Index::Entry objects are returned from directly searching an index
24              
25             # match search criteria against entry
26             $distance = $entry->distance( ... );
27             @report = $entry->distance( ... );
28              
29             $index = $entry->index; # get index containing entry
30             $volume = $entry->volume; # get WARC::Volume containing record
31             $record = $entry->record; # get WARC record
32              
33             =head1 DESCRIPTION
34              
35             =head2 Common Methods
36              
37             Entries from all index systems support these methods:
38              
39             =over
40              
41             =item @report = $entry-Edistance( ... )
42              
43             =item $distance = $entry-Edistance( ... )
44              
45             In list context, return a detailed report mapping each search I to a
46             distance value. In scalar context, return an overall summary distance,
47             such that sorting entries by the return values of this method in ascending
48             order will place the closest matches at the top of the list.
49              
50             A valid distance is non-negative. Negative distance values indicate that
51             the record does not match the criteria at all. An undefined value
52             indicates that the entry is from an index that does not store the
53             information needed to evaluate distance for that search key. Undefined
54             values are ignored when computing the summarized distance, but the
55             summarized distance will be negative if any keys do not match at all and
56             itself undefined if none of the requested keys can be evaluated.
57              
58             For details on available search keys, see the L<"Search Keys"
59             section|WARC::Collection/"Search Keys"> of the C page.
60             If multiple values are given in an arrayref, the best match is reported.
61              
62             =cut
63              
64             sub distance {
65 698     698 1 44643 my $self = shift;
66              
67 698 100       1321 unless (defined wantarray)
68 1         224 { carp "calling 'distance' method in void context"; return }
  1         69  
69              
70 697 100       1272 croak "no arguments given to 'distance' method"
71             unless scalar @_;
72 696 100       1464 croak "odd number of arguments given to 'distance' method"
73             if scalar @_ % 2;
74              
75 695 100       1089 if (wantarray) { return $self->_distance_report(@_) }
  20         43  
76 675         1158 else { return $self->_distance_summary(@_) }
77             }
78              
79             sub _distance_report {
80 20     20   27 my $self = shift;
81 20         28 my @report = ();
82              
83 20         44 for (my $i = 0; $i < @_; $i += 2)
84 24         60 { push @report, $_[$i] => $self->_distance_for_item($_[$i] => $_[1+$i]) }
85              
86 20         62 return @report;
87             }
88              
89             sub _distance_summary {
90 675     675   871 my $self = shift;
91              
92 675         817 my $summary = 0;
93 675         774 my $match = 1;
94 675         796 my $seen = 0;
95              
96 675         1126 while (@_) {
97 936         1701 my $distance = $self->_distance_for_item(splice @_, 0, 2);
98 934 100       1571 next unless defined $distance;
99 920         1141 $seen++;
100 920 100       1463 if ($distance < 0) { $match = 0 }
  475         869  
101 445         853 else { $summary += $distance }
102             }
103              
104 673 100       1373 return undef unless $seen;
105 660 100       1991 return $match ? $summary : -(1+$summary);
106             }
107              
108             # Single Point of Truth for index key definitions
109             our %_distance_value_map =
110             ( time => [numeric => 'time'],
111             record_id => [exact => 'record_id'],
112             segment_origin_id => [exact => 'segment_origin_id'],
113             url => [exact => 'url'],
114             url_prefix => [prefix => 'url'],
115             );
116              
117             sub _distance_for_item {
118 960     960   1249 my $self = shift;
119 960         1207 my $item = shift;
120 960         1565 my @sought = (scalar shift);
121 960 100       3234 @sought = @{$sought[0]} if UNIVERSAL::isa($sought[0], 'ARRAY');
  55         136  
122              
123             croak "index distance requested for unknown item $item"
124 960 100       1909 unless defined $_distance_value_map{$item};
125 959         1940 my $actual = $self->value($_distance_value_map{$item}[1]);
126              
127 959 100       2647 return undef unless defined $actual;
128              
129 943         1479 my $mode = $_distance_value_map{$item}[0];
130              
131 943         1189 my $distance = -1;
132 943 100       1695 if ($mode eq 'exact') {
    100          
    100          
133 725         1108 foreach my $sought (@sought) {
134 758 100       1531 $distance = 0 if $sought eq $actual;
135             }
136             } elsif ($mode eq 'numeric') {
137 124         235 foreach my $sought (@sought) {
138 128         357 my $here = abs($actual - $sought);
139 128 100 100     2671 $distance = $here if $distance < 0 || $here < $distance;
140             }
141             } elsif ($mode eq 'prefix') {
142 93         140 foreach my $sought (@sought) {
143 97 100       239 next unless $sought eq substr $actual, 0, length $sought;
144 56         81 my $here = length($actual) - length($sought);
145 56 100 100     179 $distance = $here if $distance < 0 || $here < $distance;
146             }
147 1         10 } else { die "unknown mode '$mode' for item '$item'" }
148 942         1780 return $distance;
149             }
150              
151             =item $index = $entry-Eindex
152              
153             Return the C containing this entry.
154              
155             =cut
156              
157             sub index {
158 1     1 1 919 die __PACKAGE__." is an abstract base class and "
159             .(ref shift)." must override the 'index' method"
160             }
161              
162             =item $volume = $entry-Evolume
163              
164             Return the C object representing the file in which this index
165             entry's record is located.
166              
167             =cut
168              
169             sub volume {
170 1     1 1 549 die __PACKAGE__." is an abstract base class and "
171             .(ref shift)." must override the 'volume' method"
172             }
173              
174             =item $record = $entry-Erecord( ... )
175              
176             Return the C this index entry represents. Arguments if given
177             are additional key =E value pairs for the record object.
178              
179             =cut
180              
181             sub record {
182 235     235 1 10551 my $self = shift;
183 235         465 return new WARC::Record::Stub ($self->volume, $self->record_offset, @_);
184             }
185              
186             =item $record_offset = $entry-Erecord_offset
187              
188             Return the file offset at which this index entry's record is located.
189              
190             =cut
191              
192             sub record_offset {
193 1     1 1 428 die __PACKAGE__." is an abstract base class and "
194             .(ref shift)." must override the 'offset' method"
195             }
196              
197             =item $value = $entry-Evalue( $key )
198              
199             Return the value this index entry holds for a given search key.
200              
201             =cut
202              
203             sub value {
204 1     1 1 422 die __PACKAGE__." is an abstract base class and "
205             .(ref shift)." must override the 'value' method"
206             }
207              
208             =item $tag = $entry-Etag
209              
210             Return a tag for this index entry. The exact format of the tag is
211             unspecified and platform-dependent. Two index entries that refer to
212             different records are guaranteed (if the underlying system software behaves
213             correctly) to have different tag values, while two entries that refer to
214             the same record in the same volume will normally have the same tag value,
215             except in edge cases.
216              
217             =cut
218              
219             sub tag {
220 44     44 1 865 my $self = shift;
221 44         164 return (($self->volume->_file_tag).':'.($self->record_offset));
222             }
223              
224             =back
225              
226             =head2 Optional Methods
227              
228             Some index entries may additionally support any of these methods:
229              
230             =over
231              
232             =item $next_entry = $entry-Enext
233              
234             Indexes with an inherent sequence of entries may provide a method to obtain
235             the next entry in the index. Some index systems have this, while others do
236             not have a meaningful order amongst their entries.
237              
238             =item $position = $entry-Eentry_position
239              
240             Indexes with an inherent sequence of entries may provide a method to obtain
241             some kind of index-specific entry number or location parameter. This is
242             most useful for metaindexes to record the location of an index entry.
243              
244             =back
245              
246             =cut
247              
248             1;
249             __END__