File Coverage

blib/lib/WARC/Collection.pm
Criterion Covered Total %
statement 70 70 100.0
branch 24 24 100.0
condition 6 6 100.0
subroutine 6 6 100.0
pod 3 3 100.0
total 109 109 100.0


line stmt bran cond sub pod time code
1             package WARC::Collection; # -*- CPerl -*-
2              
3 27     27   70799 use strict;
  27         71  
  27         804  
4 27     27   134 use warnings;
  27         53  
  27         1494  
5              
6             our @ISA = qw();
7              
8             require WARC; *WARC::Collection::VERSION = \$WARC::VERSION;
9              
10 27     27   195 use Carp;
  27         76  
  27         21773  
11              
12             require WARC::Index;
13             require WARC::Index::Entries;
14              
15             =head1 NAME
16              
17             WARC::Collection - Interface to a group of WARC files
18              
19             =head1 SYNOPSIS
20              
21             use WARC::Collection;
22              
23             $collection = assemble WARC::Collection ($index_1, $index_2, ...);
24             $collection = assemble WARC::Collection from => ($index_1, ...);
25              
26             $yes_or_no = $collection->searchable( $key );
27              
28             $record = $collection->search(url => $url, time => $when);
29             @records = $collection->search(url => $url, time => $when);
30              
31             =cut
32              
33             # This implementation uses a hash as the underlying structure.
34             # Keys defined by this class:
35             #
36             # indexes
37             # Array of indexes used for this collection.
38              
39             =head1 DESCRIPTION
40              
41             The C class is the primary means by which user code is
42             expected to use the WARC library. This class uses indexes to efficiently
43             search for records in one or more WARC files.
44              
45             =head2 Search Keys
46              
47             The C method accepts a list of parameters as I =E I
48             pairs with each pair narrowing the search, sorting the results, or both,
49             indicated in the following list with S<"C<[N ]>">, S<"C<[ S]>">, or "C<[NS]>",
50             respectively.
51              
52             Supplying an array reference as a I indicates a search where any of
53             the values in the array are acceptable. This does not affect sorting.
54              
55             The same search keys documented here are used for searching indexes, since
56             C is a wrapper around one or more indexes, but index
57             support modules do not sort their results. Only C sorts
58             the returned entries, so keys listed below as "sort-only" are ignored by
59             the index support modules.
60              
61             The keys supported are:
62              
63             =over
64              
65             =item C<[N ]> url
66              
67             An exact match for a URL.
68              
69             =item C<[NS]> url_prefix
70              
71             A prefix match for a URL. Prefers records with shorter URLs.
72              
73             =item C<[ S]> time
74              
75             Prefer records collected nearer to the requested time.
76              
77             =item C<[N ]> record_id
78              
79             An exact match for a (presumably unique) WARC-Record-ID.
80              
81             =item C<[N ]> segment_origin_id
82              
83             Exact match for continuation records for a WARC-Record-ID that identifies a
84             logical record stored using WARC record segmentation. Searching on this
85             key returns only the continuation records.
86              
87             =back
88              
89             =for comment
90             Matching these keys is implemented in WARC::Index::Entry::_distance_for_item
91             via %_distance_value_map and in various index support modules.
92              
93             =head2 Methods
94              
95             =over
96              
97             =item $collection = assemble WARC::Collection ($index_1, $index_2, ...);
98              
99             =item $collection = assemble WARC::Collection from =E ($index_1, ...);
100              
101             Assemble a collection of WARC files from one index or multiple indexes,
102             specified either as objects derived from C or filenames.
103              
104             While multiple indexes can be used in a collection, note that searching a
105             collection requires individually searching every index in the collection.
106              
107             =cut
108              
109             sub assemble {
110 20     20 1 7341 my $class = shift;
111 20 100 100     133 shift if scalar @_ and $_[0] eq 'from'; # discard optional noise word
112              
113 20 100       394 carp "assembling empty collection" unless scalar @_;
114              
115 20         46 my @indexes = ();
116              
117 20         49 while (@_) {
118 23         93 my $index = shift;
119              
120 23 100       106 if (UNIVERSAL::isa($index, 'WARC::Index'))
121 10         31 { push @indexes, $index } # add index object to list
122             else { # or assume filename and find an index handler
123 13         34 my $isys = WARC::Index::find_handler($index);
124 13 100       175 croak "no known handler for index '$index'" unless $isys;
125 12         34 push @indexes, (attach $isys $index);
126             }
127             }
128              
129 19         194 bless { indexes => \@indexes }, $class
130             }
131              
132             =item $yes_or_no = $collection-Esearchable( $key )
133              
134             Return true or false to reflect if any index in the collection can search
135             for the requested key.
136              
137             =cut
138              
139             sub searchable {
140 15     15 1 440 my $self = shift;
141 15         26 my $key = shift;
142              
143 15         23 foreach my $index (@{$self->{indexes}})
  15         40  
144 18 100       57 { return 1 if $index->searchable($key) }
145              
146             # none of the indexes recognize $key
147 4         36 return 0;
148             }
149              
150             =item $record = $collection-Esearch( ... )
151              
152             =item @records = $collection-Esearch( ... )
153              
154             Search the indexes for records matching the parameters and return the best
155             match in scalar context or a list of all matches in list context. The
156             returned values are C objects.
157              
158             See L for more information about the parameters.
159              
160             =cut
161              
162             sub search {
163 66     66 1 14426 my $self = shift;
164              
165 66 100       154 unless (defined wantarray)
166 1         142 { carp "calling 'search' method in void context"; return }
  1         66  
167              
168 65 100       225 croak "no arguments given to 'search' method"
169             unless scalar @_;
170 64 100       242 croak "odd number of arguments given to 'search' method"
171             if scalar @_ % 2;
172              
173             # collect all matches from all indexes
174 63         99 my %results = (); # map: tag => array of index entries for record
175 63 100       450 if (grep UNIVERSAL::isa($_, 'ARRAY'), @_) {
176             # at least one parameter is an arrayref; perform nested loop join
177 19         52 my @step = @_; my @state = (); my @varpos = ();
  19         33  
  19         31  
178 19         53 for (my $i = 1; $i <= $#step; $i += 2) {
179 29 100       94 next unless UNIVERSAL::isa($step[$i], 'ARRAY');
180 20         35 push @state, 0;
181 20         31 push @varpos, $i;
182 20         97 $step[$i] = $step[$i]->[0];
183             }
184             # search indexes with all combinations from the input
185 19         42 while ($state[0] <= $#{$_[$varpos[0]]}) {
  54         149  
186 35         56 foreach my $index (@{$self->{indexes}})
  35         84  
187 35         86 { foreach my $entry ($index->search(@step))
188 38         317 { push @{$results{$entry->tag}}, $entry } }
  38         80  
189             } continue {
190             # count in variable base in @state using the input arrayrefs
191 35         287 my $i = $#state;
192 35   100     91 $i-- while $i > 0 && $state[$i] >= $#{$_[$varpos[$i]]};
  8         31  
193 35         117 $step[$varpos[$i]] = $_[$varpos[$i]]->[++$state[$i]];
194 35         106 $step[$varpos[$i]] = $_[$varpos[$i]]->[$state[$i] = 0]
195             while ++$i <= $#state;
196             }
197             } else {
198             # simple case with single values; only one scan needed
199 44         64 foreach my $index (@{$self->{indexes}})
  44         106  
200 66         275 { foreach my $entry ($index->search(@_))
201 70         436 { push @{$results{$entry->tag}}, $entry } }
  70         156  
202             }
203              
204             # coalesce and sort the collected index entries
205             my @results =
206 63         286 map {coalesce WARC::Index::Entries ($results{$_})} keys %results;
  108         347  
207             @results = # sort by distance using Schwartzian transform
208 108         212 (map { $_->[0] } sort { $a->[1] <=> $b->[1] }
  89         169  
209 63         130 map { [$_, scalar $_->distance(@_)] } @results);
  108         248  
210              
211             # return either the entire sorted list or the best match
212 63 100       158 if (wantarray) { return map {$_->record(collection => $self)} @results }
  31         97  
  67         1169  
213             else { return (scalar @results
214 32 100       117 ? $results[0]->record(collection => $self) : undef) }
215             }
216              
217             =back
218              
219             =cut
220              
221             1;
222             __END__