| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WARC::Collection;					# -*- CPerl -*- | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 29 |  |  | 29 |  | 70785 | use strict; | 
|  | 29 |  |  |  |  | 60 |  | 
|  | 29 |  |  |  |  | 836 |  | 
| 4 | 29 |  |  | 29 |  | 158 | use warnings; | 
|  | 29 |  |  |  |  | 52 |  | 
|  | 29 |  |  |  |  | 1466 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | our @ISA = qw(); | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | require WARC; *WARC::Collection::VERSION = \$WARC::VERSION; | 
| 9 |  |  |  |  |  |  |  | 
| 10 | 29 |  |  | 29 |  | 160 | use Carp; | 
|  | 29 |  |  |  |  | 64 |  | 
|  | 29 |  |  |  |  | 21641 |  | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | require WARC::Index; | 
| 13 |  |  |  |  |  |  | require WARC::Index::Entries; | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | =head1 NAME | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | WARC::Collection - Interface to a group of WARC files | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | use WARC::Collection; | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | $collection = assemble WARC::Collection ($index_1, $index_2, ...); | 
| 24 |  |  |  |  |  |  | $collection = assemble WARC::Collection from => ($index_1, ...); | 
| 25 |  |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  | $yes_or_no = $collection->searchable( $key ); | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | $record = $collection->search(url => $url, time => $when); | 
| 29 |  |  |  |  |  |  | @records = $collection->search(url => $url, time => $when); | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | =cut | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | # This implementation uses a hash as the underlying structure. | 
| 34 |  |  |  |  |  |  | #  Keys defined by this class: | 
| 35 |  |  |  |  |  |  | # | 
| 36 |  |  |  |  |  |  | #   indexes | 
| 37 |  |  |  |  |  |  | #	Array of indexes used for this collection. | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | The C class is the primary means by which user code is | 
| 42 |  |  |  |  |  |  | expected to use the WARC library.  This class uses indexes to efficiently | 
| 43 |  |  |  |  |  |  | search for records in one or more WARC files. | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | =head2 Search Keys | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | The C method accepts a list of parameters as I =E I | 
| 48 |  |  |  |  |  |  | pairs with each pair narrowing the search, sorting the results, or both, | 
| 49 |  |  |  |  |  |  | indicated in the following list with S<"C<[N ]>">, S<"C<[ S]>">, or "C<[NS]>", | 
| 50 |  |  |  |  |  |  | respectively. | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | Supplying an array reference as a I indicates a search where any of | 
| 53 |  |  |  |  |  |  | the values in the array are acceptable.  This does not affect sorting. | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | The same search keys documented here are used for searching indexes, since | 
| 56 |  |  |  |  |  |  | C is a wrapper around one or more indexes, but index | 
| 57 |  |  |  |  |  |  | support modules do not sort their results.  Only C sorts | 
| 58 |  |  |  |  |  |  | the returned entries, so keys listed below as "sort-only" are ignored by | 
| 59 |  |  |  |  |  |  | the index support modules. | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | The keys supported are: | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | =over | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | =item C<[N ]> url | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | An exact match for a URL. | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | =item C<[NS]> url_prefix | 
| 70 |  |  |  |  |  |  |  | 
| 71 |  |  |  |  |  |  | A prefix match for a URL.  Prefers records with shorter URLs. | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | =item C<[ S]> time | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | Prefer records collected nearer to the requested time. | 
| 76 |  |  |  |  |  |  |  | 
| 77 |  |  |  |  |  |  | =item C<[N ]> record_id | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | An exact match for a (presumably unique) WARC-Record-ID. | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | =item C<[N ]> segment_origin_id | 
| 82 |  |  |  |  |  |  |  | 
| 83 |  |  |  |  |  |  | Exact match for continuation records for a WARC-Record-ID that identifies a | 
| 84 |  |  |  |  |  |  | logical record stored using WARC record segmentation.  Searching on this | 
| 85 |  |  |  |  |  |  | key returns only the continuation records. | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | =back | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | =for comment | 
| 90 |  |  |  |  |  |  | Matching these keys is implemented in WARC::Index::Entry::_distance_for_item | 
| 91 |  |  |  |  |  |  | via %_distance_value_map and in various index support modules. | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | =head2 Methods | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | =over | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | =item $collection = assemble WARC::Collection ($index_1, $index_2, ...); | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | =item $collection = assemble WARC::Collection from =E ($index_1, ...); | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | Assemble a collection of WARC files from one index or multiple indexes, | 
| 102 |  |  |  |  |  |  | specified either as objects derived from C or filenames. | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | While multiple indexes can be used in a collection, note that searching a | 
| 105 |  |  |  |  |  |  | collection requires individually searching every index in the collection. | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | =cut | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | sub assemble { | 
| 110 | 20 |  |  | 20 | 1 | 7220 | my $class = shift; | 
| 111 | 20 | 100 | 100 |  |  | 108 | shift if scalar @_ and $_[0] eq 'from';	# discard optional noise word | 
| 112 |  |  |  |  |  |  |  | 
| 113 | 20 | 100 |  |  |  | 378 | carp "assembling empty collection" unless scalar @_; | 
| 114 |  |  |  |  |  |  |  | 
| 115 | 20 |  |  |  |  | 51 | my @indexes = (); | 
| 116 |  |  |  |  |  |  |  | 
| 117 | 20 |  |  |  |  | 41 | while (@_) { | 
| 118 | 23 |  |  |  |  | 92 | my $index = shift; | 
| 119 |  |  |  |  |  |  |  | 
| 120 | 23 | 100 |  |  |  | 92 | if (UNIVERSAL::isa($index, 'WARC::Index')) | 
| 121 | 10 |  |  |  |  | 26 | { push @indexes, $index }	# add index object to list | 
| 122 |  |  |  |  |  |  | else {			# or assume filename and find an index handler | 
| 123 | 13 |  |  |  |  | 36 | my $isys = WARC::Index::find_handler($index); | 
| 124 | 13 | 100 |  |  |  | 174 | croak "no known handler for index '$index'" unless $isys; | 
| 125 | 12 |  |  |  |  | 35 | push @indexes, (attach $isys $index); | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  | } | 
| 128 |  |  |  |  |  |  |  | 
| 129 | 19 |  |  |  |  | 182 | bless { indexes => \@indexes }, $class | 
| 130 |  |  |  |  |  |  | } | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | =item $yes_or_no = $collection-Esearchable( $key ) | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | Return true or false to reflect if any index in the collection can search | 
| 135 |  |  |  |  |  |  | for the requested key. | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | =cut | 
| 138 |  |  |  |  |  |  |  | 
| 139 |  |  |  |  |  |  | sub searchable { | 
| 140 | 15 |  |  | 15 | 1 | 346 | my $self = shift; | 
| 141 | 15 |  |  |  |  | 20 | my $key = shift; | 
| 142 |  |  |  |  |  |  |  | 
| 143 | 15 |  |  |  |  | 18 | foreach my $index (@{$self->{indexes}}) | 
|  | 15 |  |  |  |  | 39 |  | 
| 144 | 18 | 100 |  |  |  | 54 | { return 1 if $index->searchable($key) } | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | # none of the indexes recognize $key | 
| 147 | 4 |  |  |  |  | 25 | return 0; | 
| 148 |  |  |  |  |  |  | } | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  | =item $record = $collection-Esearch( ... ) | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | =item @records = $collection-Esearch( ... ) | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | Search the indexes for records matching the parameters and return the best | 
| 155 |  |  |  |  |  |  | match in scalar context or a list of all matches in list context.  The | 
| 156 |  |  |  |  |  |  | returned values are C objects. | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | See L"Search Keys"> for more information about the parameters. | 
| 159 |  |  |  |  |  |  |  | 
| 160 |  |  |  |  |  |  | =cut | 
| 161 |  |  |  |  |  |  |  | 
| 162 |  |  |  |  |  |  | sub search { | 
| 163 | 66 |  |  | 66 | 1 | 13980 | my $self = shift; | 
| 164 |  |  |  |  |  |  |  | 
| 165 | 66 | 100 |  |  |  | 142 | unless (defined wantarray) | 
| 166 | 1 |  |  |  |  | 117 | { carp "calling 'search' method in void context"; return } | 
|  | 1 |  |  |  |  | 73 |  | 
| 167 |  |  |  |  |  |  |  | 
| 168 | 65 | 100 |  |  |  | 210 | croak "no arguments given to 'search' method" | 
| 169 |  |  |  |  |  |  | unless scalar @_; | 
| 170 | 64 | 100 |  |  |  | 231 | croak "odd number of arguments given to 'search' method" | 
| 171 |  |  |  |  |  |  | if scalar @_ % 2; | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | # collect all matches from all indexes | 
| 174 | 63 |  |  |  |  | 89 | my %results = ();	# map:  tag => array of index entries for record | 
| 175 | 63 | 100 |  |  |  | 395 | if (grep UNIVERSAL::isa($_, 'ARRAY'), @_) { | 
| 176 |  |  |  |  |  |  | # at least one parameter is an arrayref; perform nested loop join | 
| 177 | 19 |  |  |  |  | 46 | my @step = @_; my @state = (); my @varpos = (); | 
|  | 19 |  |  |  |  | 29 |  | 
|  | 19 |  |  |  |  | 25 |  | 
| 178 | 19 |  |  |  |  | 46 | for (my $i = 1; $i <= $#step; $i += 2) { | 
| 179 | 29 | 100 |  |  |  | 83 | next unless UNIVERSAL::isa($step[$i], 'ARRAY'); | 
| 180 | 20 |  |  |  |  | 33 | push @state, 0; | 
| 181 | 20 |  |  |  |  | 28 | push @varpos, $i; | 
| 182 | 20 |  |  |  |  | 85 | $step[$i] = $step[$i]->[0]; | 
| 183 |  |  |  |  |  |  | } | 
| 184 |  |  |  |  |  |  | # search indexes with all combinations from the input | 
| 185 | 19 |  |  |  |  | 27 | while ($state[0] <= $#{$_[$varpos[0]]}) { | 
|  | 54 |  |  |  |  | 129 |  | 
| 186 | 35 |  |  |  |  | 44 | foreach my $index (@{$self->{indexes}}) | 
|  | 35 |  |  |  |  | 66 |  | 
| 187 | 35 |  |  |  |  | 77 | { foreach my $entry ($index->search(@step)) | 
| 188 | 38 |  |  |  |  | 265 | { push @{$results{$entry->tag}}, $entry } } | 
|  | 38 |  |  |  |  | 82 |  | 
| 189 |  |  |  |  |  |  | } continue { | 
| 190 |  |  |  |  |  |  | # count in variable base in @state using the input arrayrefs | 
| 191 | 35 |  |  |  |  | 271 | my $i = $#state; | 
| 192 | 35 |  | 100 |  |  | 83 | $i-- while $i > 0 && $state[$i] >= $#{$_[$varpos[$i]]}; | 
|  | 8 |  |  |  |  | 30 |  | 
| 193 | 35 |  |  |  |  | 101 | $step[$varpos[$i]] = $_[$varpos[$i]]->[++$state[$i]]; | 
| 194 | 35 |  |  |  |  | 99 | $step[$varpos[$i]] = $_[$varpos[$i]]->[$state[$i] = 0] | 
| 195 |  |  |  |  |  |  | while ++$i <= $#state; | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  | } else { | 
| 198 |  |  |  |  |  |  | # simple case with single values; only one scan needed | 
| 199 | 44 |  |  |  |  | 61 | foreach my $index (@{$self->{indexes}}) | 
|  | 44 |  |  |  |  | 94 |  | 
| 200 | 66 |  |  |  |  | 269 | { foreach my $entry ($index->search(@_)) | 
| 201 | 70 |  |  |  |  | 395 | { push @{$results{$entry->tag}}, $entry } } | 
|  | 70 |  |  |  |  | 135 |  | 
| 202 |  |  |  |  |  |  | } | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | # coalesce and sort the collected index entries | 
| 205 |  |  |  |  |  |  | my @results = | 
| 206 | 63 |  |  |  |  | 251 | map {coalesce WARC::Index::Entries ($results{$_})} keys %results; | 
|  | 108 |  |  |  |  | 302 |  | 
| 207 |  |  |  |  |  |  | @results =	# sort by distance using Schwartzian transform | 
| 208 | 108 |  |  |  |  | 203 | (map { $_->[0] } sort { $a->[1] <=> $b->[1] } | 
|  | 89 |  |  |  |  | 131 |  | 
| 209 | 63 |  |  |  |  | 114 | map { [$_, scalar $_->distance(@_)] } @results); | 
|  | 108 |  |  |  |  | 230 |  | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | # return either the entire sorted list or the best match | 
| 212 | 63 | 100 |  |  |  | 154 | if (wantarray) { return map {$_->record(collection => $self)} @results } | 
|  | 31 |  |  |  |  | 48 |  | 
|  | 67 |  |  |  |  | 1124 |  | 
| 213 |  |  |  |  |  |  | else		 { return (scalar @results | 
| 214 | 32 | 100 |  |  |  | 102 | ? $results[0]->record(collection => $self) : undef) } | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | =back | 
| 218 |  |  |  |  |  |  |  | 
| 219 |  |  |  |  |  |  | =cut | 
| 220 |  |  |  |  |  |  |  | 
| 221 |  |  |  |  |  |  | 1; | 
| 222 |  |  |  |  |  |  | __END__ |