| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package WARC::Index;						# -*- CPerl -*- | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 27 |  |  | 27 |  | 66162 | use strict; | 
|  | 27 |  |  |  |  | 57 |  | 
|  | 27 |  |  |  |  | 786 |  | 
| 4 | 27 |  |  | 27 |  | 127 | use warnings; | 
|  | 27 |  |  |  |  | 44 |  | 
|  | 27 |  |  |  |  | 652 |  | 
| 5 |  |  |  |  |  |  |  | 
| 6 | 27 |  |  | 27 |  | 187 | use Carp; | 
|  | 27 |  |  |  |  | 58 |  | 
|  | 27 |  |  |  |  | 11421 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 |  |  |  |  |  |  | our @ISA = qw(); | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | require WARC; *WARC::Index::VERSION = \$WARC::VERSION; | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | =head1 NAME | 
| 13 |  |  |  |  |  |  |  | 
| 14 |  |  |  |  |  |  | WARC::Index - base class for WARC index classes | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | use WARC::Index::File::CDX;	# or ... | 
| 19 |  |  |  |  |  |  | use WARC::Index::File::SDBM; | 
| 20 |  |  |  |  |  |  | # or some other WARC::Index::File::* implementation | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | $index = attach WARC::Index::File::CDX (...);	# or ... | 
| 23 |  |  |  |  |  |  | $index = attach WARC::Index::File::SDBM (...); | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | $record = $index->search(url => $url, time => $when); | 
| 26 |  |  |  |  |  |  | @records = $index->search(url => $url, time => $when); | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | build WARC::Index::File::CDX (...);	# or ... | 
| 29 |  |  |  |  |  |  | build WARC::Index::File::SDBM (...); | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | C<WARC::Index> is an abstract base class for indexes on WARC files and | 
| 34 |  |  |  |  |  |  | WARC-alike files.  This class establishes the expected interface and | 
| 35 |  |  |  |  |  |  | provides a simple interface for building indexes. | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | =head2 Methods | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | =over | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | =item $index = attach WARC::Index::File::* (...) | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | Construct an index object using the indicated technology and whatever | 
| 44 |  |  |  |  |  |  | parameters the index implementation needs. | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | Typically, indexes are file-based and a single parameter is the name of an | 
| 47 |  |  |  |  |  |  | index file which in turn contains the names of the indexed WARC files. | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | =cut | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | sub attach { | 
| 52 | 1 |  |  | 1 | 1 | 98 | die __PACKAGE__." is an abstract base class and " | 
| 53 |  |  |  |  |  |  | .(shift)." must override the 'attach' method" | 
| 54 |  |  |  |  |  |  | } | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | =item $yes_or_no = $index-E<gt>searchable( $key ) | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | Return true or false to reflect if the index can search for the requested | 
| 59 |  |  |  |  |  |  | key.  Indexes may be able to search for keys that are not present in | 
| 60 |  |  |  |  |  |  | entries returned from those indexes. | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  | See the L<"Search Keys" section|WARC::Collection/"Search Keys"> of the | 
| 63 |  |  |  |  |  |  | C<WARC::Collection> page for details on the implemented search keys. | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | =cut | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | sub searchable { | 
| 68 | 1 |  |  | 1 | 1 | 777 | die __PACKAGE__." is an abstract base class and " | 
| 69 |  |  |  |  |  |  | .(ref shift)." must override the 'searchable' method" | 
| 70 |  |  |  |  |  |  | } | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | =item $record = $index-E<gt>search( ... ) | 
| 73 |  |  |  |  |  |  |  | 
| 74 |  |  |  |  |  |  | =item @records = $index-E<gt>search( ... ) | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | Search an index for records matching parameters.  The C<WARC::Collection> | 
| 77 |  |  |  |  |  |  | class uses this method to search each index in a collection. | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | If none of the requested search keys are searchable, returns an | 
| 80 |  |  |  |  |  |  | undefined value in scalar context and the empty list in list context. | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | The details of the parameters for this method are documented in the | 
| 83 |  |  |  |  |  |  | L<"Search Keys" section|WARC::Collection/"Search Keys"> of the | 
| 84 |  |  |  |  |  |  | C<WARC::Collection> page. | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | =cut | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | sub search { | 
| 89 | 1 |  |  | 1 | 1 | 429 | die __PACKAGE__." is an abstract base class and " | 
| 90 |  |  |  |  |  |  | .(ref shift)." must override the 'search' method" | 
| 91 |  |  |  |  |  |  | } | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | =item build WARC::Index::File::* (into =E<gt> $dest, from =E<gt> ...) | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | =item build WARC::Index::File::* (from =E<gt> [...], into =E<gt> $dest) | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | The C<WARC::Index> base class B<does> provide this method, however.  The | 
| 98 |  |  |  |  |  |  | C<build> method works by loading the corresponding index builder class and | 
| 99 |  |  |  |  |  |  | driving the process or simply returning the newly-constructed object. | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | The C<build> method itself handles the C<from> key for specifying the files | 
| 102 |  |  |  |  |  |  | to index.  The C<from> key can be given an array reference, after which | 
| 103 |  |  |  |  |  |  | more key =E<gt> value pairs may follow, or can simply use the rest of the | 
| 104 |  |  |  |  |  |  | argument list as its value. | 
| 105 |  |  |  |  |  |  |  | 
| 106 |  |  |  |  |  |  | If the C<from> key is given, the C<build> method will read the indicated | 
| 107 |  |  |  |  |  |  | files, construct an index, and return nothing.  If the C<from> key is not | 
| 108 |  |  |  |  |  |  | given, the C<build> method will construct and return an index builder. | 
| 109 |  |  |  |  |  |  |  | 
| 110 |  |  |  |  |  |  | All index builders accept at least the C<into> key for specifying where to | 
| 111 |  |  |  |  |  |  | store the index.  See the documentation for WARC::Index::File::*::Builder | 
| 112 |  |  |  |  |  |  | for more information. | 
| 113 |  |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | =cut | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  | 0 | 1 |  | sub build { | 
| 117 |  |  |  |  |  |  | } | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | =back | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | =head2 Optional Methods | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | Some index systems may also provide these methods: | 
| 124 |  |  |  |  |  |  |  | 
| 125 |  |  |  |  |  |  | =over | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | =item $entry = $index-E<gt>first_entry | 
| 128 |  |  |  |  |  |  |  | 
| 129 |  |  |  |  |  |  | An index that has a sequential ordering may provide this method to obtain | 
| 130 |  |  |  |  |  |  | the first entry in the index.  Indexes that do not have a meaningful | 
| 131 |  |  |  |  |  |  | sequence amongst their entries do not provide this method. | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | =item $entry = $index-E<gt>entry_at( $position ) | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | An index that has a sequential ordering may provide this method to obtain | 
| 136 |  |  |  |  |  |  | an entry at a specified position in the index.  The exact format of the | 
| 137 |  |  |  |  |  |  | position parameter is not specified in general, but should be a value | 
| 138 |  |  |  |  |  |  | previously obtained from the C<entry_position> method on an entry from the | 
| 139 |  |  |  |  |  |  | same index.  Valid positions may be sparse. | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  | =back | 
| 142 |  |  |  |  |  |  |  | 
| 143 |  |  |  |  |  |  | =head2 Index system registration | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  | The C<WARC::Index> package also provides a registry of loaded index | 
| 146 |  |  |  |  |  |  | support.  The C<register> function adds the calling package to the list. | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | =cut | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  | # Array of arrays listing index implementations and filename patterns. | 
| 151 |  |  |  |  |  |  | #  Each element:  [ Package => qr/pattern1/, qr/pattern2/, ... ] | 
| 152 |  |  |  |  |  |  | our @Index_Handlers = (); | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | =over | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | =item WARC::Index::register( filename =E<gt> $filename_re ) | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | Add the calling package to an internal list of available index handlers. | 
| 159 |  |  |  |  |  |  | The calling package must be a subclass of C<WARC::Index> or this function | 
| 160 |  |  |  |  |  |  | will croak(). | 
| 161 |  |  |  |  |  |  |  | 
| 162 |  |  |  |  |  |  | The C<filename> key indicates that the calling package expects to handle | 
| 163 |  |  |  |  |  |  | index files with names matching the provided regex. | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | =cut | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | sub register { | 
| 168 | 13 |  |  | 13 | 1 | 3241 | my %opt = @_; | 
| 169 | 13 |  |  |  |  | 37 | my $caller = scalar caller; | 
| 170 |  |  |  |  |  |  |  | 
| 171 | 13 | 100 |  |  |  | 382 | croak "WARC::Index implementations must subclass WARC::Index" | 
| 172 |  |  |  |  |  |  | unless $caller->isa('WARC::Index'); | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | croak "WARC::Index implementations must handle a filename pattern" | 
| 175 | 12 | 100 |  |  |  | 290 | unless $opt{filename}; | 
| 176 |  |  |  |  |  |  |  | 
| 177 | 10 |  |  |  |  | 28 | foreach my $row (grep {$_->[0] eq $caller} @Index_Handlers) { | 
|  | 8 |  |  |  |  | 23 |  | 
| 178 | 2 |  |  |  |  | 4 | push @$row, $opt{filename};	# add pattern to existing row | 
| 179 |  |  |  |  |  |  | return # ensure that there will be at most one row per package | 
| 180 | 2 |  |  |  |  | 6 | } | 
| 181 | 8 |  |  |  |  | 25 | push @Index_Handlers, [$caller => $opt{filename}]; | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | return # nothing | 
| 184 | 8 |  |  |  |  | 25 | } | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | =item WARC::Index::find_handler( $filename ) | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  | Return the registered handler for $filename or undef if none match.  If | 
| 189 |  |  |  |  |  |  | multiple handlers match, which one is returned is unspecified. | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | =cut | 
| 192 |  |  |  |  |  |  |  | 
| 193 |  |  |  |  |  |  | sub find_handler { | 
| 194 | 20 |  |  | 20 | 1 | 948 | my $filename = shift; | 
| 195 | 20 |  |  |  |  | 43 | my @match = grep {grep {$filename =~ $_} @$_[1..$#$_]} @Index_Handlers; | 
|  | 43 |  |  |  |  | 96 |  | 
|  | 55 |  |  |  |  | 272 |  | 
| 196 | 20 | 100 |  |  |  | 64 | return undef unless @match; | 
| 197 | 18 |  |  |  |  | 68 | return $match[0][0]; | 
| 198 |  |  |  |  |  |  | } | 
| 199 |  |  |  |  |  |  |  | 
| 200 |  |  |  |  |  |  | =back | 
| 201 |  |  |  |  |  |  |  | 
| 202 |  |  |  |  |  |  | =cut | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | 1; | 
| 205 |  |  |  |  |  |  | __END__ |