File Coverage

blib/lib/WARC/Index/File/CDX.pm
Criterion Covered Total %
statement 124 124 100.0
branch 53 54 98.1
condition 11 12 100.0
subroutine 15 15 100.0
pod 5 5 100.0
total 208 210 99.5


line stmt bran cond sub pod time code
1             package WARC::Index::File::CDX; # -*- CPerl -*-
2              
3 1     1   73069 use strict;
  1         15  
  1         30  
4 1     1   5 use warnings;
  1         3  
  1         80  
5              
6             require WARC::Index;
7             our @ISA = qw(WARC::Index);
8              
9             require WARC; *WARC::Index::File::CDX::VERSION = \$WARC::VERSION;
10              
11 1     1   8 use Carp;
  1         1  
  1         54  
12 1     1   14 use File::Spec;
  1         3  
  1         35  
13 1     1   6 use Fcntl 'SEEK_SET';
  1         2  
  1         2062  
14             require File::Spec::Unix;
15              
16             require WARC::Date;
17             require WARC::Index::File::CDX::Entry;
18             require WARC::Volume;
19              
20             WARC::Index::register(filename => qr/[.]cdx$/);
21              
22             our %CDX_Field_Index_Key_Map =
23             (a => 'url', b => 'time', u => 'record_id');
24              
25             our %CDX_Import_Map =
26             (time => sub {
27             my $cdx_date = shift;
28             croak "invalid CDX datestamp"
29             unless $cdx_date =~ m/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/;
30             return WARC::Date->from_string(sprintf('%04d-%02d-%02dT%02d:%02d:%02dZ',
31             $1, $2, $3, $4, $5, $6))},
32             );
33              
34             # This implementation uses a hash as the underlying structure.
35             # Keys defined by this class:
36             #
37             # cdx_file
38             # file name of CDX file
39             # delimiter
40             # delimiter character used in CDX file
41             # fields
42             # array of CDX field codes, in order
43             # field_index
44             # hash mapping CDX field code letter => position
45             # key_index
46             # hash mapping WARC::Index search keys => position
47             # volumes
48             # hash mapping WARC file names => WARC::Volume objects
49              
50             sub _dbg_dump {
51 6     6   47 my $self = shift;
52              
53 6         28 my $out = __PACKAGE__ . "\n in " . $self->{cdx_file} . "\n";
54             $out .= sprintf ' delimiter ASCII %d/%d ',
55 6         53 (ord $self->{delimiter})/16, (ord $self->{delimiter})%16;
56 6         13 $out .= ' '.join(' ', 'CDX', @{$self->{fields}})."\n";
  6         26  
57              
58 6         31 return $out;
59             }
60              
61             sub attach {
62 13     13 1 7933 my $class = shift;
63 13         29 my $cdx_file = shift;
64              
65 13         51 local $/ = "\012"; # ASCII 0/10 LF
66 13 100       805 open my $cdx, '<', $cdx_file or croak "$cdx_file: $!";
67 12         142 binmode $cdx, ':raw';
68              
69 12         192 my $header = <$cdx>;
70 12 100       363 croak "could not read CDX header line from $cdx_file: $!"
71             unless defined $header;
72              
73 11         24 chomp $header;
74 11         33 my $delimiter = substr $header, 0, 1;
75 11 100 100     467 croak "no CDX marker found in $cdx_file"
76             unless 'CDX' eq substr $header, 1, 3
77             and $delimiter eq substr $header, 4, 1;
78              
79 9         81 my @fields = split /\Q$delimiter/, substr $header, 5;
80              
81 9         37 my %field_index = map {$fields[$_] => $_} 0 .. $#fields;
  120         256  
82             my %key_index =
83 10         33 map {$CDX_Field_Index_Key_Map{$fields[$_]} => $_}
84 9         73 grep defined $CDX_Field_Index_Key_Map{$fields[$_]}, 0 .. $#fields;
85              
86             croak "CDX file $cdx_file does not index WARC file name"
87 9 100       237 unless defined $field_index{g};
88             croak "CDX file $cdx_file does not index record offset"
89 8 100 100     238 unless defined $field_index{v} or defined $field_index{V};
90              
91 7         187 bless {cdx_file => $cdx_file, delimiter => $delimiter, fields => \@fields,
92             field_index => \%field_index, key_index => \%key_index}, $class
93             }
94              
95             sub _get_volume {
96 83     83   121 my $self = shift;
97 83         120 my $name = shift;
98              
99 83 100       392 return $self->{volumes}{$name} if defined $self->{volumes}{$name};
100              
101             # otherwise...
102 6         115 my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{cdx_file});
103 6         44 my @cdx_dirs = File::Spec->splitdir($cdx_dirs);
104 6         60 my ($wvol, $warc_dirs, $warc_file) = File::Spec::Unix->splitpath($name);
105 6         23 my @warc_dirs = File::Spec::Unix->splitdir($warc_dirs);
106 6         101 my $warcfilename =
107             File::Spec->catpath($vol, File::Spec->catdir(@cdx_dirs,
108             @warc_dirs), $warc_file);
109 6         41 return $self->{volumes}{$name} = mount WARC::Volume ($warcfilename);
110             }
111              
112             sub _parse_cdx_entry {
113 95     95   176 my $self = shift;
114 95         129 my $pos = shift;
115 95         146 my $entry = shift;
116             # uncoverable condition right
117 95   66     330 my $entry_length = $entry && length $entry;
118              
119 95 100       282 return undef unless $entry_length; # as occurs at end-of-file
120              
121 83         144 chomp $entry;
122 83         386 my @fields = split /\Q$self->{delimiter}/, $entry;
123 83         179 my $volname = $fields[$self->{field_index}{g}];
124             my %entry =
125             ( _index => $self, _entry_offset => $pos, _entry_length => $entry_length,
126             _g__volume => $self->_get_volume($volname),
127             _Vv__record_offset =>
128             $fields[$self->{field_index}{($volname =~ m/[.]w?arc$/ ? 'v' : 'V')}],
129             map {$_ => (defined $CDX_Import_Map{$_}
130             ? $CDX_Import_Map{$_}->($fields[$self->{key_index}{$_}])
131 83 100       179 : $fields[$self->{key_index}{$_}])} keys %{$self->{key_index}}
  71 100       381  
  83         270  
132             );
133              
134 83         738 bless \%entry, (ref $self).'::Entry';
135             }
136              
137             sub searchable {
138 16     16 1 1807 my $self = shift;
139 16         30 my $key = shift;
140              
141 16 100       59 return defined $self->{key_index}{url} if $key eq 'url_prefix';
142 12         63 return defined $self->{key_index}{$key};
143             }
144              
145             sub _search_all {
146 14     14   18 my $self = shift;
147              
148 14         57 local $/ = "\012"; # ASCII 0/10 LF
149 14         29 my @results = ();
150              
151 14 100       712 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
152 13         101 binmode $cdx, ':raw';
153              
154 13         170 my $header = <$cdx>; # skip header to reach first entry
155 13         48 my $offset = tell $cdx;
156 13         54 my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
157 13 100       59 return () unless defined $entry->distance(@_);
158              
159 7         17 while (defined $entry) {
160 28 100       73 push @results, $entry unless 0 > $entry->distance(@_);
161 28         54 $offset = tell $cdx; # ... and advance ...
162 28         155 $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
163             }
164              
165 7         132 return @results;
166             }
167              
168             sub _search_best_match {
169 18     18   31 my $self = shift;
170              
171 18         71 local $/ = "\012"; # ASCII 0/10 LF
172 18         28 my $result = undef;
173 18         28 my $result_distance = -1;
174              
175 18 100       953 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
176 17         121 binmode $cdx, ':raw';
177              
178 17         218 my $header = <$cdx>; # skip header to reach first entry
179 17         57 my $offset = tell $cdx;
180 17         70 my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
181 17 100       68 return undef unless defined $entry->distance(@_);
182              
183 11         26 while (defined $entry) {
184 31         78 my $distance = $entry->distance(@_);
185 31 100       67 unless (0 > $distance) {
186 19 100 100     63 if ($result_distance < 0 # first match found
187             or $distance < $result_distance) # or better match found
188 16         36 { $result = $entry; $result_distance = $distance }
  16         27  
189             }
190 31 100       249 return $result if $result_distance == 0; # no better match possible
191 22         41 $offset = tell $cdx; # ... and advance ...
192 22         91 $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>);
193             }
194              
195 2         40 return $result;
196             }
197              
198             sub search {
199 35     35 1 8048 my $self = shift;
200              
201 35 100       102 unless (defined wantarray)
202 1         199 { carp "calling 'search' method in void context"; return }
  1         79  
203              
204 34 100       187 croak "no arguments given to 'search' method"
205             unless scalar @_;
206 33 100       191 croak "odd number of arguments given to 'search' method"
207             if scalar @_ % 2;
208              
209 32 100       68 if (wantarray) { return $self->_search_all(@_) }
  14         42  
210 18         50 else { return $self->_search_best_match(@_) }
211             }
212              
213             sub first_entry {
214 4     4 1 428 my $self = shift;
215              
216 4         16 local $/ = "\012"; # ASCII 0/10 LF
217              
218 4 100       273 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
219 3         22 binmode $cdx, ':raw';
220              
221 3         37 my $header = <$cdx>; # skip header to reach first entry
222 3         12 my $offset = tell $cdx;
223 3         14 return $self->_parse_cdx_entry($offset, scalar <$cdx>);
224             }
225              
226             sub entry_at {
227 15     15 1 1301 my $self = shift;
228 15         23 my $offset = shift;
229              
230 15         62 local $/ = "\012"; # ASCII 0/10 LF
231              
232 15 100       765 open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!";
233 14         137 binmode $cdx, ':raw';
234              
235             # one octet before requested position must be an end-of-line marker
236 14 100       245 seek $cdx, $offset - 1, SEEK_SET or croak "seek $self->{cdx_file}: $!";
237 13 50       34 my $eol; defined(read $cdx, $eol, 1) or croak "read $self->{cdx_file}: $!";
  13         201  
238 13 100       204 croak "offset $offset in $self->{cdx_file} not a record boundary"
239             unless $eol eq $/;
240              
241 12         77 return $self->_parse_cdx_entry($offset, scalar <$cdx>);
242             }
243              
244             1;
245             __END__