line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WARC::Index::File::CDX; # -*- CPerl -*- |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
73069
|
use strict; |
|
1
|
|
|
|
|
15
|
|
|
1
|
|
|
|
|
30
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
80
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require WARC::Index; |
7
|
|
|
|
|
|
|
our @ISA = qw(WARC::Index); |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require WARC; *WARC::Index::File::CDX::VERSION = \$WARC::VERSION; |
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
8
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
54
|
|
12
|
1
|
|
|
1
|
|
14
|
use File::Spec; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
35
|
|
13
|
1
|
|
|
1
|
|
6
|
use Fcntl 'SEEK_SET'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
2062
|
|
14
|
|
|
|
|
|
|
require File::Spec::Unix; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
require WARC::Date; |
17
|
|
|
|
|
|
|
require WARC::Index::File::CDX::Entry; |
18
|
|
|
|
|
|
|
require WARC::Volume; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
WARC::Index::register(filename => qr/[.]cdx$/); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our %CDX_Field_Index_Key_Map = |
23
|
|
|
|
|
|
|
(a => 'url', b => 'time', u => 'record_id'); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
our %CDX_Import_Map = |
26
|
|
|
|
|
|
|
(time => sub { |
27
|
|
|
|
|
|
|
my $cdx_date = shift; |
28
|
|
|
|
|
|
|
croak "invalid CDX datestamp" |
29
|
|
|
|
|
|
|
unless $cdx_date =~ m/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/; |
30
|
|
|
|
|
|
|
return WARC::Date->from_string(sprintf('%04d-%02d-%02dT%02d:%02d:%02dZ', |
31
|
|
|
|
|
|
|
$1, $2, $3, $4, $5, $6))}, |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# This implementation uses a hash as the underlying structure. |
35
|
|
|
|
|
|
|
# Keys defined by this class: |
36
|
|
|
|
|
|
|
# |
37
|
|
|
|
|
|
|
# cdx_file |
38
|
|
|
|
|
|
|
# file name of CDX file |
39
|
|
|
|
|
|
|
# delimiter |
40
|
|
|
|
|
|
|
# delimiter character used in CDX file |
41
|
|
|
|
|
|
|
# fields |
42
|
|
|
|
|
|
|
# array of CDX field codes, in order |
43
|
|
|
|
|
|
|
# field_index |
44
|
|
|
|
|
|
|
# hash mapping CDX field code letter => position |
45
|
|
|
|
|
|
|
# key_index |
46
|
|
|
|
|
|
|
# hash mapping WARC::Index search keys => position |
47
|
|
|
|
|
|
|
# volumes |
48
|
|
|
|
|
|
|
# hash mapping WARC file names => WARC::Volume objects |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _dbg_dump { |
51
|
6
|
|
|
6
|
|
47
|
my $self = shift; |
52
|
|
|
|
|
|
|
|
53
|
6
|
|
|
|
|
28
|
my $out = __PACKAGE__ . "\n in " . $self->{cdx_file} . "\n"; |
54
|
|
|
|
|
|
|
$out .= sprintf ' delimiter ASCII %d/%d ', |
55
|
6
|
|
|
|
|
53
|
(ord $self->{delimiter})/16, (ord $self->{delimiter})%16; |
56
|
6
|
|
|
|
|
13
|
$out .= ' '.join(' ', 'CDX', @{$self->{fields}})."\n"; |
|
6
|
|
|
|
|
26
|
|
57
|
|
|
|
|
|
|
|
58
|
6
|
|
|
|
|
31
|
return $out; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub attach { |
62
|
13
|
|
|
13
|
1
|
7933
|
my $class = shift; |
63
|
13
|
|
|
|
|
29
|
my $cdx_file = shift; |
64
|
|
|
|
|
|
|
|
65
|
13
|
|
|
|
|
51
|
local $/ = "\012"; # ASCII 0/10 LF |
66
|
13
|
100
|
|
|
|
805
|
open my $cdx, '<', $cdx_file or croak "$cdx_file: $!"; |
67
|
12
|
|
|
|
|
142
|
binmode $cdx, ':raw'; |
68
|
|
|
|
|
|
|
|
69
|
12
|
|
|
|
|
192
|
my $header = <$cdx>; |
70
|
12
|
100
|
|
|
|
363
|
croak "could not read CDX header line from $cdx_file: $!" |
71
|
|
|
|
|
|
|
unless defined $header; |
72
|
|
|
|
|
|
|
|
73
|
11
|
|
|
|
|
24
|
chomp $header; |
74
|
11
|
|
|
|
|
33
|
my $delimiter = substr $header, 0, 1; |
75
|
11
|
100
|
100
|
|
|
467
|
croak "no CDX marker found in $cdx_file" |
76
|
|
|
|
|
|
|
unless 'CDX' eq substr $header, 1, 3 |
77
|
|
|
|
|
|
|
and $delimiter eq substr $header, 4, 1; |
78
|
|
|
|
|
|
|
|
79
|
9
|
|
|
|
|
81
|
my @fields = split /\Q$delimiter/, substr $header, 5; |
80
|
|
|
|
|
|
|
|
81
|
9
|
|
|
|
|
37
|
my %field_index = map {$fields[$_] => $_} 0 .. $#fields; |
|
120
|
|
|
|
|
256
|
|
82
|
|
|
|
|
|
|
my %key_index = |
83
|
10
|
|
|
|
|
33
|
map {$CDX_Field_Index_Key_Map{$fields[$_]} => $_} |
84
|
9
|
|
|
|
|
73
|
grep defined $CDX_Field_Index_Key_Map{$fields[$_]}, 0 .. $#fields; |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
croak "CDX file $cdx_file does not index WARC file name" |
87
|
9
|
100
|
|
|
|
237
|
unless defined $field_index{g}; |
88
|
|
|
|
|
|
|
croak "CDX file $cdx_file does not index record offset" |
89
|
8
|
100
|
100
|
|
|
238
|
unless defined $field_index{v} or defined $field_index{V}; |
90
|
|
|
|
|
|
|
|
91
|
7
|
|
|
|
|
187
|
bless {cdx_file => $cdx_file, delimiter => $delimiter, fields => \@fields, |
92
|
|
|
|
|
|
|
field_index => \%field_index, key_index => \%key_index}, $class |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub _get_volume { |
96
|
83
|
|
|
83
|
|
121
|
my $self = shift; |
97
|
83
|
|
|
|
|
120
|
my $name = shift; |
98
|
|
|
|
|
|
|
|
99
|
83
|
100
|
|
|
|
392
|
return $self->{volumes}{$name} if defined $self->{volumes}{$name}; |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# otherwise... |
102
|
6
|
|
|
|
|
115
|
my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{cdx_file}); |
103
|
6
|
|
|
|
|
44
|
my @cdx_dirs = File::Spec->splitdir($cdx_dirs); |
104
|
6
|
|
|
|
|
60
|
my ($wvol, $warc_dirs, $warc_file) = File::Spec::Unix->splitpath($name); |
105
|
6
|
|
|
|
|
23
|
my @warc_dirs = File::Spec::Unix->splitdir($warc_dirs); |
106
|
6
|
|
|
|
|
101
|
my $warcfilename = |
107
|
|
|
|
|
|
|
File::Spec->catpath($vol, File::Spec->catdir(@cdx_dirs, |
108
|
|
|
|
|
|
|
@warc_dirs), $warc_file); |
109
|
6
|
|
|
|
|
41
|
return $self->{volumes}{$name} = mount WARC::Volume ($warcfilename); |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub _parse_cdx_entry { |
113
|
95
|
|
|
95
|
|
176
|
my $self = shift; |
114
|
95
|
|
|
|
|
129
|
my $pos = shift; |
115
|
95
|
|
|
|
|
146
|
my $entry = shift; |
116
|
|
|
|
|
|
|
# uncoverable condition right |
117
|
95
|
|
66
|
|
|
330
|
my $entry_length = $entry && length $entry; |
118
|
|
|
|
|
|
|
|
119
|
95
|
100
|
|
|
|
282
|
return undef unless $entry_length; # as occurs at end-of-file |
120
|
|
|
|
|
|
|
|
121
|
83
|
|
|
|
|
144
|
chomp $entry; |
122
|
83
|
|
|
|
|
386
|
my @fields = split /\Q$self->{delimiter}/, $entry; |
123
|
83
|
|
|
|
|
179
|
my $volname = $fields[$self->{field_index}{g}]; |
124
|
|
|
|
|
|
|
my %entry = |
125
|
|
|
|
|
|
|
( _index => $self, _entry_offset => $pos, _entry_length => $entry_length, |
126
|
|
|
|
|
|
|
_g__volume => $self->_get_volume($volname), |
127
|
|
|
|
|
|
|
_Vv__record_offset => |
128
|
|
|
|
|
|
|
$fields[$self->{field_index}{($volname =~ m/[.]w?arc$/ ? 'v' : 'V')}], |
129
|
|
|
|
|
|
|
map {$_ => (defined $CDX_Import_Map{$_} |
130
|
|
|
|
|
|
|
? $CDX_Import_Map{$_}->($fields[$self->{key_index}{$_}]) |
131
|
83
|
100
|
|
|
|
179
|
: $fields[$self->{key_index}{$_}])} keys %{$self->{key_index}} |
|
71
|
100
|
|
|
|
381
|
|
|
83
|
|
|
|
|
270
|
|
132
|
|
|
|
|
|
|
); |
133
|
|
|
|
|
|
|
|
134
|
83
|
|
|
|
|
738
|
bless \%entry, (ref $self).'::Entry'; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub searchable { |
138
|
16
|
|
|
16
|
1
|
1807
|
my $self = shift; |
139
|
16
|
|
|
|
|
30
|
my $key = shift; |
140
|
|
|
|
|
|
|
|
141
|
16
|
100
|
|
|
|
59
|
return defined $self->{key_index}{url} if $key eq 'url_prefix'; |
142
|
12
|
|
|
|
|
63
|
return defined $self->{key_index}{$key}; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub _search_all { |
146
|
14
|
|
|
14
|
|
18
|
my $self = shift; |
147
|
|
|
|
|
|
|
|
148
|
14
|
|
|
|
|
57
|
local $/ = "\012"; # ASCII 0/10 LF |
149
|
14
|
|
|
|
|
29
|
my @results = (); |
150
|
|
|
|
|
|
|
|
151
|
14
|
100
|
|
|
|
712
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
152
|
13
|
|
|
|
|
101
|
binmode $cdx, ':raw'; |
153
|
|
|
|
|
|
|
|
154
|
13
|
|
|
|
|
170
|
my $header = <$cdx>; # skip header to reach first entry |
155
|
13
|
|
|
|
|
48
|
my $offset = tell $cdx; |
156
|
13
|
|
|
|
|
54
|
my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
157
|
13
|
100
|
|
|
|
59
|
return () unless defined $entry->distance(@_); |
158
|
|
|
|
|
|
|
|
159
|
7
|
|
|
|
|
17
|
while (defined $entry) { |
160
|
28
|
100
|
|
|
|
73
|
push @results, $entry unless 0 > $entry->distance(@_); |
161
|
28
|
|
|
|
|
54
|
$offset = tell $cdx; # ... and advance ... |
162
|
28
|
|
|
|
|
155
|
$entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
7
|
|
|
|
|
132
|
return @results; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
sub _search_best_match { |
169
|
18
|
|
|
18
|
|
31
|
my $self = shift; |
170
|
|
|
|
|
|
|
|
171
|
18
|
|
|
|
|
71
|
local $/ = "\012"; # ASCII 0/10 LF |
172
|
18
|
|
|
|
|
28
|
my $result = undef; |
173
|
18
|
|
|
|
|
28
|
my $result_distance = -1; |
174
|
|
|
|
|
|
|
|
175
|
18
|
100
|
|
|
|
953
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
176
|
17
|
|
|
|
|
121
|
binmode $cdx, ':raw'; |
177
|
|
|
|
|
|
|
|
178
|
17
|
|
|
|
|
218
|
my $header = <$cdx>; # skip header to reach first entry |
179
|
17
|
|
|
|
|
57
|
my $offset = tell $cdx; |
180
|
17
|
|
|
|
|
70
|
my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
181
|
17
|
100
|
|
|
|
68
|
return undef unless defined $entry->distance(@_); |
182
|
|
|
|
|
|
|
|
183
|
11
|
|
|
|
|
26
|
while (defined $entry) { |
184
|
31
|
|
|
|
|
78
|
my $distance = $entry->distance(@_); |
185
|
31
|
100
|
|
|
|
67
|
unless (0 > $distance) { |
186
|
19
|
100
|
100
|
|
|
63
|
if ($result_distance < 0 # first match found |
187
|
|
|
|
|
|
|
or $distance < $result_distance) # or better match found |
188
|
16
|
|
|
|
|
36
|
{ $result = $entry; $result_distance = $distance } |
|
16
|
|
|
|
|
27
|
|
189
|
|
|
|
|
|
|
} |
190
|
31
|
100
|
|
|
|
249
|
return $result if $result_distance == 0; # no better match possible |
191
|
22
|
|
|
|
|
41
|
$offset = tell $cdx; # ... and advance ... |
192
|
22
|
|
|
|
|
91
|
$entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
2
|
|
|
|
|
40
|
return $result; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub search { |
199
|
35
|
|
|
35
|
1
|
8048
|
my $self = shift; |
200
|
|
|
|
|
|
|
|
201
|
35
|
100
|
|
|
|
102
|
unless (defined wantarray) |
202
|
1
|
|
|
|
|
199
|
{ carp "calling 'search' method in void context"; return } |
|
1
|
|
|
|
|
79
|
|
203
|
|
|
|
|
|
|
|
204
|
34
|
100
|
|
|
|
187
|
croak "no arguments given to 'search' method" |
205
|
|
|
|
|
|
|
unless scalar @_; |
206
|
33
|
100
|
|
|
|
191
|
croak "odd number of arguments given to 'search' method" |
207
|
|
|
|
|
|
|
if scalar @_ % 2; |
208
|
|
|
|
|
|
|
|
209
|
32
|
100
|
|
|
|
68
|
if (wantarray) { return $self->_search_all(@_) } |
|
14
|
|
|
|
|
42
|
|
210
|
18
|
|
|
|
|
50
|
else { return $self->_search_best_match(@_) } |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub first_entry { |
214
|
4
|
|
|
4
|
1
|
428
|
my $self = shift; |
215
|
|
|
|
|
|
|
|
216
|
4
|
|
|
|
|
16
|
local $/ = "\012"; # ASCII 0/10 LF |
217
|
|
|
|
|
|
|
|
218
|
4
|
100
|
|
|
|
273
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
219
|
3
|
|
|
|
|
22
|
binmode $cdx, ':raw'; |
220
|
|
|
|
|
|
|
|
221
|
3
|
|
|
|
|
37
|
my $header = <$cdx>; # skip header to reach first entry |
222
|
3
|
|
|
|
|
12
|
my $offset = tell $cdx; |
223
|
3
|
|
|
|
|
14
|
return $self->_parse_cdx_entry($offset, scalar <$cdx>); |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub entry_at { |
227
|
15
|
|
|
15
|
1
|
1301
|
my $self = shift; |
228
|
15
|
|
|
|
|
23
|
my $offset = shift; |
229
|
|
|
|
|
|
|
|
230
|
15
|
|
|
|
|
62
|
local $/ = "\012"; # ASCII 0/10 LF |
231
|
|
|
|
|
|
|
|
232
|
15
|
100
|
|
|
|
765
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
233
|
14
|
|
|
|
|
137
|
binmode $cdx, ':raw'; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
# one octet before requested position must be an end-of-line marker |
236
|
14
|
100
|
|
|
|
245
|
seek $cdx, $offset - 1, SEEK_SET or croak "seek $self->{cdx_file}: $!"; |
237
|
13
|
50
|
|
|
|
34
|
my $eol; defined(read $cdx, $eol, 1) or croak "read $self->{cdx_file}: $!"; |
|
13
|
|
|
|
|
201
|
|
238
|
13
|
100
|
|
|
|
204
|
croak "offset $offset in $self->{cdx_file} not a record boundary" |
239
|
|
|
|
|
|
|
unless $eol eq $/; |
240
|
|
|
|
|
|
|
|
241
|
12
|
|
|
|
|
77
|
return $self->_parse_cdx_entry($offset, scalar <$cdx>); |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
1; |
245
|
|
|
|
|
|
|
__END__ |