| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WARC::Index::File::CDX; # -*- CPerl -*- |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
70375
|
use strict; |
|
|
1
|
|
|
|
|
13
|
|
|
|
1
|
|
|
|
|
32
|
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
68
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require WARC::Index; |
|
7
|
|
|
|
|
|
|
our @ISA = qw(WARC::Index); |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require WARC; *WARC::Index::File::CDX::VERSION = \$WARC::VERSION; |
|
10
|
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
6
|
use Carp; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
69
|
|
|
12
|
1
|
|
|
1
|
|
7
|
use File::Spec; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
25
|
|
|
13
|
1
|
|
|
1
|
|
21
|
use Fcntl 'SEEK_SET'; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
2479
|
|
|
14
|
|
|
|
|
|
|
require File::Spec::Unix; |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
require WARC::Date; |
|
17
|
|
|
|
|
|
|
require WARC::Index::File::CDX::Entry; |
|
18
|
|
|
|
|
|
|
require WARC::Volume; |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
WARC::Index::register(filename => qr/[.]cdx$/); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our %CDX_Field_Index_Key_Map = |
|
23
|
|
|
|
|
|
|
(a => 'url', b => 'time', u => 'record_id'); |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
our %CDX_Import_Map = |
|
26
|
|
|
|
|
|
|
(time => sub { |
|
27
|
|
|
|
|
|
|
my $cdx_date = shift; |
|
28
|
|
|
|
|
|
|
croak "invalid CDX datestamp" |
|
29
|
|
|
|
|
|
|
unless $cdx_date =~ m/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/; |
|
30
|
|
|
|
|
|
|
return WARC::Date->from_string(sprintf('%04d-%02d-%02dT%02d:%02d:%02dZ', |
|
31
|
|
|
|
|
|
|
$1, $2, $3, $4, $5, $6))}, |
|
32
|
|
|
|
|
|
|
); |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# This implementation uses a hash as the underlying structure. |
|
35
|
|
|
|
|
|
|
# Keys defined by this class: |
|
36
|
|
|
|
|
|
|
# |
|
37
|
|
|
|
|
|
|
# cdx_file |
|
38
|
|
|
|
|
|
|
# file name of CDX file |
|
39
|
|
|
|
|
|
|
# delimiter |
|
40
|
|
|
|
|
|
|
# delimiter character used in CDX file |
|
41
|
|
|
|
|
|
|
# fields |
|
42
|
|
|
|
|
|
|
# array of CDX field codes, in order |
|
43
|
|
|
|
|
|
|
# field_index |
|
44
|
|
|
|
|
|
|
# hash mapping CDX field code letter => position |
|
45
|
|
|
|
|
|
|
# key_index |
|
46
|
|
|
|
|
|
|
# hash mapping WARC::Index search keys => position |
|
47
|
|
|
|
|
|
|
# volumes |
|
48
|
|
|
|
|
|
|
# hash mapping WARC file names => WARC::Volume objects |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _dbg_dump { |
|
51
|
6
|
|
|
6
|
|
44
|
my $self = shift; |
|
52
|
|
|
|
|
|
|
|
|
53
|
6
|
|
|
|
|
30
|
my $out = __PACKAGE__ . "\n in " . $self->{cdx_file} . "\n"; |
|
54
|
|
|
|
|
|
|
$out .= sprintf ' delimiter ASCII %d/%d ', |
|
55
|
6
|
|
|
|
|
53
|
(ord $self->{delimiter})/16, (ord $self->{delimiter})%16; |
|
56
|
6
|
|
|
|
|
16
|
$out .= ' '.join(' ', 'CDX', @{$self->{fields}})."\n"; |
|
|
6
|
|
|
|
|
21
|
|
|
57
|
|
|
|
|
|
|
|
|
58
|
6
|
|
|
|
|
32
|
return $out; |
|
59
|
|
|
|
|
|
|
} |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub attach { |
|
62
|
13
|
|
|
13
|
1
|
7271
|
my $class = shift; |
|
63
|
13
|
|
|
|
|
26
|
my $cdx_file = shift; |
|
64
|
|
|
|
|
|
|
|
|
65
|
13
|
|
|
|
|
50
|
local $/ = "\012"; # ASCII 0/10 LF |
|
66
|
13
|
100
|
|
|
|
782
|
open my $cdx, '<', $cdx_file or croak "$cdx_file: $!"; |
|
67
|
12
|
|
|
|
|
84
|
binmode $cdx, ':raw'; |
|
68
|
|
|
|
|
|
|
|
|
69
|
12
|
|
|
|
|
190
|
my $header = <$cdx>; |
|
70
|
12
|
100
|
|
|
|
224
|
croak "could not read CDX header line from $cdx_file: $!" |
|
71
|
|
|
|
|
|
|
unless defined $header; |
|
72
|
|
|
|
|
|
|
|
|
73
|
11
|
|
|
|
|
85
|
chomp $header; |
|
74
|
11
|
|
|
|
|
46
|
my $delimiter = substr $header, 0, 1; |
|
75
|
11
|
100
|
100
|
|
|
513
|
croak "no CDX marker found in $cdx_file" |
|
76
|
|
|
|
|
|
|
unless 'CDX' eq substr $header, 1, 3 |
|
77
|
|
|
|
|
|
|
and $delimiter eq substr $header, 4, 1; |
|
78
|
|
|
|
|
|
|
|
|
79
|
9
|
|
|
|
|
79
|
my @fields = split /\Q$delimiter/, substr $header, 5; |
|
80
|
|
|
|
|
|
|
|
|
81
|
9
|
|
|
|
|
32
|
my %field_index = map {$fields[$_] => $_} 0 .. $#fields; |
|
|
120
|
|
|
|
|
254
|
|
|
82
|
|
|
|
|
|
|
my %key_index = |
|
83
|
10
|
|
|
|
|
31
|
map {$CDX_Field_Index_Key_Map{$fields[$_]} => $_} |
|
84
|
9
|
|
|
|
|
72
|
grep defined $CDX_Field_Index_Key_Map{$fields[$_]}, 0 .. $#fields; |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
croak "CDX file $cdx_file does not index WARC file name" |
|
87
|
9
|
100
|
|
|
|
216
|
unless defined $field_index{g}; |
|
88
|
|
|
|
|
|
|
croak "CDX file $cdx_file does not index record offset" |
|
89
|
8
|
100
|
100
|
|
|
237
|
unless defined $field_index{v} or defined $field_index{V}; |
|
90
|
|
|
|
|
|
|
|
|
91
|
7
|
|
|
|
|
217
|
bless {cdx_file => $cdx_file, delimiter => $delimiter, fields => \@fields, |
|
92
|
|
|
|
|
|
|
field_index => \%field_index, key_index => \%key_index}, $class |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub _get_volume { |
|
96
|
83
|
|
|
83
|
|
118
|
my $self = shift; |
|
97
|
83
|
|
|
|
|
123
|
my $name = shift; |
|
98
|
|
|
|
|
|
|
|
|
99
|
83
|
100
|
|
|
|
394
|
return $self->{volumes}{$name} if defined $self->{volumes}{$name}; |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
# otherwise... |
|
102
|
6
|
|
|
|
|
125
|
my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{cdx_file}); |
|
103
|
6
|
|
|
|
|
48
|
my @cdx_dirs = File::Spec->splitdir($cdx_dirs); |
|
104
|
6
|
|
|
|
|
61
|
my ($wvol, $warc_dirs, $warc_file) = File::Spec::Unix->splitpath($name); |
|
105
|
6
|
|
|
|
|
24
|
my @warc_dirs = File::Spec::Unix->splitdir($warc_dirs); |
|
106
|
6
|
|
|
|
|
90
|
my $warcfilename = |
|
107
|
|
|
|
|
|
|
File::Spec->catpath($vol, File::Spec->catdir(@cdx_dirs, |
|
108
|
|
|
|
|
|
|
@warc_dirs), $warc_file); |
|
109
|
6
|
|
|
|
|
39
|
return $self->{volumes}{$name} = mount WARC::Volume ($warcfilename); |
|
110
|
|
|
|
|
|
|
} |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub _parse_cdx_entry { |
|
113
|
95
|
|
|
95
|
|
187
|
my $self = shift; |
|
114
|
95
|
|
|
|
|
149
|
my $pos = shift; |
|
115
|
95
|
|
|
|
|
164
|
my $entry = shift; |
|
116
|
|
|
|
|
|
|
# uncoverable condition right |
|
117
|
95
|
|
66
|
|
|
311
|
my $entry_length = $entry && length $entry; |
|
118
|
|
|
|
|
|
|
|
|
119
|
95
|
100
|
|
|
|
273
|
return undef unless $entry_length; # as occurs at end-of-file |
|
120
|
|
|
|
|
|
|
|
|
121
|
83
|
|
|
|
|
129
|
chomp $entry; |
|
122
|
83
|
|
|
|
|
377
|
my @fields = split /\Q$self->{delimiter}/, $entry; |
|
123
|
83
|
|
|
|
|
195
|
my $volname = $fields[$self->{field_index}{g}]; |
|
124
|
|
|
|
|
|
|
my %entry = |
|
125
|
|
|
|
|
|
|
( _index => $self, _entry_offset => $pos, _entry_length => $entry_length, |
|
126
|
|
|
|
|
|
|
_g__volume => $self->_get_volume($volname), |
|
127
|
|
|
|
|
|
|
_Vv__record_offset => |
|
128
|
|
|
|
|
|
|
$fields[$self->{field_index}{($volname =~ m/[.]w?arc$/ ? 'v' : 'V')}], |
|
129
|
|
|
|
|
|
|
map {$_ => (defined $CDX_Import_Map{$_} |
|
130
|
|
|
|
|
|
|
? $CDX_Import_Map{$_}->($fields[$self->{key_index}{$_}]) |
|
131
|
83
|
100
|
|
|
|
181
|
: $fields[$self->{key_index}{$_}])} keys %{$self->{key_index}} |
|
|
71
|
100
|
|
|
|
366
|
|
|
|
83
|
|
|
|
|
280
|
|
|
132
|
|
|
|
|
|
|
); |
|
133
|
|
|
|
|
|
|
|
|
134
|
83
|
|
|
|
|
684
|
bless \%entry, (ref $self).'::Entry'; |
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub searchable { |
|
138
|
16
|
|
|
16
|
1
|
1260
|
my $self = shift; |
|
139
|
16
|
|
|
|
|
27
|
my $key = shift; |
|
140
|
|
|
|
|
|
|
|
|
141
|
16
|
100
|
|
|
|
60
|
return defined $self->{key_index}{url} if $key eq 'url_prefix'; |
|
142
|
12
|
|
|
|
|
61
|
return defined $self->{key_index}{$key}; |
|
143
|
|
|
|
|
|
|
} |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub _search_all { |
|
146
|
14
|
|
|
14
|
|
24
|
my $self = shift; |
|
147
|
|
|
|
|
|
|
|
|
148
|
14
|
|
|
|
|
51
|
local $/ = "\012"; # ASCII 0/10 LF |
|
149
|
14
|
|
|
|
|
25
|
my @results = (); |
|
150
|
|
|
|
|
|
|
|
|
151
|
14
|
100
|
|
|
|
645
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
|
152
|
13
|
|
|
|
|
89
|
binmode $cdx, ':raw'; |
|
153
|
|
|
|
|
|
|
|
|
154
|
13
|
|
|
|
|
149
|
my $header = <$cdx>; # skip header to reach first entry |
|
155
|
13
|
|
|
|
|
42
|
my $offset = tell $cdx; |
|
156
|
13
|
|
|
|
|
56
|
my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
|
157
|
13
|
100
|
|
|
|
58
|
return () unless defined $entry->distance(@_); |
|
158
|
|
|
|
|
|
|
|
|
159
|
7
|
|
|
|
|
13
|
while (defined $entry) { |
|
160
|
28
|
100
|
|
|
|
67
|
push @results, $entry unless 0 > $entry->distance(@_); |
|
161
|
28
|
|
|
|
|
48
|
$offset = tell $cdx; # ... and advance ... |
|
162
|
28
|
|
|
|
|
145
|
$entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
|
163
|
|
|
|
|
|
|
} |
|
164
|
|
|
|
|
|
|
|
|
165
|
7
|
|
|
|
|
125
|
return @results; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
sub _search_best_match { |
|
169
|
18
|
|
|
18
|
|
30
|
my $self = shift; |
|
170
|
|
|
|
|
|
|
|
|
171
|
18
|
|
|
|
|
64
|
local $/ = "\012"; # ASCII 0/10 LF |
|
172
|
18
|
|
|
|
|
28
|
my $result = undef; |
|
173
|
18
|
|
|
|
|
28
|
my $result_distance = -1; |
|
174
|
|
|
|
|
|
|
|
|
175
|
18
|
100
|
|
|
|
876
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
|
176
|
17
|
|
|
|
|
110
|
binmode $cdx, ':raw'; |
|
177
|
|
|
|
|
|
|
|
|
178
|
17
|
|
|
|
|
206
|
my $header = <$cdx>; # skip header to reach first entry |
|
179
|
17
|
|
|
|
|
53
|
my $offset = tell $cdx; |
|
180
|
17
|
|
|
|
|
69
|
my $entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
|
181
|
17
|
100
|
|
|
|
65
|
return undef unless defined $entry->distance(@_); |
|
182
|
|
|
|
|
|
|
|
|
183
|
11
|
|
|
|
|
22
|
while (defined $entry) { |
|
184
|
31
|
|
|
|
|
79
|
my $distance = $entry->distance(@_); |
|
185
|
31
|
100
|
|
|
|
65
|
unless (0 > $distance) { |
|
186
|
19
|
100
|
100
|
|
|
57
|
if ($result_distance < 0 # first match found |
|
187
|
|
|
|
|
|
|
or $distance < $result_distance) # or better match found |
|
188
|
16
|
|
|
|
|
39
|
{ $result = $entry; $result_distance = $distance } |
|
|
16
|
|
|
|
|
22
|
|
|
189
|
|
|
|
|
|
|
} |
|
190
|
31
|
100
|
|
|
|
231
|
return $result if $result_distance == 0; # no better match possible |
|
191
|
22
|
|
|
|
|
38
|
$offset = tell $cdx; # ... and advance ... |
|
192
|
22
|
|
|
|
|
90
|
$entry = $self->_parse_cdx_entry($offset, scalar <$cdx>); |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
2
|
|
|
|
|
34
|
return $result; |
|
196
|
|
|
|
|
|
|
} |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub search { |
|
199
|
35
|
|
|
35
|
1
|
6547
|
my $self = shift; |
|
200
|
|
|
|
|
|
|
|
|
201
|
35
|
100
|
|
|
|
83
|
unless (defined wantarray) |
|
202
|
1
|
|
|
|
|
207
|
{ carp "calling 'search' method in void context"; return } |
|
|
1
|
|
|
|
|
82
|
|
|
203
|
|
|
|
|
|
|
|
|
204
|
34
|
100
|
|
|
|
192
|
croak "no arguments given to 'search' method" |
|
205
|
|
|
|
|
|
|
unless scalar @_; |
|
206
|
33
|
100
|
|
|
|
187
|
croak "odd number of arguments given to 'search' method" |
|
207
|
|
|
|
|
|
|
if scalar @_ % 2; |
|
208
|
|
|
|
|
|
|
|
|
209
|
32
|
100
|
|
|
|
66
|
if (wantarray) { return $self->_search_all(@_) } |
|
|
14
|
|
|
|
|
34
|
|
|
210
|
18
|
|
|
|
|
46
|
else { return $self->_search_best_match(@_) } |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub first_entry { |
|
214
|
4
|
|
|
4
|
1
|
431
|
my $self = shift; |
|
215
|
|
|
|
|
|
|
|
|
216
|
4
|
|
|
|
|
18
|
local $/ = "\012"; # ASCII 0/10 LF |
|
217
|
|
|
|
|
|
|
|
|
218
|
4
|
100
|
|
|
|
255
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
|
219
|
3
|
|
|
|
|
20
|
binmode $cdx, ':raw'; |
|
220
|
|
|
|
|
|
|
|
|
221
|
3
|
|
|
|
|
37
|
my $header = <$cdx>; # skip header to reach first entry |
|
222
|
3
|
|
|
|
|
11
|
my $offset = tell $cdx; |
|
223
|
3
|
|
|
|
|
17
|
return $self->_parse_cdx_entry($offset, scalar <$cdx>); |
|
224
|
|
|
|
|
|
|
} |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub entry_at { |
|
227
|
15
|
|
|
15
|
1
|
1105
|
my $self = shift; |
|
228
|
15
|
|
|
|
|
21
|
my $offset = shift; |
|
229
|
|
|
|
|
|
|
|
|
230
|
15
|
|
|
|
|
57
|
local $/ = "\012"; # ASCII 0/10 LF |
|
231
|
|
|
|
|
|
|
|
|
232
|
15
|
100
|
|
|
|
701
|
open my $cdx, '<', $self->{cdx_file} or croak "$self->{cdx_file}: $!"; |
|
233
|
14
|
|
|
|
|
91
|
binmode $cdx, ':raw'; |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
# one octet before requested position must be an end-of-line marker |
|
236
|
14
|
100
|
|
|
|
242
|
seek $cdx, $offset - 1, SEEK_SET or croak "seek $self->{cdx_file}: $!"; |
|
237
|
13
|
50
|
|
|
|
26
|
my $eol; defined(read $cdx, $eol, 1) or croak "read $self->{cdx_file}: $!"; |
|
|
13
|
|
|
|
|
188
|
|
|
238
|
13
|
100
|
|
|
|
246
|
croak "offset $offset in $self->{cdx_file} not a record boundary" |
|
239
|
|
|
|
|
|
|
unless $eol eq $/; |
|
240
|
|
|
|
|
|
|
|
|
241
|
12
|
|
|
|
|
77
|
return $self->_parse_cdx_entry($offset, scalar <$cdx>); |
|
242
|
|
|
|
|
|
|
} |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
1; |
|
245
|
|
|
|
|
|
|
__END__ |