line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WARC::Index::File::CDX::Builder; # -*- CPerl -*- |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
74537
|
use strict; |
|
1
|
|
|
|
|
14
|
|
|
1
|
|
|
|
|
40
|
|
4
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
72
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require WARC::Index::Builder; |
7
|
|
|
|
|
|
|
our @ISA = qw(WARC::Index::Builder); |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
require WARC; *WARC::Index::File::CDX::Builder::VERSION = \$WARC::VERSION; |
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
604
|
use URI; |
|
1
|
|
|
|
|
7057
|
|
|
1
|
|
|
|
|
32
|
|
12
|
1
|
|
|
1
|
|
7
|
use Carp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
69
|
|
13
|
1
|
|
|
1
|
|
6
|
use Cwd qw//; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
17
|
|
14
|
1
|
|
|
1
|
|
5
|
use File::Spec; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
25
|
|
15
|
1
|
|
|
1
|
|
4
|
use Fcntl qw/:seek/; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1416
|
|
16
|
|
|
|
|
|
|
require File::Spec::Unix; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our %Record_Field_Handlers = |
19
|
|
|
|
|
|
|
# each handler is called with WARC::Record and index builder objects and |
20
|
|
|
|
|
|
|
# returns the text value for that field or undef, which is written as '-' |
21
|
|
|
|
|
|
|
(a => sub { (shift)->field('WARC-Target-URI') }, |
22
|
|
|
|
|
|
|
k => sub { (shift)->field('WARC-Payload-Digest') }, |
23
|
|
|
|
|
|
|
u => sub { (shift)->id }, |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
b => sub { my $date = (shift)->date; |
26
|
|
|
|
|
|
|
$date =~ y/-T:Z//d; substr $date, 0, 14 }, |
27
|
|
|
|
|
|
|
N => sub { my $uri = (shift)->field('WARC-Target-URI'); |
28
|
|
|
|
|
|
|
return undef unless $uri; |
29
|
|
|
|
|
|
|
$uri = new URI ($uri); |
30
|
|
|
|
|
|
|
return undef unless $uri->can('host') && $uri->can('path'); |
31
|
|
|
|
|
|
|
my $surt_host = join ',', reverse split /[.]/, $uri->host; |
32
|
|
|
|
|
|
|
return $surt_host.')'.$uri->path |
33
|
|
|
|
|
|
|
}, |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
g => sub { my $record = shift; my $builder = shift; |
36
|
|
|
|
|
|
|
return $builder->_get_relvolname($record->volume) }, |
37
|
|
|
|
|
|
|
S => sub { (shift)->{sl_packed_size} }, |
38
|
|
|
|
|
|
|
v => sub { my $record = shift; |
39
|
|
|
|
|
|
|
return undef if $record->volume->filename !~ m/[.]warc\z/; |
40
|
|
|
|
|
|
|
return $record->offset }, |
41
|
|
|
|
|
|
|
V => sub { my $record = shift; |
42
|
|
|
|
|
|
|
return undef if $record->volume->filename =~ m/[.]warc\z/; |
43
|
|
|
|
|
|
|
return $record->offset }, |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# HTTP responses only |
46
|
|
|
|
|
|
|
m => sub { my $response = (shift)->replay; |
47
|
|
|
|
|
|
|
return undef unless UNIVERSAL::can($response, 'headers'); |
48
|
|
|
|
|
|
|
$response->headers->content_type }, |
49
|
|
|
|
|
|
|
r => sub { my $response = (shift)->replay; |
50
|
|
|
|
|
|
|
return undef unless UNIVERSAL::can($response, 'headers'); |
51
|
|
|
|
|
|
|
$response->headers->header('Location') }, |
52
|
|
|
|
|
|
|
s => sub { my $response = (shift)->replay; |
53
|
|
|
|
|
|
|
return undef unless UNIVERSAL::can($response, 'code'); |
54
|
|
|
|
|
|
|
$response->code }, |
55
|
|
|
|
|
|
|
); |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# This implementation uses a hash as the underlying structure. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Keys defined by this class: |
60
|
|
|
|
|
|
|
# |
61
|
|
|
|
|
|
|
# file_name |
62
|
|
|
|
|
|
|
# Name of CDX file where records will be appended. |
63
|
|
|
|
|
|
|
# file |
64
|
|
|
|
|
|
|
# Handle opened for writing/appending on that file. |
65
|
|
|
|
|
|
|
# fields |
66
|
|
|
|
|
|
|
# CDX field letters to be written. |
67
|
|
|
|
|
|
|
# fieldgen |
68
|
|
|
|
|
|
|
# Array of handlers to call to produce field values. |
69
|
|
|
|
|
|
|
# delimiter |
70
|
|
|
|
|
|
|
# Field delimiter used in CDX file. Default is space; cannot be set |
71
|
|
|
|
|
|
|
# as an option but can be read from an existing CDX file header. |
72
|
|
|
|
|
|
|
# volnames |
73
|
|
|
|
|
|
|
# Hash mapping volume names to relative paths from the CDX file. |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub _get_relvolname { |
76
|
37
|
|
|
37
|
|
67
|
my $self = shift; |
77
|
37
|
|
|
|
|
81
|
my $name = (shift)->filename; |
78
|
|
|
|
|
|
|
|
79
|
37
|
100
|
|
|
|
156
|
return $self->{volnames}->{$name} if defined $self->{volnames}{$name}; |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# otherwise ... |
82
|
4
|
|
|
|
|
75
|
my ($vol, $cdx_dirs, $file) = File::Spec->splitpath($self->{file_name}); |
83
|
4
|
|
|
|
|
354
|
my $relname = File::Spec->abs2rel |
84
|
|
|
|
|
|
|
($name, File::Spec->catpath($vol, $cdx_dirs, undef)); |
85
|
4
|
|
|
|
|
40
|
my ($rvol, $rel_dirs, $rel_file) = File::Spec->splitpath($relname); |
86
|
4
|
|
|
|
|
18
|
my @rel_dirs = File::Spec->splitdir($rel_dirs); |
87
|
4
|
|
|
|
|
30
|
my $warcfilename = File::Spec::Unix->catpath |
88
|
|
|
|
|
|
|
($rvol, File::Spec::Unix->catdir(@rel_dirs), $rel_file); |
89
|
4
|
|
|
|
|
23
|
return $self->{volnames}{$name} = $warcfilename; |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub _new { |
93
|
7
|
|
|
7
|
|
5475
|
my $class = shift; |
94
|
7
|
|
|
|
|
28
|
my %args = @_; |
95
|
|
|
|
|
|
|
|
96
|
7
|
100
|
|
|
|
278
|
croak "required parameter 'into' missing" unless $args{into}; |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
my $ob = { delimiter => ' ', fields => [qw/N b a m s k r M S V g u/], |
99
|
6
|
|
|
|
|
386
|
file_name => Cwd::abs_path($args{into}) }; |
100
|
|
|
|
|
|
|
|
101
|
6
|
100
|
|
|
|
35
|
$ob->{fields} = $args{fields} if $args{fields}; |
102
|
|
|
|
|
|
|
|
103
|
6
|
50
|
|
|
|
358
|
open my $fh, '+>>', $args{into} or croak $args{into}.': '.$!; |
104
|
|
|
|
|
|
|
{ |
105
|
6
|
|
|
|
|
21
|
local $/ = "\012"; |
|
6
|
|
|
|
|
38
|
|
106
|
6
|
50
|
|
|
|
57
|
seek $fh, 0, SEEK_SET or croak 'seek '.$args{into}.': '.$!; |
107
|
6
|
|
|
|
|
115
|
my $header = <$fh>; |
108
|
6
|
100
|
|
|
|
25
|
if ($header) { |
109
|
|
|
|
|
|
|
$header =~ m/^(.)CDX((?:\1[[:alpha:]])+)/ |
110
|
3
|
100
|
|
|
|
438
|
or croak $args{into}.' exists but lacks CDX header'; |
111
|
1
|
|
|
|
|
5
|
$ob->{delimiter} = $1; |
112
|
1
|
|
|
|
|
18
|
$ob->{fields} = [split /\Q$1/, $2]; |
113
|
1
|
|
|
|
|
3
|
shift @{$ob->{fields}}; # remove leading empty field |
|
1
|
|
|
|
|
3
|
|
114
|
|
|
|
|
|
|
} else { |
115
|
|
|
|
|
|
|
# write CDX header |
116
|
3
|
|
|
|
|
11
|
print $fh ' CDX ', join(' ', @{$ob->{fields}}), "\012"; |
|
3
|
|
|
|
|
24
|
|
117
|
|
|
|
|
|
|
} |
118
|
4
|
50
|
|
|
|
121
|
seek $fh, 0, SEEK_END or croak 'seek '.$args{into}.': '.$!; |
119
|
|
|
|
|
|
|
} |
120
|
4
|
|
|
|
|
18
|
$ob->{file} = $fh; |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
$ob->{fieldgen} = |
123
|
4
|
100
|
|
29
|
|
11
|
[map { $Record_Field_Handlers{$_} or sub { undef } } @{$ob->{fields}}]; |
|
30
|
|
|
|
|
105
|
|
|
29
|
|
|
|
|
60
|
|
|
4
|
|
|
|
|
13
|
|
124
|
|
|
|
|
|
|
|
125
|
4
|
|
|
|
|
82
|
bless $ob, $class |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# inherit add |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub _add_record { |
131
|
37
|
|
|
37
|
|
76
|
my $self = shift; |
132
|
37
|
|
|
|
|
62
|
my $record = shift; |
133
|
|
|
|
|
|
|
|
134
|
356
|
100
|
100
|
|
|
1077
|
my $line = join $self->{delimiter}, map { defined $_ && $_ ne '' ? $_ : '-' } |
135
|
37
|
|
|
|
|
74
|
map { $_->($record, $self) } @{$self->{fieldgen}}; |
|
372
|
|
|
|
|
2760
|
|
|
37
|
|
|
|
|
105
|
|
136
|
|
|
|
|
|
|
|
137
|
37
|
|
|
|
|
111
|
print {$self->{file}} $line, "\012"; |
|
37
|
|
|
|
|
256
|
|
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub flush { |
141
|
4
|
|
|
4
|
1
|
33
|
my $self = shift; |
142
|
|
|
|
|
|
|
|
143
|
4
|
|
|
|
|
138
|
seek $self->{file}, 0, SEEK_END |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
1; |
147
|
|
|
|
|
|
|
__END__ |