line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WARC::Builder; # -*- CPerl -*- |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
70665
|
use strict; |
|
1
|
|
|
|
|
11
|
|
|
1
|
|
|
|
|
30
|
|
4
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
187
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our @ISA = qw(); |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
require WARC; *WARC::Builder::VERSION = \$WARC::VERSION; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
require WARC::Record; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
WARC::Builder - Web ARChive construction support for Perl |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use WARC::Builder; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
$warcinfo_data = new WARC::Fields (software => 'MyWebCrawler/1.2.3 ...', |
23
|
|
|
|
|
|
|
format => 'WARC File Format 1.0', |
24
|
|
|
|
|
|
|
# other fields omitted ... |
25
|
|
|
|
|
|
|
); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
$warcinfo = new WARC::Record (type => 'warcinfo', |
28
|
|
|
|
|
|
|
content => $warcinfo_data); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# for a small-scale crawl |
31
|
|
|
|
|
|
|
$build = new WARC::Builder (warcinfo => $warcinfo, |
32
|
|
|
|
|
|
|
filename => $warcfilename); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# for a large-scale crawl |
35
|
|
|
|
|
|
|
$index1 = build WARC::Index::File::CDX (into => $indexprefix.'.cdx'); |
36
|
|
|
|
|
|
|
$index2 = build WARC::Index::File::SDBM (into => $indexprefix.'.sdbm'); |
37
|
|
|
|
|
|
|
$build = new WARC::Builder (warcinfo => $warcinfo, |
38
|
|
|
|
|
|
|
filename_template => |
39
|
|
|
|
|
|
|
$warcprefix.'-%s-%05d-'.$hostname.'.warc.gz', |
40
|
|
|
|
|
|
|
index => [$index1, $index2]); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# for each collected object |
43
|
|
|
|
|
|
|
$build->append(@records); # or ... |
44
|
|
|
|
|
|
|
$build->append($record1, $record2, ... ); |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 DESCRIPTION |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
The C<WARC::Builder> class is the high-level interface for writing WARC |
49
|
|
|
|
|
|
|
archives. It is a very simple interface, because, at this level, WARC is a |
50
|
|
|
|
|
|
|
very simple format: a simple sequence of WARC records, which |
51
|
|
|
|
|
|
|
C<WARC::Builder> accepts as C<WARC::Record> objects to append to the |
52
|
|
|
|
|
|
|
in-progress WARC file. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
WARC file size limits are handled automatically if configured. |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head2 Methods |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=over |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=item $build = new WARC::Builder (I<key> =E<gt> I<value>, ...) |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Construct a C<WARC::Builder> object. The following keys are supported: |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=over |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=item index =E<gt> [$index] |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=item index =E<gt> [$index1, $index2, ...] |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
If set, must be an array reference of a list of index builder objects. |
71
|
|
|
|
|
|
|
Each newly-added WARC::Record will be presented to all index builder |
72
|
|
|
|
|
|
|
objects in this list. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=item filename =E<gt> $warcfilename |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
If set, create a single WARC file with the given file name. The file name |
77
|
|
|
|
|
|
|
must match m/\.warc(?:\.gz)?$/. The presence of a final ".gz" indicates |
78
|
|
|
|
|
|
|
that the WARC file should be written with per-record gzip compression. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
This option is mutually exclusive with the C<filename_template> option. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Using this option inhibits starting a new WARC file and causes the |
83
|
|
|
|
|
|
|
C<max_file_size> option to be ignored. A warning is emitted in this case. |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=item filename_template =E<gt> $warcprefix.'-%s-%05d-'.$hostname.'.warc.gz' |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
Establish an sprintf format string to construct file names. The file name |
88
|
|
|
|
|
|
|
produced by the template string must match m/\.warc(?:\.gz)?$/. The |
89
|
|
|
|
|
|
|
presence of a final ".gz" indicates that the WARC file should be written |
90
|
|
|
|
|
|
|
with per-record gzip compression. |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
The C<filename_template> option gives the format string, while |
93
|
|
|
|
|
|
|
C<filename_template_vars> gives an array reference of named parameters to |
94
|
|
|
|
|
|
|
be used with the format. |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
If constructing file names in accordance with the IIPC WARC implementation |
97
|
|
|
|
|
|
|
guidelines, this string should be of the form |
98
|
|
|
|
|
|
|
'PREFIX-%s-%05d-HOSTNAME.warc.gz' where PREFIX is any chosen prefix to name |
99
|
|
|
|
|
|
|
the crawl and HOSTNAME is the name or other identifier for the machine |
100
|
|
|
|
|
|
|
writing the file. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
This option is mutually exclusive with the C<filename> option. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item filename_template_vars =E<gt> [qw/timestamp serial/] |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Provide the list of parameters to the sprintf call used to produce a WARC |
107
|
|
|
|
|
|
|
filename from the C<filename_template> option. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
The available variables are: |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=over |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=item serial |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
A number, incremented each time adding a record causes a new WARC file to |
116
|
|
|
|
|
|
|
be started. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=item timestamp |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
A 14-digit timestamp in the YYYYmmddHHMMSS format recommended in the IIPC |
121
|
|
|
|
|
|
|
WARC implementation guidelines. The timestamp is always in UTC. The time |
122
|
|
|
|
|
|
|
used is the time at which the C<WARC::Builder> object was constructed and |
123
|
|
|
|
|
|
|
is constant between WARC files. This should be substituted as a string. |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=back |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
Default [qw/timestamp serial/] in accordance with IIPC guidelines. |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=item first_serial =E<gt> $count |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
The initial value of the C<serial> filename variable for this object. |
132
|
|
|
|
|
|
|
Default 0. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=item max_file_size =E<gt> $size |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Maximum size of a WARC file. A new WARC file is started if appending a |
137
|
|
|
|
|
|
|
record would cause the current file to exceed this length. |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
The limit can be specified as an exact number of bytes, or a number |
140
|
|
|
|
|
|
|
followed by a size suffix m/[KMG]i?/. The "K", "M", and "G" suffixes |
141
|
|
|
|
|
|
|
indicate base-10 multiples (10**(3*n)), while the "Ki", "Mi", and "Gi" |
142
|
|
|
|
|
|
|
suffixes indicate base-2 multiples (2**(10*n)) widely used in computing. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Default "1G" == 1_000_000_000. |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=item warcinfo =E<gt> $warcinfo_record |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
A C<WARC::Record> object of type "warcinfo" that will be written at the |
149
|
|
|
|
|
|
|
start of each WARC file. This record will be cloned and written with a |
150
|
|
|
|
|
|
|
distinct "WARC-Record-ID" as the first record in each WARC file, including |
151
|
|
|
|
|
|
|
the first. As a consequence, it does not require a "WARC-Record-ID" header |
152
|
|
|
|
|
|
|
and any "WARC-Record-ID" given is silently ignored. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
Each clone of this record will also have the "WARC-Filename" header added. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Each clone of this record will also have the "WARC-Date" header set to the |
157
|
|
|
|
|
|
|
time at which the C<WARC::Builder> object was constructed. |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=item warcversion =E<gt> 'WARC/1.0' |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Set the version of the WARC format to be written. This string is the first |
162
|
|
|
|
|
|
|
line of each WARC record. It must begin with the prefix 'WARC/' and should |
163
|
|
|
|
|
|
|
be the version from the WARC specification that the crawler follows. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Default "WARC/1.0". |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=back |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
0
|
1
|
|
sub new { |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=item $build-E<gt>append( $record1, ... ) |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
Add any number of C<WARC::Record> objects to the growing WARC file. If |
177
|
|
|
|
|
|
|
WARC file size limits are configured, and a record would cause the current |
178
|
|
|
|
|
|
|
WARC file to exceed the configured size limits, a new WARC file is opened |
179
|
|
|
|
|
|
|
automatically. |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
All records passed to a single C<append> call are added to the same WARC |
182
|
|
|
|
|
|
|
file. If a new WARC file is to be started, it will be started B<before> |
183
|
|
|
|
|
|
|
any records are written. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
All records passed to a single C<append> call are considered "concurrent" |
186
|
|
|
|
|
|
|
and all subsequent records will have a "WARC-Concurrent-To" header added |
187
|
|
|
|
|
|
|
referencing the first record, if they do not already have a |
188
|
|
|
|
|
|
|
"WARC-Concurrent-To" header. This is a convenience feature for simpler |
189
|
|
|
|
|
|
|
crawlers and is inhibited if any record already has a "WARC-Concurrent-To" |
190
|
|
|
|
|
|
|
header when C<append> is called. |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
If a C<WARC::Record> passed to this method lacks a "WARC-Record-ID" header, |
193
|
|
|
|
|
|
|
a warning will be emitted using carp(), a UUID will be generated, and a |
194
|
|
|
|
|
|
|
record ID of the form "urn:uuid:UUID" will be assigned. If the record |
195
|
|
|
|
|
|
|
object is read-only, this method will croak() instead. |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
If a C<WARC::Record> passed to this method lacks any of the "WARC-Date", |
198
|
|
|
|
|
|
|
"WARC-Type", or "Content-Length" headers, this method will croak(). |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=cut |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
0
|
1
|
|
sub append { |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=back |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
1; |
210
|
|
|
|
|
|
|
__END__ |