File Coverage

blib/lib/WARC/Record.pm
Criterion Covered Total %
statement 53 57 92.9
branch 7 8 87.5
condition n/a
subroutine 24 25 96.0
pod 17 17 100.0
total 101 107 94.3


line stmt bran cond sub pod time code
1             package WARC::Record; # -*- CPerl -*-
2              
3 26     26   70549 use strict;
  26         59  
  26         828  
4 26     26   144 use warnings;
  26         43  
  26         646  
5              
6 26     26   125 use Carp;
  26         40  
  26         1738  
7 26     26   191 use Scalar::Util;
  26         62  
  26         2593  
8              
9             our @ISA = qw();
10              
11             require WARC; *WARC::Record::VERSION = \$WARC::VERSION;
12             require WARC::Date;
13              
14             =head1 NAME
15              
16             WARC::Record - one record from a WARC file
17              
18             =head1 SYNOPSIS
19              
20             use WARC; # or ...
21             use WARC::Volume; # or ...
22             use WARC::Collection;
23              
24             # WARC::Record objects are returned from ->record_at and ->search methods
25              
26             # Construct a record, as when preparing a WARC file
27             $warcinfo = new WARC::Record (type => 'warcinfo');
28              
29             # Accessors
30              
31             $value = $record->field($name);
32              
33             $version = $record->protocol; # analogous to HTTP::Message::protocol
34             $volume = $record->volume;
35             $offset = $record->offset;
36             $record = $record->next;
37              
38             $fields = $record->fields;
39              
40             ...
41              
42             =cut
43              
44 26     26   26212 use overload '<=>' => 'compareTo', 'cmp' => 'compareTo';
  26         20655  
  26         152  
45 26     26   2198 use overload fallback => 1;
  26         53  
  26         118  
46              
47             # This implementation uses a hash as the underlying object.
48             # Keys defined by this class:
49             #
50             # fields
51             # Embedded WARC::Fields object
52              
53             sub _dbg_dump {
54 24     24   42 my $self = shift;
55              
56 24         36 my $out = "WARC record [dumping as base class]\n";
57 24         43 my @out = map {s/^/ /gm; $_} $self->fields->as_string;
  24         157  
  24         67  
58 24         77 $out .= join("\n", @out);
59              
60 24         74 return $out;
61             }
62              
63             # This method can be overridden in subclasses for read-only objects.
64             sub _set {
65 0     0   0 my $self = shift;
66 0         0 my $key = shift;
67 0         0 my $value = shift;
68              
69 0         0 $self->{$key} = $value;
70             }
71              
72             =head1 DESCRIPTION
73              
74             C objects come in two flavors with a common interface.
75             Records read from WARC files are read-only and have meaningful return
76             values from the methods listed in "Methods on records from WARC files".
77             Records constructed in memory can be updated and those same methods all
78             return undef.
79              
80             =head2 Common Methods
81              
82             =over
83              
84             =item $record-Efields
85              
86             Get the internal C object that contains WARC record headers.
87              
88             =cut
89              
90 3215     3215 1 7498 sub fields { (shift)->{fields} }
91              
92             =item $record-Efield( $name )
93              
94             Get the value of the WARC header named $name from the internal
95             C object.
96              
97             =cut
98              
99             sub field {
100 5130     5130 1 7914 my $self = shift;
101 5130         8254 return $self->fields->field(shift);
102             }
103              
104             =item $record E=E $other_record
105              
106             =item $record-EcompareTo( $other_record )
107              
108             Compare two C objects according to a simple total order:
109             ordering by starting offset for two records in the same file, and by
110             filename of the containing C objects for records in different
111             files. Constructed C objects are assumed to come from a
112             volume named "" (the empty string) for this purpose, and are ordered in an
113             arbitrary but stable manner amongst themselves. Distinct constructed
114             C objects never compare as equal.
115              
116             Perl constructs a C<==> operator using this method, so WARC record objects
117             will compare as equal iff they refer to the same physical record.
118              
119             =cut
120              
121             sub compareTo {
122 14     14 1 4435 my $a = shift;
123 14         25 my $b = shift;
124 14         20 my $swap = shift;
125              
126             # sort in-memory-only records ahead of on-disk records
127 14 100       31 return $swap ? 1 : -1 if defined $b->volume;
    100          
128              
129             # neither record is from a WARC volume
130 7         20 my $cmp = (Scalar::Util::refaddr $a) <=> (Scalar::Util::refaddr $b);
131              
132 7 50       49 return $swap ? 0-$cmp : 0+$cmp;
133             }
134              
135             =back
136              
137             =head3 Convenience getters
138              
139             =over
140              
141             =item $record-Etype
142              
143             Alias for C<$record-Efield('WARC-Type')>.
144              
145             =cut
146              
147 286     286 1 631 sub type { (shift)->field('WARC-Type') }
148              
149             =item $record-Eid
150              
151             Alias for C<$record-Efield('WARC-Record-ID')>.
152              
153             =cut
154              
155 659     659 1 124338 sub id { (shift)->field('WARC-Record-ID') }
156              
157             =item $record-Edate
158              
159             Return the C<'WARC-Date'> field as a C object.
160              
161             =cut
162              
163 262     262 1 4148 sub date { WARC::Date->from_string((shift)->field('WARC-Date')) }
164              
165             =back
166              
167             =head2 Methods on records from WARC files
168              
169             These methods all return undef if called on a C object that
170             does not represent a record in a WARC file.
171              
172             =over
173              
174             =item $record-Eprotocol
175              
176             Return the format and version tag for this record. For WARC 1.0, this
177             method returns 'WARC/1.0'.
178              
179             =cut
180              
181 1     1 1 4 sub protocol { return undef }
182              
183             =item $record-Evolume
184              
185             Return the C object representing the file in which this
186             record is located.
187              
188             =cut
189              
190 11     11 1 57 sub volume { return undef }
191              
192             =item $record-Eoffset
193              
194             Return the file offset at which this record can be found.
195              
196             =cut
197              
198 1     1 1 4 sub offset { return undef }
199              
200             =item $record-Elogical
201              
202             Return the logical record object for this record. Logical records
203             reassemble WARC continuation segments. Records recorded without using WARC
204             segmentation are their own logical records. Reassembled logical records
205             are also their own logical records.
206              
207             =cut
208              
209 1     1 1 5 sub logical { return undef }
210              
211             =item $record-Esegments
212              
213             Return a list of segments for this record. A record recorded without using
214             WARC segmentation, including a segment of a larger logical record, is
215             considered its own only segment. A constructed record is considered to
216             have no segments at all.
217              
218             This method exists on all records to allow
219             C<$record-Elogical-Esegments> to work.
220              
221             =cut
222              
223 1     1 1 5 sub segments { return () }
224              
225             =item $record-Enext
226              
227             Return the next C in the WARC file that contains this record.
228             Returns an undefined value if called on the last record in a file.
229              
230             =cut
231              
232 1     1 1 5 sub next { return undef }
233              
234             =item $record-Eopen_block
235              
236             Return a tied filehandle that reads the WARC record block.
237              
238             The WARC record block is the content of a WARC record, analogous to the
239             entity body in an C.
240              
241             =cut
242              
243 5     5 1 13 sub open_block { return undef }
244              
245             =item $record-Eopen_continued
246              
247             Return a tied filehandle that reads the logical WARC record block.
248              
249             For records that do not use WARC segmentation, this is effectively an alias
250             for C<$record-Eopen_block>. For records that span multiple segments,
251             this is an alias for C<$record-Elogical-Eopen_block>.
252              
253             =cut
254              
255 1     1 1 5 sub open_continued { return undef }
256              
257             =item $record-Ereplay
258              
259             =item $record-Ereplay( as =E $type )
260              
261             Return a protocol-specific object representing the record contents.
262              
263             This method returns undef if the library does not recognize the protocol
264             message stored in the record and croaks if a requested conversion is not
265             supported.
266              
267             A record with Content-Type "application/http" with an appropriate "msgtype"
268             parameter produces an C or C object. The
269             returned object may be a subclass to support deferred loading of entity
270             bodies.
271              
272             A request to replay a record "as =E http" attempts to convert whatever
273             is stored in the record to an HTTP exchange, analogous to the "everything
274             is HTTP" interface that C provides.
275              
276             =cut
277              
278 1     1 1 4 sub replay { return undef }
279              
280             =item $record-Eopen_payload
281              
282             Return a tied filehandle that reads the WARC record payload.
283              
284             The WARC record payload is defined as the decoded content of the protocol
285             response or other resource stored in the record. This method returns undef
286             if called on a WARC record that has no payload or content that we do not
287             recognize.
288              
289             =cut
290              
291 1     1 1 5 sub open_payload { return undef }
292              
293             =back
294              
295             =head2 Methods on fresh WARC records
296              
297             =over
298              
299             =item $record = new WARC::Record (I =E I, ...)
300              
301             Construct a fresh WARC record, suitable for use with C.
302              
303             =cut
304              
305             sub new {
306 6     6 1 1391 my $class = shift;
307 6         17 my %opt = @_;
308              
309 6         15 foreach my $name (qw/type/)
310 6 100       312 { croak "required field '$name' not specified" unless $opt{$name} }
311              
312 5         22 my $fields = new WARC::Fields ('WARC-Type' => $opt{type});
313              
314 5         22 bless { fields => $fields }, $class;
315             }
316              
317             =back
318              
319             =cut
320              
321             1;
322             __END__