File Coverage

blib/lib/WARC/Record/FromVolume.pm
Criterion Covered Total %
statement 129 133 96.9
branch 51 66 77.2
condition 12 12 100.0
subroutine 22 24 91.6
pod 12 12 100.0
total 226 247 91.5


line stmt bran cond sub pod time code
1             package WARC::Record::FromVolume; # -*- CPerl -*-
2              
3 26     26   71346 use strict;
  26         60  
  26         895  
4 26     26   141 use warnings;
  26         48  
  26         1500  
5              
6             our @ISA = qw(WARC::Record);
7             our @CARP_NOT = (@ISA, qw(WARC::Volume WARC::Record::Stub));
8              
9 26     26   564 use WARC; *WARC::Record::FromVolume::VERSION = \$WARC::VERSION;
  26         62  
  26         881  
10              
11 26     26   149 use Carp;
  26         86  
  26         1531  
12 26     26   177 use Fcntl 'SEEK_SET';
  26         55  
  26         1375  
13 26     26   12408 use Symbol 'geniosym';
  26         19971  
  26         1539  
14 26     26   14875 use IO::Uncompress::Gunzip '$GunzipError';
  26         1146509  
  26         46729  
15              
16             require WARC::Fields;
17             require WARC::Record;
18             require WARC::Record::Block;
19             require WARC::Record::Replay;
20              
21 0     0   0 sub _set { croak "attempt to modify WARC record in file" }
22              
23             # The overload to a method call is inherited.
24             sub compareTo {
25 134     134 1 25773 my $a = shift;
26 134         208 my $b = shift;
27 134         199 my $swap = shift;
28              
29             # sort in-memory-only records ahead of on-disk records
30 134 100       300 return $swap ? -1 : 1 unless defined $b->volume;
    100          
31              
32 131 100 100     349 my $cmp =
33             ((($a->volume->filename eq $b->volume->filename)
34             || ($a->volume->_file_tag eq $b->volume->_file_tag))
35             ? ($a->offset <=> $b->offset)
36             : ($a->volume->filename cmp $b->volume->filename));
37              
38 131 100       1740 return $swap ? 0-$cmp : 0+$cmp;
39             }
40              
41             # This implementation uses a hash as the underlying structure.
42              
43             # Keys inherited from WARC::Record base class:
44             #
45             # fields
46              
47             # Keys defined by this class:
48             #
49             # volume
50             # Parent WARC::Volume object
51             # collection (optional)
52             # Parent WARC::Collection object, if record found via a collection
53             # offset
54             # Offset of start-of-record within parent volume
55             # compression
56             # Name of decompression filter used with this record
57             # data_offset
58             # Offset of data block within record (possibly compressed)
59             # sl_packed_size
60             # Size of compressed data block according to "sl" gzip extension
61             # sl_full_size
62             # Size of uncompressed data block according to "sl" gzip extension
63             # protocol
64             # WARC version found at start of record
65             # logical (optional)
66             # Weak reference to logical record object containing this segment
67             # (Defined by this class, but only set by WARC::Record::Logical.)
68              
69             # Keys tested by logical record heuristics:
70             #
71             # compression
72             # defined iff record is compressed
73             # sl_packed_size
74             # defined iff compressed record can be skipped without reading data block
75              
76             # Keys used in index writers:
77             #
78             # sl_packed_size
79             # used for "S" field in CDX indexes
80              
81 1729     1729   28400 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  1729         6680  
82              
83             sub _dbg_dump {
84 36     36   16529 my $self = shift;
85              
86 36         109 my $out = 'WARC '.$self->field('WARC-Type').' record ['.$self->protocol.']';
87 36 100       168 $out .= ' [via '.$self->{compression}.']' if defined $self->{compression};
88 36         70 $out .= "\n";
89              
90 36         96 $out .= ' id '.$self->id."\n";
91              
92 36         96 $out .= ' at '.$self->offset.' in '.$self->volume."\n";
93             $out .= ' "sl" header: '.$self->{sl_packed_size}.' packed from '
94 36 100       134 .$self->{sl_full_size}." octets\n" if defined $self->{sl_full_size};
95              
96 36         110 $out .= ' data begins at offset '.$self->{data_offset};
97 36 100       102 $out .= ' within '.(defined $self->{compression} ? 'record' : 'volume');
98 36         57 $out .="\n";
99              
100 36         124 return $out;
101             }
102              
103             sub _get_compression_error {
104 3     3   13 my $self = shift;
105              
106 3 100       15 if (not defined $self->{compression}) {
    100          
107 1         11 return '(record not compressed)';
108             } elsif ($self->{compression} eq 'IO::Uncompress::Gunzip') {
109 1         7 return $GunzipError;
110             } else {
111 1         8 die "unknown compression method";
112             }
113             }
114              
115 1     1 1 927 sub new { croak "WARC records are read from volumes" }
116              
117             sub _read {
118 632     632   1330 my $class = shift;
119 632         782 my $volume = shift;
120 632         884 my $offset = shift;
121              
122 632         819 my $handle;
123 632 100       1131 if (ref $offset) { # I/O handle passed in instead
124 4         6 $handle = $offset;
125 4         8 $offset = tell $handle;
126             } else { # open new handle and seek to offset
127 628         1490 $handle = $volume->open;
128 628 50       4556 seek $handle, $offset, SEEK_SET or die "seek: $!";
129             }
130              
131 632         2518 my %ob = (volume => $volume, offset => $offset);
132              
133 632         843 my $magic; my $protocol = '';
  632         916  
134 632 50       7915 defined(read $handle, $magic, 6) or die "read: $!";
135 632 100       2116 return undef if $magic eq ''; # end-of-file reached
136              
137 604 100       1471 if ($magic eq 'WARC/1') {
    100          
138             # uncompressed WARC record found ==> pass it on through
139 512         861 $protocol = $magic;
140             } elsif (unpack('H4', $magic) eq '1f8b') {
141             # gzip signature found ==> check for extension header and stack filter
142              
143 90 100       302 if (unpack('x3C', $magic) & 0x04) { # FLG.FEXTRA is set
144 35 50       118 defined(read $handle, $magic, 6, 6) or die "read: $!";
145 35         95 my $xlen = unpack 'v', substr $magic, -2;
146 35 50       60 my $extra; defined(read $handle, $extra, $xlen) or die "read: $!";
  35         85  
147 35         137 my @extra = unpack '(a2 v/a*)*', $extra;
148 35         90 $magic .= $extra;
149             # @extra is now (tag => $data)...
150 35         103 for (my $i = 0; $i < @extra; $i += 2) {
151 30 100 100     158 if ($extra[$i] eq 'sl' and length($extra[1+$i]) == 8)
152 20         116 { @ob{qw/sl_packed_size sl_full_size/} = unpack 'VV', $extra[1+$i] }
153             }
154             }
155              
156 90 50       608 $handle = new IO::Uncompress::Gunzip ($handle,
157             Prime => $magic, MultiStream => 0,
158             AutoClose => 1, Transparent => 0)
159             or die "IO::Uncompress::Gunzip: $GunzipError";
160 90         137749 $ob{compression} = 'IO::Uncompress::Gunzip';
161             } else
162 2         12 { croak "WARC record header not found at offset $offset in $volume\n"
163             ." found [".join(' ', unpack '(H2)*', $magic)."] instead" }
164              
165             # read WARC version
166 602         1982 $protocol .= <$handle>;
167 602         9810 $protocol =~ s/[[:space:]]+$//;
168             # The WARC version read from the file is appended because an
169             # uncompressed WARC record is recognized by the first six bytes of the
170             # WARC version tag, which were transferred to $protocol if found.
171 602 50       2028 $protocol =~ m/^WARC/
172             or croak "WARC record header not found after decompression\n"
173             ." found [".join(' ', unpack '(H2)*', $protocol)."] instead";
174 602         1339 $ob{protocol} = $protocol;
175              
176 602         2952 $ob{fields} = parse WARC::Fields from => $handle;
177 602         1652 $ob{fields}->set_readonly;
178              
179 602         1372 $ob{data_offset} = tell $handle;
180              
181 602         7936 close $handle;
182              
183 602         6681 { our $_total_read; $_total_read++ }
  602         685  
  602         909  
184              
185 602         4480 bless \%ob, $class;
186             }
187              
188 73     73 1 246 sub protocol { (shift)->{protocol} }
189              
190 1456     1456 1 4177 sub volume { (shift)->{volume} }
191              
192 841     841 1 7764 sub offset { (shift)->{offset} }
193              
194             sub logical {
195 162     162 1 315 my $self = shift;
196              
197 162         404 my $segment_header_value = $self->field('WARC-Segment-Number');
198 162 50       464 if (defined $self->{logical}) {
    50          
199 0         0 return $self->{logical}; # cached object remains valid ==> return it
200             } elsif (defined $segment_header_value) {
201 0         0 return _read WARC::Record::Logical $self;
202             } else {
203 162         424 return $self; # no continuation records present
204             }
205             }
206              
207 36     36 1 154 sub segments { return shift }
208              
209             sub next {
210 355     355 1 812 my $self = shift;
211              
212 355         469 my $next = undef;
213              
214 355 100       1098 if ($self->{sl_packed_size}) { # gzip "sl" extended header available
    100          
215 12         31 my $handle = $self->volume->open;
216              
217             # seek to read 32-bit ISIZE field at end of gzip stream
218 12 50       35 seek $handle, $self->offset + $self->{sl_packed_size} - 4, SEEK_SET
219             or die "seek: $!";
220 12 50       27 my $isize; defined(read $handle, $isize, 4) or die "read: $!";
  12         174  
221              
222 12 100 100     86 if (length $isize > 0 # read off the end yields nothing
223             and $self->{sl_full_size} == unpack 'V', $isize) { # ... and looks valid
224 4         13 $next = _read WARC::Record::FromVolume $self->volume, $handle;
225 4         122 close $handle;
226 4         20 return $next;
227             } else {
228 8         20 carp "extended 'sl' header was found to be invalid\n"
229             .' in record at '.($self->offset).' in '.($self->volume);
230             }
231             } elsif (not defined $self->{compression}) { # WARC record is not compressed
232             return _read WARC::Record::FromVolume $self->volume,
233 311         573 $self->{data_offset} + $self->field('Content-Length') + 4;
234             }
235              
236             # if we get here, we have to scan for the end of the record
237 40         3335 my $handle = $self->volume->open;
238 40 50       129 seek $handle, $self->offset, SEEK_SET or die "seek: $!";
239              
240             my $zhandle = $self->{compression}->new
241 40 50       299 ($handle, MultiStream => 0, AutoClose => 0)
242             or die "$self->{compression}: ".$self->_get_compression_error;
243 40 50       58663 seek $zhandle, $self->{data_offset} + $self->field('Content-Length'), SEEK_SET
244             or die "zseek: $! ".$self->_get_compression_error;
245 40 50       3226 my $end; defined(read $zhandle, $end, 4)
  40         113  
246             or die "zread: $! ".$self->_get_compression_error;
247 40 50       1536 croak "end-of-record marker not found" unless $end eq (WARC::CRLF x 2);
248              
249             # The main handle is somewhere *after* the actual end of the block
250             # because IO::Uncompress::Gunzip reads ahead. We can get the contents
251             # of that "read ahead" buffer and use that to adjust our final offset.
252 40         98 $next = _read WARC::Record::FromVolume $self->volume,
253             (tell($handle) - length($zhandle->trailingData));
254              
255 40         1069 close $zhandle; close $handle;
  40         1100  
256              
257 40         194 return $next;
258             }
259              
260             sub open_block {
261 128     128 1 208 my $self = shift;
262              
263 128         329 my $xhandle = Symbol::geniosym;
264 128         3409 tie *$xhandle, 'WARC::Record::Block', $self;
265              
266 128         498 return $xhandle;
267             }
268              
269 126     126 1 288 sub open_continued { (shift)->logical->open_block }
270              
271             sub replay {
272 140     140 1 4169 my $self = shift;
273              
274 140         378 my @handlers = WARC::Record::Replay::find_handlers($self);
275              
276 140         246 my $result = undef;
277 140   100     642 $result = (shift @handlers)->($self)
278             while scalar @handlers && !defined $result;
279              
280 140         1971 return $result;
281             }
282              
283             sub open_payload {
284 0     0 1   die "not yet implemented"
285             }
286              
287             1;
288             __END__