|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package WARC::Record::FromVolume;				# -*- CPerl -*-  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
71346
 | 
 use strict;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
60
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
895
 | 
    | 
| 
4
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
141
 | 
 use warnings;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
48
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1500
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @ISA = qw(WARC::Record);  | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @CARP_NOT = (@ISA, qw(WARC::Volume WARC::Record::Stub));  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
564
 | 
 use WARC; *WARC::Record::FromVolume::VERSION = \$WARC::VERSION;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
62
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
881
 | 
    | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
149
 | 
 use Carp;  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
86
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1531
 | 
    | 
| 
12
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
177
 | 
 use Fcntl 'SEEK_SET';  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
55
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1375
 | 
    | 
| 
13
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
12408
 | 
 use Symbol 'geniosym';  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
19971
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1539
 | 
    | 
| 
14
 | 
26
 | 
 
 | 
 
 | 
  
26
  
 | 
 
 | 
14875
 | 
 use IO::Uncompress::Gunzip '$GunzipError';  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1146509
 | 
    | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46729
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require WARC::Fields;  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require WARC::Record;  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require WARC::Record::Block;  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require WARC::Record::Replay;  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
21
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
0
 | 
 sub _set { croak "attempt to modify WARC record in file" }  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # The overload to a method call is inherited.  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub compareTo {  | 
| 
25
 | 
134
 | 
 
 | 
 
 | 
  
134
  
 | 
  
1
  
 | 
25773
 | 
   my $a = shift;  | 
| 
26
 | 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
208
 | 
   my $b = shift;  | 
| 
27
 | 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
199
 | 
   my $swap = shift;  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # sort in-memory-only records ahead of on-disk records  | 
| 
30
 | 
134
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
300
 | 
   return $swap ? -1 : 1 unless defined $b->volume;  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
32
 | 
131
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
349
 | 
   my $cmp =  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     ((($a->volume->filename eq $b->volume->filename)  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       || ($a->volume->_file_tag eq $b->volume->_file_tag))  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      ? ($a->offset <=> $b->offset)  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      : ($a->volume->filename cmp $b->volume->filename));  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
38
 | 
131
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1740
 | 
   return $swap ? 0-$cmp : 0+$cmp;  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # This implementation uses a hash as the underlying structure.  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  Keys inherited from WARC::Record base class:  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   fields  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  Keys defined by this class:  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   volume  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Parent WARC::Volume object  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   collection (optional)  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Parent WARC::Collection object, if record found via a collection  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   offset  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Offset of start-of-record within parent volume  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   compression  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Name of decompression filter used with this record  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   data_offset  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Offset of data block within record (possibly compressed)  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   sl_packed_size  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Size of compressed data block according to "sl" gzip extension  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   sl_full_size  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Size of uncompressed data block according to "sl" gzip extension  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   protocol  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	WARC version found at start of record  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   logical (optional)  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	Weak reference to logical record object containing this segment  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	  (Defined by this class, but only set by WARC::Record::Logical.)  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  Keys tested by logical record heuristics:  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   compression  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	defined iff record is compressed  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   sl_packed_size  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	defined iff compressed record can be skipped without reading data block  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  Keys used in index writers:  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   sl_packed_size  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #	used for "S" field in CDX indexes  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
81
 | 
1729
 | 
 
 | 
 
 | 
  
1729
  
 | 
 
 | 
28400
 | 
 sub DESTROY { our $_total_destroyed;	$_total_destroyed++ }  | 
| 
 
 | 
1729
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6680
 | 
    | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _dbg_dump {  | 
| 
84
 | 
36
 | 
 
 | 
 
 | 
  
36
  
 | 
 
 | 
16529
 | 
   my $self = shift;  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
86
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
109
 | 
   my $out = 'WARC '.$self->field('WARC-Type').' record ['.$self->protocol.']';  | 
| 
87
 | 
36
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
168
 | 
   $out .= ' [via '.$self->{compression}.']' if defined $self->{compression};  | 
| 
88
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
70
 | 
   $out .= "\n";  | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
90
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
96
 | 
   $out .= ' id '.$self->id."\n";  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
92
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
96
 | 
   $out .= ' at '.$self->offset.' in '.$self->volume."\n";  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $out .= '  "sl" header:  '.$self->{sl_packed_size}.' packed from '  | 
| 
94
 | 
36
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
134
 | 
     .$self->{sl_full_size}." octets\n" if defined $self->{sl_full_size};  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
110
 | 
   $out .= ' data begins at offset '.$self->{data_offset};  | 
| 
97
 | 
36
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
102
 | 
   $out .= ' within '.(defined $self->{compression} ? 'record' : 'volume');  | 
| 
98
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57
 | 
   $out .="\n";  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
100
 | 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
124
 | 
   return $out;  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _get_compression_error {  | 
| 
104
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
13
 | 
   my $self = shift;  | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
106
 | 
3
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
15
 | 
   if (not defined $self->{compression}) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
107
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
     return '(record not compressed)';  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } elsif ($self->{compression} eq 'IO::Uncompress::Gunzip') {  | 
| 
109
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     return $GunzipError;  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
111
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     die "unknown compression method";  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
115
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
927
 | 
 sub new { croak "WARC records are read from volumes" }  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _read {  | 
| 
118
 | 
632
 | 
 
 | 
 
 | 
  
632
  
 | 
 
 | 
1330
 | 
   my $class = shift;  | 
| 
119
 | 
632
 | 
 
 | 
 
 | 
 
 | 
 
 | 
782
 | 
   my $volume = shift;  | 
| 
120
 | 
632
 | 
 
 | 
 
 | 
 
 | 
 
 | 
884
 | 
   my $offset = shift;  | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
122
 | 
632
 | 
 
 | 
 
 | 
 
 | 
 
 | 
819
 | 
   my $handle;  | 
| 
123
 | 
632
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1131
 | 
   if (ref $offset) {		# I/O handle passed in instead  | 
| 
124
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     $handle = $offset;  | 
| 
125
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     $offset = tell $handle;  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {			# open new handle and seek to offset  | 
| 
127
 | 
628
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1490
 | 
     $handle = $volume->open;  | 
| 
128
 | 
628
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4556
 | 
     seek $handle, $offset, SEEK_SET or die "seek: $!";  | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
131
 | 
632
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2518
 | 
   my %ob = (volume => $volume, offset => $offset);  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
133
 | 
632
 | 
 
 | 
 
 | 
 
 | 
 
 | 
843
 | 
   my $magic; my $protocol = '';  | 
| 
 
 | 
632
 | 
 
 | 
 
 | 
 
 | 
 
 | 
916
 | 
    | 
| 
134
 | 
632
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
7915
 | 
   defined(read $handle, $magic, 6) or die "read: $!";  | 
| 
135
 | 
632
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
2116
 | 
   return undef if $magic eq '';	# end-of-file reached  | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
137
 | 
604
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1471
 | 
   if ($magic eq 'WARC/1') {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # uncompressed WARC record found ==> pass it on through  | 
| 
139
 | 
512
 | 
 
 | 
 
 | 
 
 | 
 
 | 
861
 | 
     $protocol = $magic;  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } elsif (unpack('H4', $magic) eq '1f8b') {  | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # gzip signature found ==> check for extension header and stack filter  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
143
 | 
90
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
302
 | 
     if (unpack('x3C', $magic) & 0x04) { # FLG.FEXTRA is set  | 
| 
144
 | 
35
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
118
 | 
       defined(read $handle, $magic, 6, 6) or die "read: $!";  | 
| 
145
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
95
 | 
       my $xlen = unpack 'v', substr $magic, -2;  | 
| 
146
 | 
35
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
60
 | 
       my $extra; defined(read $handle, $extra, $xlen) or die "read: $!";  | 
| 
 
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
85
 | 
    | 
| 
147
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
137
 | 
       my @extra = unpack '(a2 v/a*)*', $extra;  | 
| 
148
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
90
 | 
       $magic .= $extra;  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       # @extra is now (tag => $data)...  | 
| 
150
 | 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
103
 | 
       for (my $i = 0; $i < @extra; $i += 2) {  | 
| 
151
 | 
30
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
158
 | 
 	if ($extra[$i] eq 'sl' and length($extra[1+$i]) == 8)  | 
| 
152
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
116
 | 
 	  { @ob{qw/sl_packed_size sl_full_size/} = unpack 'VV', $extra[1+$i] }  | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       }  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
156
 | 
90
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
608
 | 
     $handle = new IO::Uncompress::Gunzip ($handle,  | 
| 
157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 					  Prime => $magic, MultiStream => 0,  | 
| 
158
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 					  AutoClose => 1, Transparent => 0)  | 
| 
159
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       or die "IO::Uncompress::Gunzip: $GunzipError";  | 
| 
160
 | 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
137749
 | 
     $ob{compression} = 'IO::Uncompress::Gunzip';  | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else  | 
| 
162
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     { croak "WARC record header not found at offset $offset in $volume\n"  | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	." found [".join(' ', unpack '(H2)*', $magic)."] instead" }  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # read WARC version  | 
| 
166
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1982
 | 
   $protocol .= <$handle>;  | 
| 
167
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9810
 | 
   $protocol =~ s/[[:space:]]+$//;  | 
| 
168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  The WARC version read from the file is appended because an  | 
| 
169
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   uncompressed WARC record is recognized by the first six bytes of the  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #   WARC version tag, which were transferred to $protocol if found.  | 
| 
171
 | 
602
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2028
 | 
   $protocol =~ m/^WARC/  | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     or croak "WARC record header not found after decompression\n"  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       ." found [".join(' ', unpack '(H2)*', $protocol)."] instead";  | 
| 
174
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1339
 | 
   $ob{protocol} = $protocol;  | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
176
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2952
 | 
   $ob{fields} = parse WARC::Fields from => $handle;  | 
| 
177
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1652
 | 
   $ob{fields}->set_readonly;  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
179
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1372
 | 
   $ob{data_offset} = tell $handle;  | 
| 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
181
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7936
 | 
   close $handle;  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
183
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6681
 | 
   { our $_total_read;	$_total_read++ }  | 
| 
 
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
685
 | 
    | 
| 
 
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
909
 | 
    | 
| 
184
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
185
 | 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4480
 | 
   bless \%ob, $class;  | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
188
 | 
73
 | 
 
 | 
 
 | 
  
73
  
 | 
  
1
  
 | 
246
 | 
 sub protocol { (shift)->{protocol} }  | 
| 
189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
190
 | 
1456
 | 
 
 | 
 
 | 
  
1456
  
 | 
  
1
  
 | 
4177
 | 
 sub volume { (shift)->{volume} }  | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
192
 | 
841
 | 
 
 | 
 
 | 
  
841
  
 | 
  
1
  
 | 
7764
 | 
 sub offset { (shift)->{offset} }  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub logical {  | 
| 
195
 | 
162
 | 
 
 | 
 
 | 
  
162
  
 | 
  
1
  
 | 
315
 | 
   my $self = shift;  | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
197
 | 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
404
 | 
   my $segment_header_value = $self->field('WARC-Segment-Number');  | 
| 
198
 | 
162
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
464
 | 
   if (defined $self->{logical}) {  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
199
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return $self->{logical};	# cached object remains valid ==> return it  | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } elsif (defined $segment_header_value) {  | 
| 
201
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return _read WARC::Record::Logical $self;  | 
| 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } else {  | 
| 
203
 | 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
424
 | 
     return $self;		# no continuation records present  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
207
 | 
36
 | 
 
 | 
 
 | 
  
36
  
 | 
  
1
  
 | 
154
 | 
 sub segments { return shift }  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub next {  | 
| 
210
 | 
355
 | 
 
 | 
 
 | 
  
355
  
 | 
  
1
  
 | 
812
 | 
   my $self = shift;  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
212
 | 
355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
469
 | 
   my $next = undef;  | 
| 
213
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
214
 | 
355
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
1098
 | 
   if ($self->{sl_packed_size}) { # gzip "sl" extended header available  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
215
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
     my $handle = $self->volume->open;  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # seek to read 32-bit ISIZE field at end of gzip stream  | 
| 
218
 | 
12
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
35
 | 
     seek $handle, $self->offset + $self->{sl_packed_size} - 4, SEEK_SET  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       or die "seek: $!";  | 
| 
220
 | 
12
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
27
 | 
     my $isize; defined(read $handle, $isize, 4) or die "read: $!";  | 
| 
 
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
174
 | 
    | 
| 
221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
222
 | 
12
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
86
 | 
     if (length $isize > 0	# read off the end yields nothing  | 
| 
223
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	and $self->{sl_full_size} == unpack 'V', $isize) { # ... and looks valid  | 
| 
224
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
       $next = _read WARC::Record::FromVolume $self->volume, $handle;  | 
| 
225
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
122
 | 
       close $handle;  | 
| 
226
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
       return $next;  | 
| 
227
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
228
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
       carp "extended 'sl' header was found to be invalid\n"  | 
| 
229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	.'  in record at '.($self->offset).' in '.($self->volume);  | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   } elsif (not defined $self->{compression}) { # WARC record is not compressed  | 
| 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return _read WARC::Record::FromVolume $self->volume,  | 
| 
233
 | 
311
 | 
 
 | 
 
 | 
 
 | 
 
 | 
573
 | 
       $self->{data_offset} + $self->field('Content-Length') + 4;  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   }  | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # if we get here, we have to scan for the end of the record  | 
| 
237
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3335
 | 
   my $handle = $self->volume->open;  | 
| 
238
 | 
40
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
129
 | 
   seek $handle, $self->offset, SEEK_SET or die "seek: $!";  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my $zhandle = $self->{compression}->new  | 
| 
241
 | 
40
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
299
 | 
     ($handle, MultiStream => 0, AutoClose => 0)  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       or die "$self->{compression}: ".$self->_get_compression_error;  | 
| 
243
 | 
40
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
58663
 | 
   seek $zhandle, $self->{data_offset} + $self->field('Content-Length'), SEEK_SET  | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     or die "zseek: $! ".$self->_get_compression_error;  | 
| 
245
 | 
40
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3226
 | 
   my $end; defined(read $zhandle, $end, 4)  | 
| 
 
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
113
 | 
    | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     or die "zread: $! ".$self->_get_compression_error;  | 
| 
247
 | 
40
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
1536
 | 
   croak "end-of-record marker not found" unless $end eq (WARC::CRLF x 2);  | 
| 
248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # The main handle is somewhere *after* the actual end of the block  | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  because IO::Uncompress::Gunzip reads ahead.  We can get the contents  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   #  of that "read ahead" buffer and use that to adjust our final offset.  | 
| 
252
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
98
 | 
   $next = _read WARC::Record::FromVolume $self->volume,  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     (tell($handle) - length($zhandle->trailingData));  | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
255
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1069
 | 
   close $zhandle; close $handle;  | 
| 
 
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1100
 | 
    | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
257
 | 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
194
 | 
   return $next;  | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
260
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub open_block {  | 
| 
261
 | 
128
 | 
 
 | 
 
 | 
  
128
  
 | 
  
1
  
 | 
208
 | 
   my $self = shift;  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
263
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
329
 | 
   my $xhandle = Symbol::geniosym;  | 
| 
264
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3409
 | 
   tie *$xhandle, 'WARC::Record::Block', $self;  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
266
 | 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
498
 | 
   return $xhandle;  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
269
 | 
126
 | 
 
 | 
 
 | 
  
126
  
 | 
  
1
  
 | 
288
 | 
 sub open_continued { (shift)->logical->open_block }  | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub replay {  | 
| 
272
 | 
140
 | 
 
 | 
 
 | 
  
140
  
 | 
  
1
  
 | 
4169
 | 
   my $self = shift;  | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
274
 | 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
378
 | 
   my @handlers = WARC::Record::Replay::find_handlers($self);  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
276
 | 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
246
 | 
   my $result = undef;  | 
| 
277
 | 
140
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
642
 | 
   $result = (shift @handlers)->($self)  | 
| 
278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     while scalar @handlers && !defined $result;  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
280
 | 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1971
 | 
   return $result;  | 
| 
281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub open_payload {  | 
| 
284
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
 
 | 
   die "not yet implemented"  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
287
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
288
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |