line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WARC::Record::Logical; # -*- CPerl -*- |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
68341
|
use strict; |
|
2
|
|
|
|
|
11
|
|
|
2
|
|
|
|
|
60
|
|
4
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
93
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require WARC::Record::FromVolume; |
7
|
|
|
|
|
|
|
our @ISA = qw(WARC::Record::FromVolume); |
8
|
|
|
|
|
|
|
|
9
|
2
|
|
|
2
|
|
410
|
use WARC; *WARC::Record::Logical::VERSION = \$WARC::VERSION; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
80
|
|
10
|
|
|
|
|
|
|
|
11
|
2
|
|
|
2
|
|
11
|
use Carp; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
99
|
|
12
|
2
|
|
|
2
|
|
2408
|
use Math::BigInt; |
|
2
|
|
|
|
|
51654
|
|
|
2
|
|
|
|
|
9
|
|
13
|
2
|
|
|
2
|
|
47759
|
use Scalar::Util qw(); |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
38
|
|
14
|
2
|
|
|
2
|
|
12
|
use Symbol 'geniosym'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
169
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
require WARC::Fields; |
17
|
|
|
|
|
|
|
require WARC::Record::Logical::Block; |
18
|
|
|
|
|
|
|
require WARC::Record::Logical::Heuristics; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# inherit _set |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# inherit compareTo |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# This implementation uses a hash as the underlying structure. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# Keys inherited from WARC::Record base class (via WARC::Record::FromVolume): |
27
|
|
|
|
|
|
|
# |
28
|
|
|
|
|
|
|
# fields |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Keys inherited from WARC::Record::FromVolume base class: |
31
|
|
|
|
|
|
|
# |
32
|
|
|
|
|
|
|
# collection (optional) |
33
|
|
|
|
|
|
|
# Parent WARC::Collection object, if available |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Keys defined by this class: |
36
|
|
|
|
|
|
|
# |
37
|
|
|
|
|
|
|
# segments |
38
|
|
|
|
|
|
|
# Array, each element is array of: |
39
|
|
|
|
|
|
|
# SEG_REC: WARC::Record::FromVolume |
40
|
|
|
|
|
|
|
# SEG_BASE: integer, logical offset of first octet in segment block |
41
|
|
|
|
|
|
|
# SEG_LENGTH: integer, number of octets in segment data block |
42
|
|
|
|
|
|
|
|
43
|
2
|
|
|
2
|
|
12
|
use constant {SEG_REC => 0, SEG_LENGTH => 1, SEG_BASE => 2}; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
151
|
|
44
|
2
|
|
|
2
|
|
14
|
use constant SEGMENT_INDEX => qw/SEG_REC SEG_LENGTH SEG_BASE/; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
1811
|
|
45
|
|
|
|
|
|
|
|
46
|
6
|
|
|
6
|
|
842
|
sub DESTROY { our $_total_destroyed; $_total_destroyed++ } |
|
6
|
|
|
|
|
28
|
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
sub _dbg_dump { |
49
|
3
|
|
|
3
|
|
17
|
my $self = shift; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my $out = 'WARC logical record [' |
52
|
3
|
|
|
|
|
5
|
.(scalar @{$self->{segments}})." segments] containing:\n"; |
|
3
|
|
|
|
|
12
|
|
53
|
|
|
|
|
|
|
my @out = |
54
|
3
|
|
|
|
|
6
|
map {s/^/ /gm; $_} map {$_->[SEG_REC]->_dbg_dump} @{$self->{segments}}; |
|
20
|
|
|
|
|
125
|
|
|
20
|
|
|
|
|
42
|
|
|
20
|
|
|
|
|
53
|
|
|
3
|
|
|
|
|
8
|
|
55
|
3
|
|
|
|
|
25
|
$out .= join("\n", @out); |
56
|
|
|
|
|
|
|
|
57
|
3
|
|
|
|
|
18
|
return $out; |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# inherit new |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub _read { |
63
|
17
|
|
|
17
|
|
178
|
my $class = shift; |
64
|
17
|
|
|
|
|
25
|
my $member = shift; |
65
|
|
|
|
|
|
|
|
66
|
17
|
|
|
|
|
28
|
my %ob = (); |
67
|
17
|
100
|
|
|
|
79
|
$ob{collection} = $member->{collection} if defined $member->{collection}; |
68
|
|
|
|
|
|
|
|
69
|
17
|
|
|
|
|
52
|
my $member_segment_number = $member->field('WARC-Segment-Number'); |
70
|
17
|
100
|
|
|
|
277
|
croak "attempting to load logical record for non-segmented record" |
71
|
|
|
|
|
|
|
unless $member_segment_number; |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# find the first segment |
74
|
16
|
|
|
|
|
33
|
my $first_segment = undef; my @clues = (); |
|
16
|
|
|
|
|
30
|
|
75
|
|
|
|
|
|
|
SEGMENT: { |
76
|
16
|
100
|
|
|
|
21
|
if ($member_segment_number == 1) { |
|
16
|
|
|
|
|
42
|
|
77
|
8
|
|
|
|
|
13
|
$first_segment = $member; # <-- that was easy... |
78
|
|
|
|
|
|
|
} else { # ... less easy: go find the first segment... |
79
|
8
|
|
|
|
|
18
|
my $segment_origin_id = $member->field('WARC-Segment-Origin-ID'); |
80
|
8
|
100
|
|
|
|
157
|
croak "record segment lacks required 'WARC-Segment-Origin-ID' field" |
81
|
|
|
|
|
|
|
unless $segment_origin_id; |
82
|
7
|
100
|
100
|
|
|
32
|
if (defined $member->{collection} |
83
|
|
|
|
|
|
|
&& $member->{collection}->searchable('record_id')) { |
84
|
|
|
|
|
|
|
$first_segment = $member->{collection}->search |
85
|
3
|
|
|
|
|
29
|
(record_id => $segment_origin_id); |
86
|
3
|
100
|
|
|
|
15
|
next SEGMENT if defined $first_segment; |
87
|
1
|
|
|
|
|
109
|
carp "index failed to locate first segment by Record-ID"; |
88
|
|
|
|
|
|
|
# ... and onwards to heuristics ... |
89
|
|
|
|
|
|
|
} |
90
|
5
|
|
|
|
|
80
|
($first_segment, @clues) = |
91
|
|
|
|
|
|
|
WARC::Record::Logical::Heuristics::find_first_segment ($member); |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
} |
94
|
15
|
100
|
|
|
|
353
|
croak "failed to locate first segment of logical record" |
95
|
|
|
|
|
|
|
unless defined $first_segment; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# find the other segments |
98
|
12
|
|
|
|
|
20
|
my @pool = (); |
99
|
|
|
|
|
|
|
SEGMENT: { |
100
|
12
|
100
|
100
|
|
|
16
|
if (defined $member->{collection} |
|
12
|
|
|
|
|
47
|
|
101
|
|
|
|
|
|
|
&& $member->{collection}->searchable('segment_origin_id')) { |
102
|
|
|
|
|
|
|
@pool = $member->{collection}->search |
103
|
7
|
|
|
|
|
103
|
(segment_origin_id => $first_segment->id); |
104
|
20
|
|
|
|
|
40
|
@pool = map {$_->[0]} sort {$a->[1] <=> $b->[1]} |
|
43
|
|
|
|
|
121
|
|
105
|
7
|
|
|
|
|
19
|
map {[$_, $_->field('WARC-Segment-Number')]} @pool; |
|
20
|
|
|
|
|
51
|
|
106
|
|
|
|
|
|
|
last SEGMENT if # we have all of the segments |
107
|
|
|
|
|
|
|
(@pool |
108
|
7
|
100
|
100
|
|
|
31
|
&& ($pool[-1]->field('WARC-Segment-Number') == (1+@pool)) |
|
|
|
100
|
|
|
|
|
109
|
|
|
|
|
|
|
&& ($pool[-1]->field('WARC-Segment-Total-Length'))); |
110
|
4
|
|
|
|
|
470
|
carp "index failed to locate all segments by Origin-ID"; |
111
|
|
|
|
|
|
|
# ... and onwards to heuristics ... |
112
|
|
|
|
|
|
|
} |
113
|
9
|
|
|
|
|
287
|
push @pool, (WARC::Record::Logical::Heuristics::find_continuation |
114
|
|
|
|
|
|
|
($first_segment, @pool, @clues)); |
115
|
|
|
|
|
|
|
# sort again in case heuristics added more records |
116
|
13
|
|
|
|
|
31
|
@pool = map {$_->[0]} sort {$a->[1] <=> $b->[1]} |
|
11
|
|
|
|
|
33
|
|
117
|
9
|
|
|
|
|
47
|
map {[$_, $_->field('WARC-Segment-Number')]} @pool; |
|
13
|
|
|
|
|
32
|
|
118
|
|
|
|
|
|
|
} |
119
|
12
|
100
|
|
|
|
357
|
croak "failed to locate any continuation segments for logical record" |
120
|
|
|
|
|
|
|
unless scalar @pool > 0; |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# assemble logical record segments |
123
|
9
|
|
|
|
|
19
|
my @record = ($first_segment); |
124
|
|
|
|
|
|
|
{ |
125
|
9
|
|
|
|
|
17
|
my $i = 0; |
|
9
|
|
|
|
|
11
|
|
126
|
9
|
|
|
|
|
21
|
while ($i < @pool) { |
127
|
24
|
|
|
|
|
59
|
my $segment_number = $pool[$i]->field('WARC-Segment-Number'); |
128
|
24
|
|
|
|
|
52
|
push @record, $pool[$i]; |
129
|
24
|
|
100
|
|
|
72
|
$i++ # skip duplicate segments heuristics may have found |
130
|
|
|
|
|
|
|
while $i < @pool |
131
|
|
|
|
|
|
|
&& $segment_number == $pool[$i]->field('WARC-Segment-Number'); |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# verify logical record |
136
|
9
|
|
|
|
|
25
|
for (my $i = 0; $i < @record; $i++) { |
137
|
33
|
100
|
|
|
|
85
|
croak "logical record segment missing or out-of-place" |
138
|
|
|
|
|
|
|
unless $record[$i]->field('WARC-Segment-Number') == (1+$i); |
139
|
32
|
100
|
100
|
|
|
132
|
croak "logical record segment not part of record (corrupted index?)" |
140
|
|
|
|
|
|
|
unless $i == 0 |
141
|
|
|
|
|
|
|
|| $record[$i]->field('WARC-Segment-Origin-ID') eq $record[0]->id; |
142
|
|
|
|
|
|
|
} |
143
|
7
|
100
|
|
|
|
20
|
croak "final segment lacks required 'WARC-Segment-Total-Length' header" |
144
|
|
|
|
|
|
|
unless $record[-1]->field('WARC-Segment-Total-Length'); |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# assemble logical record header |
147
|
6
|
|
|
|
|
21
|
my $fields = $record[0]->fields->clone; |
148
|
|
|
|
|
|
|
{ |
149
|
|
|
|
|
|
|
# Set "Content-Length" to the total length |
150
|
6
|
|
|
|
|
10
|
$fields->field('Content-Length', |
|
6
|
|
|
|
|
19
|
|
151
|
|
|
|
|
|
|
$record[-1]->field('WARC-Segment-Total-Length')); |
152
|
|
|
|
|
|
|
# Transfer any other non-segment-related headers that appear on the |
153
|
|
|
|
|
|
|
# last segment and are not present at the first segment. |
154
|
6
|
|
|
|
|
9
|
foreach my $key (grep !m/^WARC-Segment-/, keys %{$record[-1]->fields}) { |
|
6
|
|
|
|
|
16
|
|
155
|
22
|
100
|
|
|
|
56
|
$fields->field($key, $record[-1]->field($key)) |
156
|
|
|
|
|
|
|
unless defined $fields->field($key); |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
# Delete the block digest header, since it is from a segment. |
159
|
6
|
|
|
|
|
28
|
$fields->field('WARC-Block-Digest' => []); |
160
|
|
|
|
|
|
|
# Delete all segment-related headers |
161
|
6
|
|
|
|
|
11
|
my %fields; tie %fields, ref $fields, $fields; |
|
6
|
|
|
|
|
22
|
|
162
|
6
|
|
|
|
|
20
|
my @segment_headers = grep m/^WARC-Segment/, keys %fields; |
163
|
6
|
|
|
|
|
28
|
$fields->field($_ => []) for @segment_headers; |
164
|
6
|
|
|
|
|
19
|
untie %fields; |
165
|
|
|
|
|
|
|
} |
166
|
6
|
|
|
|
|
20
|
$fields->set_readonly; |
167
|
6
|
|
|
|
|
12
|
$ob{fields} = $fields; |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# assemble logical record data |
170
|
6
|
|
|
|
|
10
|
my @segments = (); |
171
|
|
|
|
|
|
|
{ |
172
|
2
|
|
|
2
|
|
26
|
use integer; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
19
|
|
|
6
|
|
|
|
|
9
|
|
173
|
6
|
|
|
|
|
10
|
my $running_base = 0; |
174
|
6
|
|
|
|
|
16
|
for (my $i = 0; $i < @record; $i++) { |
175
|
27
|
|
|
|
|
2527
|
my @row = (); |
176
|
|
|
|
|
|
|
|
177
|
27
|
|
|
|
|
60
|
$row[SEG_REC] = $record[$i]; |
178
|
27
|
|
|
|
|
72
|
$row[SEG_LENGTH] = 0+$record[$i]->field('Content-Length'); |
179
|
|
|
|
|
|
|
|
180
|
27
|
100
|
100
|
|
|
108
|
$running_base = Math::BigInt->new($running_base) |
181
|
|
|
|
|
|
|
if ((not ref $running_base) |
182
|
|
|
|
|
|
|
&& (($running_base + $row[SEG_LENGTH]) < $running_base)); |
183
|
27
|
|
|
|
|
98
|
$row[SEG_BASE] = $running_base; |
184
|
|
|
|
|
|
|
|
185
|
27
|
|
|
|
|
55
|
$segments[$i] = \@row; |
186
|
27
|
|
|
|
|
73
|
$running_base += $row[SEG_LENGTH]; |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
} |
189
|
6
|
|
|
|
|
179
|
$ob{segments} = \@segments; |
190
|
|
|
|
|
|
|
|
191
|
6
|
|
|
|
|
9
|
{ our $_total_read; $_total_read++ } |
|
6
|
|
|
|
|
7
|
|
|
6
|
|
|
|
|
10
|
|
192
|
|
|
|
|
|
|
|
193
|
6
|
|
|
|
|
11
|
my $self = bless \%ob, $class; |
194
|
|
|
|
|
|
|
|
195
|
6
|
|
|
|
|
17
|
$_->[SEG_REC]->{logical} = $self for @{$self->{segments}}; |
|
6
|
|
|
|
|
85
|
|
196
|
6
|
|
|
|
|
13
|
Scalar::Util::weaken $_->[SEG_REC]->{logical} for @{$self->{segments}}; |
|
6
|
|
|
|
|
36
|
|
197
|
|
|
|
|
|
|
|
198
|
6
|
|
|
|
|
35
|
return $self; |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
2
|
|
|
2
|
1
|
9
|
sub protocol { (shift)->{segments}[0][SEG_REC]->protocol } |
202
|
20
|
|
|
20
|
1
|
1195
|
sub volume { (shift)->{segments}[0][SEG_REC]->volume } |
203
|
14
|
|
|
14
|
1
|
1222
|
sub offset { (shift)->{segments}[0][SEG_REC]->offset } |
204
|
|
|
|
|
|
|
|
205
|
2
|
|
|
2
|
1
|
10
|
sub logical { shift } |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
sub segments { |
208
|
13
|
100
|
|
13
|
1
|
50
|
if (wantarray) { |
209
|
10
|
|
|
|
|
15
|
return map {$_->[SEG_REC]} @{(shift)->{segments}} |
|
35
|
|
|
|
|
73
|
|
|
10
|
|
|
|
|
26
|
|
210
|
|
|
|
|
|
|
} else { |
211
|
3
|
|
|
|
|
6
|
return scalar @{(shift)->{segments}} |
|
3
|
|
|
|
|
19
|
|
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
2
|
|
|
2
|
1
|
1093
|
sub next { (shift)->{segments}[-1][SEG_REC]->next } |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub open_block { |
218
|
4
|
|
|
4
|
1
|
1147
|
my $self = shift; |
219
|
|
|
|
|
|
|
|
220
|
4
|
|
|
|
|
14
|
my $xhandle = Symbol::geniosym; |
221
|
4
|
|
|
|
|
126
|
tie *$xhandle, 'WARC::Record::Logical::Block', $self; |
222
|
|
|
|
|
|
|
|
223
|
4
|
|
|
|
|
41
|
return $xhandle; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
|
226
|
2
|
|
|
2
|
1
|
5
|
sub open_continued { (shift)->open_block } |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# inherit replay |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
# inherit open_payload |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
1; |
233
|
|
|
|
|
|
|
__END__ |