line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WARC::Fields; # -*- CPerl -*- |
2
|
|
|
|
|
|
|
|
3
|
26
|
|
|
26
|
|
70061
|
use strict; |
|
26
|
|
|
|
|
70
|
|
|
26
|
|
|
|
|
792
|
|
4
|
26
|
|
|
26
|
|
138
|
use warnings; |
|
26
|
|
|
|
|
55
|
|
|
26
|
|
|
|
|
681
|
|
5
|
|
|
|
|
|
|
|
6
|
26
|
|
|
26
|
|
132
|
use Carp; |
|
26
|
|
|
|
|
52
|
|
|
26
|
|
|
|
|
1379
|
|
7
|
26
|
|
|
26
|
|
14845
|
use Encode; |
|
26
|
|
|
|
|
250914
|
|
|
26
|
|
|
|
|
1822
|
|
8
|
26
|
|
|
26
|
|
191
|
use Scalar::Util; |
|
26
|
|
|
|
|
59
|
|
|
26
|
|
|
|
|
1342
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @ISA = qw(); |
11
|
|
|
|
|
|
|
|
12
|
26
|
|
|
26
|
|
964
|
use WARC; *WARC::Fields::VERSION = \$WARC::VERSION; |
|
26
|
|
|
|
|
58
|
|
|
26
|
|
|
|
|
1360
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
WARC::Fields - WARC record headers and application/warc-fields |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
require WARC::Fields; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
$f = new WARC::Fields; |
23
|
|
|
|
|
|
|
$f = $record->fields; # get WARC record headers |
24
|
|
|
|
|
|
|
$g = $f->clone; # make writable copy |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
$g->set_readonly; # make read-only |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
$f->field('WARC-Type' => 'metadata'); # set |
29
|
|
|
|
|
|
|
$value = $f->field('WARC-Type'); # get |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
$fields_text = $f->as_string; # get WARC header lines for display |
32
|
|
|
|
|
|
|
$fields_block = $f->as_block; # format for WARC file |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
tie @field_names, ref $f, $f; # bind ordered list of field names |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
tie %fields, ref $f, $f; # bind hash of field names => values |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
$entry = $f->[$num]; # tie an anonymous array and access it |
39
|
|
|
|
|
|
|
$value = $f->{$name}; # likewise with an anonymous tied hash |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
$name = "$entry"; # tied array returns objects |
42
|
|
|
|
|
|
|
$value = $entry->value; # one specific value |
43
|
|
|
|
|
|
|
$offset = $entry->offset; # N of M with same name |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
foreach (keys %{$f}) { ... } # iterate over names, in order |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
=cut |
48
|
|
|
|
|
|
|
|
49
|
26
|
|
|
26
|
|
2575
|
use overload '@{}' => \&_as_tied_array, '%{}' => \&_as_tied_hash; |
|
26
|
|
|
|
|
2035
|
|
|
26
|
|
|
|
|
266
|
|
50
|
26
|
|
|
26
|
|
2035
|
use overload fallback => 1; |
|
26
|
|
|
|
|
50
|
|
|
26
|
|
|
|
|
93
|
|
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# This implementation uses column-oriented storage, with an array as the |
53
|
|
|
|
|
|
|
# underlying object and constants to select array offsets. |
54
|
|
|
|
|
|
|
# |
55
|
|
|
|
|
|
|
# The NAMES and VALUES columns are always valid, but the MVOFF and INDEX |
56
|
|
|
|
|
|
|
# positions may be undefined and are lazily rebuilt when needed. |
57
|
|
|
|
|
|
|
|
58
|
26
|
|
|
|
|
2982
|
use constant { NAMES => 0, VALUES => 1, MVOFF => 2, |
59
|
26
|
|
|
26
|
|
1694
|
INDEX => 3, IS_RO => 4, C_TA => 5, C_TH => 6 }; |
|
26
|
|
|
|
|
53
|
|
60
|
26
|
|
|
26
|
|
174
|
use constant OBJECT_INDEX => qw/NAMES VALUES MVOFF INDEX IS_RO/; |
|
26
|
|
|
|
|
52
|
|
|
26
|
|
|
|
|
1668
|
|
61
|
26
|
|
|
26
|
|
149
|
use constant OBJECT_INIT => undef, undef, undef, undef, 0, undef, undef; |
|
26
|
|
|
|
|
54
|
|
|
26
|
|
|
|
|
82552
|
|
62
|
|
|
|
|
|
|
|
63
|
706
|
|
|
706
|
|
8032
|
sub DESTROY { my $ob = shift; |
64
|
706
|
100
|
|
|
|
1803
|
untie @{$$ob->[C_TA]} if defined $$ob->[C_TA]; |
|
3
|
|
|
|
|
22
|
|
65
|
706
|
100
|
|
|
|
1283
|
untie %{$$ob->[C_TH]} if defined $$ob->[C_TH]; |
|
61
|
|
|
|
|
188
|
|
66
|
706
|
|
|
|
|
891
|
our $_total_destroyed; $_total_destroyed++ } |
|
706
|
|
|
|
|
6997
|
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# NAMES: array of field names, exactly as written |
69
|
|
|
|
|
|
|
# VALUES: array of field values |
70
|
|
|
|
|
|
|
# MVOFF: array of offsets for multiple-valued fields |
71
|
|
|
|
|
|
|
# INDEX: hash of case-folded field names to array of row numbers |
72
|
|
|
|
|
|
|
# IS_RO: boolean: TRUE if this object is read-only |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# C_TA: cache: tied array for array dereference |
75
|
|
|
|
|
|
|
# C_TH: cache: tide hash for hash dereference |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub _rebuild_INDEX { |
78
|
30
|
|
|
30
|
|
45
|
my $self = shift; |
79
|
30
|
|
|
|
|
49
|
my %idx = (); |
80
|
|
|
|
|
|
|
|
81
|
30
|
|
|
|
|
50
|
for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) { |
|
113
|
|
|
|
|
243
|
|
82
|
83
|
|
|
|
|
99
|
push @{$idx{lc $$self->[NAMES][$i]}}, $i; |
|
83
|
|
|
|
|
1199
|
|
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
30
|
|
|
|
|
66
|
$$self->[INDEX] = \%idx; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub _update_INDEX { |
89
|
72
|
|
|
72
|
|
99
|
my $self = shift; # INDEX slot must be valid |
90
|
72
|
|
|
|
|
99
|
my $base = shift; # row number where an insertion or removal was made |
91
|
72
|
|
|
|
|
94
|
my $count = shift; # how many rows were inserted (+) or removed (-) |
92
|
|
|
|
|
|
|
|
93
|
72
|
|
|
|
|
109
|
my %done = (); |
94
|
|
|
|
|
|
|
|
95
|
72
|
100
|
|
|
|
169
|
for (my $i = ($base < 0) ? 0 : $base; $i < @{$$self->[NAMES]}; $i++) { |
|
186
|
|
|
|
|
440
|
|
96
|
114
|
|
|
|
|
194
|
my $key = lc $$self->[NAMES][$i]; |
97
|
114
|
100
|
|
|
|
221
|
next if $done{$key}; |
98
|
72
|
|
|
|
|
159
|
@{$$self->[INDEX]{$key}} = |
99
|
72
|
100
|
|
|
|
87
|
map { $_ + ($_ > $base ? $count : 0) } @{$$self->[INDEX]{$key}}; |
|
118
|
|
|
|
|
270
|
|
|
72
|
|
|
|
|
149
|
|
100
|
72
|
|
|
|
|
164
|
$done{$key}++; |
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub _rebuild_MVOFF { |
105
|
6
|
|
|
6
|
|
11
|
my $self = shift; |
106
|
|
|
|
|
|
|
|
107
|
6
|
|
|
|
|
12
|
my @mvoff = ();#(undef) x scalar @{$$self->[NAMES]}; |
108
|
6
|
|
|
|
|
11
|
my %cidx = (); # counted index; references to unique entries |
109
|
|
|
|
|
|
|
|
110
|
6
|
|
|
|
|
11
|
foreach my $name (@{$$self->[NAMES]}) { |
|
6
|
|
|
|
|
18
|
|
111
|
10028
|
|
|
|
|
12242
|
my $key = lc $name; |
112
|
10028
|
100
|
|
|
|
18019
|
if (not defined $cidx{$key}) { |
|
|
100
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# first time this key is seen |
114
|
26
|
|
|
|
|
54
|
push @mvoff, undef; |
115
|
26
|
|
|
|
|
58
|
$cidx{$key} = \$mvoff[$#mvoff]; |
116
|
|
|
|
|
|
|
} elsif (ref $cidx{$key} eq 'SCALAR') { |
117
|
|
|
|
|
|
|
# second time this key is seen |
118
|
4
|
|
|
|
|
9
|
push @mvoff, 1; |
119
|
4
|
|
|
|
|
7
|
${$cidx{$key}} = 0; # replace undefined value |
|
4
|
|
|
|
|
10
|
|
120
|
4
|
|
|
|
|
9
|
$cidx{$key} = 2; # prepare counter |
121
|
|
|
|
|
|
|
} else { |
122
|
|
|
|
|
|
|
# third or later time this key is seen |
123
|
9998
|
|
|
|
|
13964
|
push @mvoff, $cidx{$key}++; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
6
|
|
|
|
|
31
|
$$self->[MVOFF] = \@mvoff; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
sub _dbg_dump { |
131
|
51
|
|
|
51
|
|
5351
|
my $self = shift; |
132
|
|
|
|
|
|
|
|
133
|
51
|
|
|
|
|
82
|
my @mvoff = (' ') x @{$$self->[NAMES]}; |
|
51
|
|
|
|
|
2499
|
|
134
|
20016
|
100
|
|
|
|
29784
|
@mvoff = map { defined $_ ? $_ : 'U' } |
135
|
51
|
100
|
|
|
|
174
|
@{$$self->[MVOFF]}[0 .. $#{$$self->[NAMES]}] |
|
5
|
|
|
|
|
206
|
|
|
5
|
|
|
|
|
280
|
|
136
|
|
|
|
|
|
|
if defined $$self->[MVOFF]; |
137
|
51
|
|
|
|
|
511
|
my @widths = map {length} qw/ROW MVO NAME/; |
|
153
|
|
|
|
|
279
|
|
138
|
|
|
|
|
|
|
|
139
|
51
|
|
|
|
|
77
|
foreach my $row (0 .. $#{$$self->[NAMES]}) { |
|
51
|
|
|
|
|
174
|
|
140
|
20233
|
100
|
|
|
|
30761
|
$widths[0] = length $row if length $row > $widths[0]; |
141
|
20233
|
100
|
|
|
|
33392
|
$widths[1] = length $mvoff[$row] if length $mvoff[$row] > $widths[1]; |
142
|
20233
|
100
|
|
|
|
34604
|
$widths[2] = length $$self->[NAMES][$row] |
143
|
|
|
|
|
|
|
if length $$self->[NAMES][$row] > $widths[2]; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
51
|
|
|
|
|
266
|
my $out = sprintf ' %4$*1$s %5$*2$s %6$*3$s %7$s', @widths, |
147
|
|
|
|
|
|
|
qw/ ROW MVO NAME VALUE /; |
148
|
51
|
|
|
|
|
182
|
$out .= "\n".('=' x length $out)."\n"; |
149
|
|
|
|
|
|
|
$out .= join "\n", map |
150
|
20233
|
100
|
|
|
|
83612
|
{ sprintf ' %4$*1$d %5$*2$s %6$*3$s%7$1s %8$s', @widths, |
|
|
100
|
|
|
|
|
|
151
|
|
|
|
|
|
|
($_, $mvoff[$_], $$self->[NAMES][$_], |
152
|
|
|
|
|
|
|
(defined $$self->[VALUES][$_] ? ':' : ' '), |
153
|
|
|
|
|
|
|
(defined $$self->[VALUES][$_] ? $$self->[VALUES][$_] : '*deleted*')) } |
154
|
51
|
|
|
|
|
95
|
0 .. $#{$$self->[NAMES]}; |
|
51
|
|
|
|
|
345
|
|
155
|
|
|
|
|
|
|
|
156
|
51
|
|
|
|
|
2384
|
return $out; |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# From RFC2616: |
160
|
|
|
|
|
|
|
# CTL =
|
161
|
|
|
|
|
|
|
# (octets 0 - 31) and DEL (127)> |
162
|
|
|
|
|
|
|
# LWS = [CRLF] 1*( SP | HT ) |
163
|
|
|
|
|
|
|
# separators = "(" | ")" | "<" | ">" | "@" |
164
|
|
|
|
|
|
|
# | "," | ";" | ":" | "\" | <"> |
165
|
|
|
|
|
|
|
# | "/" | "[" | "]" | "?" | "=" |
166
|
|
|
|
|
|
|
# | "{" | "}" | SP | HT |
167
|
|
|
|
|
|
|
my $PARSE_RE__LWS = qr/(?:\015\012)?[ \t]+/; |
168
|
|
|
|
|
|
|
my $PARSE_RE__separator = qr[[][)(><}{@,;:/"\\?=[:space:]]]; |
169
|
|
|
|
|
|
|
my $PARSE_RE__not_separator = qr[[^][)(><}{@,;:/"\\?=[:space:]]]; |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# From WARC specification: |
172
|
|
|
|
|
|
|
# field-name = token |
173
|
|
|
|
|
|
|
# token = 1*
|
174
|
|
|
|
|
|
|
# except CTLs or separators> |
175
|
|
|
|
|
|
|
my $PARSE_RE__token = qr/[!#$%'*+-.0-9A-Z^_`a-z|~]+/; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
=head1 DESCRIPTION |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
The C class encapsulates information in the |
180
|
|
|
|
|
|
|
"application/warc-fields" format used for WARC record headers. This is a |
181
|
|
|
|
|
|
|
simple key-value format closely analogous to HTTP headers, however |
182
|
|
|
|
|
|
|
differences are significant enough that the C class cannot |
183
|
|
|
|
|
|
|
be reliably reused for WARC fields. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
Instances of this class are usually created as member variables of the |
186
|
|
|
|
|
|
|
C class, but can also be returned as the content of WARC |
187
|
|
|
|
|
|
|
records with Content-Type "application/warc-fields". |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Instances of C retrieved from WARC files are read-only and |
190
|
|
|
|
|
|
|
will croak() if any attempt is made to change their contents. |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
This class strives to faithfully represent the contents of a WARC file, |
193
|
|
|
|
|
|
|
while providing a simple interface to answer simple questions. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 Multiple Values |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Most WARC headers may only appear once and with a single value in valid |
198
|
|
|
|
|
|
|
WARC records, with the notable exception of the WARC-Concurrent-To header. |
199
|
|
|
|
|
|
|
C neither attempts to enforce nor relies upon this |
200
|
|
|
|
|
|
|
constraint. Headers that appear multiple times are considered to have |
201
|
|
|
|
|
|
|
multiple values. When iterating a tied hash, all values of a recurring |
202
|
|
|
|
|
|
|
header are collected and returned with the B occurrence of its key. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
Multiple values are returned from the C method and tied hash |
205
|
|
|
|
|
|
|
interface as array references, and are set by passing in an array |
206
|
|
|
|
|
|
|
reference. Existing rows are reused where possible when updating a field |
207
|
|
|
|
|
|
|
with multiple values. If the new array reference contains fewer items |
208
|
|
|
|
|
|
|
(including the special case of replacing multiple values with a single |
209
|
|
|
|
|
|
|
value) excess rows are deleted. If the new array reference requires |
210
|
|
|
|
|
|
|
additional rows to be inserted, they are inserted immediately after the |
211
|
|
|
|
|
|
|
last existing row for a field, with the same name case as that row. |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
Precise control of the layout is available using the tied array interface, |
214
|
|
|
|
|
|
|
but the ordering of the header rows is not constrained in the WARC |
215
|
|
|
|
|
|
|
specification. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head2 Field Name Mangling |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
As with C, the '_' character is converted to '-' in field |
220
|
|
|
|
|
|
|
names unless the first character of the name is ':', which cannot itself |
221
|
|
|
|
|
|
|
appear in a field name. Unlike C, the leading ':' is |
222
|
|
|
|
|
|
|
stripped off immediately and the name stored otherwise exactly as given. |
223
|
|
|
|
|
|
|
The C method and tied hash interface allow this convenience feature. |
224
|
|
|
|
|
|
|
The field names exposed via the tied array interface are reported |
225
|
|
|
|
|
|
|
B as they appear in the WARC file. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
Strictly, "X-Crazy-Header" and "X_Crazy_Header" are two B |
228
|
|
|
|
|
|
|
headers that the above convenience mechanism conflates. The solution is |
229
|
|
|
|
|
|
|
simple: if (and only if) a header field B with the B |
230
|
|
|
|
|
|
|
name given, it is used, otherwise C occurs and the name is |
231
|
|
|
|
|
|
|
rechecked for another exact match. If no match is found, case is folded |
232
|
|
|
|
|
|
|
and a third check performed. If a match is found, the existing header is |
233
|
|
|
|
|
|
|
updated, otherwise a new header is created with character case as given. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
The WARC specification specifically states that field names are |
236
|
|
|
|
|
|
|
case-insensitive, accordingly, "X-Crazy-Header" and "X-CRAZY-HeAdEr" are |
237
|
|
|
|
|
|
|
considered the same header for the C method and tied hash interface. |
238
|
|
|
|
|
|
|
They will appear exactly as given in the tied array interface, however. |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=cut |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# This function handles two different canonicalizations: |
243
|
|
|
|
|
|
|
# (1) case folding as required by the WARC specification |
244
|
|
|
|
|
|
|
# (2) convenience translation s/_/-/g, |
245
|
|
|
|
|
|
|
# (2a) suppressed if m/^:/, which is removed |
246
|
|
|
|
|
|
|
# (2b) overridden by an exact match |
247
|
|
|
|
|
|
|
# To make this work: |
248
|
|
|
|
|
|
|
# --- all keys in INDEX are case-folded |
249
|
|
|
|
|
|
|
# --- all keys in NAMES preserve case |
250
|
|
|
|
|
|
|
# --- existing keys are case-folded by this function |
251
|
|
|
|
|
|
|
# --- new keys translate s/_/-/g but preserve case |
252
|
|
|
|
|
|
|
sub _find_key { |
253
|
8444
|
|
|
8444
|
|
10542
|
my $self = shift; # INDEX slot must be valid |
254
|
8444
|
|
|
|
|
10231
|
my $k = shift; |
255
|
8444
|
|
|
|
|
9798
|
my $key; ($key = $k) =~ s/^://; |
|
8444
|
|
|
|
|
13290
|
|
256
|
8444
|
|
|
|
|
10549
|
my $pad = $key; |
257
|
8444
|
|
|
|
|
12086
|
my $is_quoted = ($k =~ m/^:/); |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
# exact case-folded match? |
260
|
8444
|
100
|
|
|
|
24258
|
return lc $key if defined $$self->[INDEX]{lc $key}; |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
# case-folded match after s/_/-/g? |
263
|
3806
|
|
|
|
|
7365
|
$pad =~ s/_/-/g; |
264
|
3806
|
100
|
100
|
|
|
10725
|
return lc $pad if defined $$self->[INDEX]{lc $pad} && !$is_quoted; |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# not found ==> a new key will be made |
267
|
2817
|
100
|
|
|
|
5814
|
return $is_quoted ? $key : $pad; |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# called only if there is no or one current value |
271
|
|
|
|
|
|
|
sub _set_single_value { |
272
|
12
|
|
|
12
|
|
20
|
my $self = shift; # INDEX slot must be valid |
273
|
12
|
|
|
|
|
21
|
my $key = shift; # as returned from _find_key |
274
|
12
|
|
|
|
|
18
|
my $value = shift; |
275
|
|
|
|
|
|
|
|
276
|
12
|
100
|
|
|
|
221
|
croak "attempt to modify read-only object" if $$self->[IS_RO]; |
277
|
|
|
|
|
|
|
|
278
|
11
|
100
|
|
|
|
32
|
unless (defined $$self->[INDEX]{lc $key}) { |
279
|
|
|
|
|
|
|
# insert new key |
280
|
6
|
|
|
|
|
10
|
push @{$$self->[NAMES]}, $key; # preserve original key |
|
6
|
|
|
|
|
16
|
|
281
|
6
|
|
|
|
|
13
|
$key = lc $key; # fold key case |
282
|
6
|
|
|
|
|
10
|
push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]}; |
|
6
|
|
|
|
|
16
|
|
|
6
|
|
|
|
|
15
|
|
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
|
285
|
11
|
|
|
|
|
34
|
$$self->[VALUES][$$self->[INDEX]{$key}[0]] = $value; |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
sub _key_multiple_value_p { |
289
|
4584
|
|
|
4584
|
|
5768
|
my $self = shift; # INDEX slot must be valid |
290
|
4584
|
|
|
|
|
5675
|
my $key = shift; # as returned from _find_key |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
# For this to be true, the key must already exist, which means that |
293
|
|
|
|
|
|
|
# _find_key has case-folded it already. |
294
|
|
|
|
|
|
|
return (defined $$self->[INDEX]{$key} |
295
|
4584
|
|
100
|
|
|
9876
|
&& 1 < scalar @{$$self->[INDEX]{$key}}) |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
# called in all cases where multiple values are involved |
299
|
|
|
|
|
|
|
sub _set_multiple_value { |
300
|
56
|
|
|
56
|
|
87
|
my $self = shift; # INDEX slot must be valid |
301
|
56
|
|
|
|
|
83
|
my $key = shift; # as returned from _find_key |
302
|
56
|
|
|
|
|
104
|
my $value_aref = shift; |
303
|
|
|
|
|
|
|
|
304
|
56
|
100
|
|
|
|
353
|
croak "attempt to modify read-only object" if $$self->[IS_RO]; |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
my $cur_count = (defined $$self->[INDEX]{$key} |
307
|
54
|
|
100
|
|
|
140
|
&& scalar @{$$self->[INDEX]{$key}}); |
308
|
54
|
|
|
|
|
101
|
my $new_count = scalar @$value_aref; |
309
|
|
|
|
|
|
|
|
310
|
54
|
100
|
|
|
|
103
|
unless ($cur_count) { |
311
|
|
|
|
|
|
|
# insert new key |
312
|
16
|
|
|
|
|
27
|
push @{$$self->[NAMES]}, $key; # preserve original key |
|
16
|
|
|
|
|
44
|
|
313
|
16
|
|
|
|
|
34
|
push @{$$self->[VALUES]}, undef; # prepare slot |
|
16
|
|
|
|
|
37
|
|
314
|
16
|
|
|
|
|
32
|
$key = lc $key; # fold key case |
315
|
16
|
|
|
|
|
23
|
push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]}; |
|
16
|
|
|
|
|
46
|
|
|
16
|
|
|
|
|
43
|
|
316
|
16
|
|
|
|
|
29
|
$cur_count = 1; # account for the added slot |
317
|
|
|
|
|
|
|
} |
318
|
|
|
|
|
|
|
# $key is always case-folded at this point |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# adjust table to accommodate new number of values |
321
|
54
|
100
|
|
|
|
131
|
if ($cur_count > $new_count) { |
|
|
100
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# remove extra rows |
323
|
27
|
|
|
|
|
34
|
foreach my $extra_row (reverse sort |
324
|
27
|
|
|
|
|
124
|
splice @{$$self->[INDEX]{$key}}, $new_count) { |
325
|
40
|
|
|
|
|
63
|
splice @{$$self->[NAMES]}, $extra_row, 1; |
|
40
|
|
|
|
|
69
|
|
326
|
40
|
|
|
|
|
64
|
splice @{$$self->[VALUES]}, $extra_row, 1; |
|
40
|
|
|
|
|
64
|
|
327
|
40
|
|
|
|
|
83
|
_update_INDEX($self, $extra_row, -1); |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
# special case: removing a field entirely |
330
|
27
|
100
|
|
|
|
72
|
if ($new_count == 0) { |
331
|
|
|
|
|
|
|
# This is here to catch a hypothetical bug before data is corrupted. |
332
|
|
|
|
|
|
|
die "stray INDEX entries left after removing field" |
333
|
|
|
|
|
|
|
# uncoverable branch true |
334
|
19
|
50
|
|
|
|
24
|
unless scalar @{$$self->[INDEX]{$key}} == 0; |
|
19
|
|
|
|
|
61
|
|
335
|
19
|
|
|
|
|
51
|
delete $$self->[INDEX]{$key}; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
} elsif ($cur_count < $new_count) { |
338
|
|
|
|
|
|
|
# add more rows |
339
|
15
|
|
|
|
|
32
|
my $last_row = $$self->[INDEX]{$key}[-1]; |
340
|
15
|
|
|
|
|
26
|
my $new_rows = $new_count - $cur_count; |
341
|
15
|
|
|
|
|
36
|
_update_INDEX($self, $last_row, $new_rows); |
342
|
15
|
|
|
|
|
22
|
splice @{$$self->[NAMES]}, 1+$last_row, 0, |
|
15
|
|
|
|
|
2414
|
|
343
|
|
|
|
|
|
|
(($$self->[NAMES][$last_row]) x $new_rows); |
344
|
15
|
|
|
|
|
25
|
splice @{$$self->[VALUES]}, 1+$last_row, 0, ((undef) x $new_rows); |
|
15
|
|
|
|
|
580
|
|
345
|
15
|
|
|
|
|
24
|
push @{$$self->[INDEX]{$key}}, 1+$last_row .. $last_row+$new_rows; |
|
15
|
|
|
|
|
1275
|
|
346
|
|
|
|
|
|
|
} # otherwise, $cur_count == $new_count |
347
|
54
|
100
|
|
|
|
298
|
$$self->[MVOFF] = undef unless $cur_count == $new_count; |
348
|
|
|
|
|
|
|
# there are always $new_count rows with $key at this point |
349
|
|
|
|
|
|
|
|
350
|
54
|
|
|
|
|
128
|
for (my $i = 0; $i < $new_count; $i++) |
351
|
9968
|
|
|
|
|
22195
|
{ $$self->[VALUES][$$self->[INDEX]{$key}[$i]] = $value_aref->[$i] } |
352
|
|
|
|
|
|
|
} |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=head2 Methods |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=over |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
=item $f = WARC::Fields-Enew |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
Construct a new C object. Initial contents can be passed as |
361
|
|
|
|
|
|
|
key-value pairs to this constructor and will be added in the given order. |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
Repeating a key or supplying an array reference as a value assigns multiple |
364
|
|
|
|
|
|
|
values to a key. To reduce the risk of confusion, only quoting with a |
365
|
|
|
|
|
|
|
leading ':' overrides the convenience feature of applying C when |
366
|
|
|
|
|
|
|
constructing a C object. The exact match rules used when |
367
|
|
|
|
|
|
|
setting values on an existing object do not apply here. |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Field names given when constructing a WARC::Fields object are otherwise |
370
|
|
|
|
|
|
|
stored exactly as given, with case preserved, even when other names that |
371
|
|
|
|
|
|
|
fold to the same string have been given earlier in the argument list. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=cut |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
sub new { |
376
|
220
|
|
|
220
|
1
|
15411
|
my $class = shift; |
377
|
220
|
|
|
|
|
505
|
my $ob = [OBJECT_INIT]; |
378
|
220
|
|
|
|
|
351
|
my $k; my $v; |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# explicitly initialize NAMES and VALUES to allow as_string and as_block |
381
|
|
|
|
|
|
|
# methods to be called on empty objects |
382
|
220
|
|
|
|
|
331
|
$ob->[NAMES] = []; |
383
|
220
|
|
|
|
|
361
|
$ob->[VALUES] = []; |
384
|
|
|
|
|
|
|
|
385
|
220
|
|
|
|
|
633
|
while (($k, $v) = splice @_, 0, 2) { |
386
|
792
|
100
|
|
|
|
1742
|
croak "key without value" unless defined $v; |
387
|
|
|
|
|
|
|
|
388
|
791
|
100
|
|
|
|
1422
|
if ($k =~ m/^:/) { $k =~ s/^:// } else { $k =~ s/_/-/g } |
|
7
|
|
|
|
|
19
|
|
|
784
|
|
|
|
|
1845
|
|
389
|
|
|
|
|
|
|
|
390
|
791
|
100
|
|
|
|
2346
|
croak "reference to field with no name" unless $k =~ m/./; |
391
|
789
|
100
|
|
|
|
2238
|
croak "reference to invalid field name" if $k !~ m/^$PARSE_RE__token$/o; |
392
|
|
|
|
|
|
|
|
393
|
787
|
100
|
|
|
|
1309
|
if (ref $v eq 'ARRAY') { |
394
|
3
|
|
|
|
|
9
|
foreach my $value (@$v) { |
395
|
8
|
|
|
|
|
24
|
push @{$ob->[NAMES]}, $k; |
|
8
|
|
|
|
|
19
|
|
396
|
8
|
|
|
|
|
14
|
push @{$ob->[VALUES]}, $value; |
|
8
|
|
|
|
|
13
|
|
397
|
8
|
|
|
|
|
12
|
push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]}; |
|
8
|
|
|
|
|
21
|
|
|
8
|
|
|
|
|
25
|
|
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
} else { |
400
|
784
|
|
|
|
|
914
|
push @{$ob->[NAMES]}, $k; |
|
784
|
|
|
|
|
1484
|
|
401
|
784
|
|
|
|
|
1065
|
push @{$ob->[VALUES]}, $v; |
|
784
|
|
|
|
|
1295
|
|
402
|
784
|
|
|
|
|
998
|
push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]}; |
|
784
|
|
|
|
|
1737
|
|
|
784
|
|
|
|
|
2666
|
|
403
|
|
|
|
|
|
|
} |
404
|
|
|
|
|
|
|
} |
405
|
|
|
|
|
|
|
|
406
|
215
|
|
|
|
|
302
|
{our $_total_newly_constructed; $_total_newly_constructed++} |
|
215
|
|
|
|
|
272
|
|
|
215
|
|
|
|
|
278
|
|
407
|
215
|
|
|
|
|
1035
|
bless \ $ob, $class; |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=item $f-Eclone |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
Copy a C object. A copy of a read-only object is writable. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=cut |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub clone { |
417
|
8
|
|
|
8
|
1
|
14
|
my $self = shift; |
418
|
8
|
|
|
|
|
22
|
my $new = [OBJECT_INIT]; |
419
|
|
|
|
|
|
|
|
420
|
8
|
|
|
|
|
14
|
$new->[NAMES] = [@{$$self->[NAMES]}]; |
|
8
|
|
|
|
|
663
|
|
421
|
8
|
|
|
|
|
18
|
$new->[VALUES] = [@{$$self->[VALUES]}]; |
|
8
|
|
|
|
|
1382
|
|
422
|
8
|
100
|
|
|
|
38
|
$new->[MVOFF] = [@{$$self->[MVOFF]}] if defined $$self->[MVOFF]; |
|
1
|
|
|
|
|
283
|
|
423
|
30
|
|
|
|
|
43
|
$new->[INDEX] = {map {$_ => [@{$$self->[INDEX]{$_}}]} |
|
30
|
|
|
|
|
88
|
|
424
|
8
|
100
|
|
|
|
40
|
keys %{$$self->[INDEX]}} if defined $$self->[INDEX]; |
|
7
|
|
|
|
|
31
|
|
425
|
|
|
|
|
|
|
|
426
|
8
|
|
|
|
|
18
|
{our $_total_newly_cloned; $_total_newly_cloned++} |
|
8
|
|
|
|
|
12
|
|
|
8
|
|
|
|
|
16
|
|
427
|
8
|
|
|
|
|
33
|
bless \ $new, ref $self; |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=item $f-Efield( $name ) |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=item $f-Efield( $name =E $value ) |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=item $f-Efield( $n1 =E $v1, $n2 =E $v2, ... ) |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Get or set the value of one or more fields. The field name is not case |
437
|
|
|
|
|
|
|
sensitive, but C will preserve its case if a new entry is |
438
|
|
|
|
|
|
|
created. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
Setting a field to C effectively deletes that field, although it |
441
|
|
|
|
|
|
|
remains visible in the tied array interface and will retain its position if |
442
|
|
|
|
|
|
|
a new value is assigned. Setting a field to an empty array reference |
443
|
|
|
|
|
|
|
removes that field entirely. |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=cut |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
sub field { |
448
|
7410
|
|
|
7410
|
1
|
37151
|
my $self = shift; |
449
|
|
|
|
|
|
|
|
450
|
7410
|
100
|
|
|
|
15093
|
_rebuild_INDEX($self) unless defined $$self->[INDEX]; |
451
|
|
|
|
|
|
|
|
452
|
7410
|
|
|
|
|
10029
|
my $k; my $v; my $have_value_arg = scalar @_ > 1; |
|
7410
|
|
|
|
|
10746
|
|
453
|
7410
|
|
|
|
|
16789
|
while (($k, $v) = splice @_, 0, 2) { |
454
|
7410
|
|
|
|
|
11837
|
my $key = $self->_find_key($k); |
455
|
|
|
|
|
|
|
|
456
|
7410
|
100
|
|
|
|
20125
|
croak "reference to field with no name" unless $key =~ m/./; |
457
|
7408
|
100
|
|
|
|
18389
|
croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o; |
458
|
|
|
|
|
|
|
|
459
|
7405
|
100
|
|
|
|
11997
|
if (not $have_value_arg) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# get a value |
461
|
7343
|
100
|
|
|
|
17634
|
return undef unless defined $$self->[INDEX]{$key}; |
462
|
4570
|
100
|
|
|
|
7597
|
return $$self->[VALUES][$$self->[INDEX]{$key}[0]] |
463
|
|
|
|
|
|
|
unless $self->_key_multiple_value_p($key); |
464
|
78
|
|
|
|
|
303
|
return [grep {defined $_} |
465
|
25
|
|
|
|
|
48
|
map {$$self->[VALUES][$_]} @{$$self->[INDEX]{$key}}]; |
|
78
|
|
|
|
|
163
|
|
|
25
|
|
|
|
|
59
|
|
466
|
|
|
|
|
|
|
} # otherwise set a value |
467
|
|
|
|
|
|
|
elsif (UNIVERSAL::isa($v, 'ARRAY')) |
468
|
48
|
|
|
|
|
107
|
{ $self->_set_multiple_value($key, $v) } |
469
|
|
|
|
|
|
|
elsif ($self->_key_multiple_value_p($key)) |
470
|
|
|
|
|
|
|
# has multiple values, but now only setting a single value |
471
|
2
|
|
|
|
|
12
|
{ $self->_set_multiple_value($key, [$v]) } |
472
|
|
|
|
|
|
|
else |
473
|
12
|
|
|
|
|
31
|
{ $self->_set_single_value($key, $v) } |
474
|
59
|
|
|
|
|
198
|
$have_value_arg = scalar @_ > 1; |
475
|
|
|
|
|
|
|
} |
476
|
59
|
|
|
|
|
157
|
return (); # return nothing |
477
|
|
|
|
|
|
|
# Note that setting one or more fields and then getting a field is |
478
|
|
|
|
|
|
|
# possible as a side-effect of this organization, but is explicitly NOT |
479
|
|
|
|
|
|
|
# supported. That trick is NOT part of the stable API. |
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
=item $f = WARC::Fields-Eparse( $text ) |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=item $f = WARC::Fields-Eparse( from =E $fh ) |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=item $f = parse WARC::Fields from =E $fh |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
Construct a new C object, reading initial contents from the |
489
|
|
|
|
|
|
|
provided text string or filehandle. |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
The C method throws an exception if it encounters input that it does |
492
|
|
|
|
|
|
|
not understand. |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
If the C method encounters a field name with a leading ':', which |
495
|
|
|
|
|
|
|
implies an empty name and is not allowed, the leading ':' is silently |
496
|
|
|
|
|
|
|
dropped from the line and parsing retried. If the line is not valid after |
497
|
|
|
|
|
|
|
this change, the C method throws an exception. This feature is in |
498
|
|
|
|
|
|
|
keeping with the general principle of "be liberal in what you accept" and |
499
|
|
|
|
|
|
|
is a preemptive workaround for a predicted bug in other implementations. |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=cut |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
sub parse { |
504
|
608
|
|
|
608
|
1
|
1941
|
my $class = shift; |
505
|
608
|
|
|
|
|
755
|
my $text = shift; |
506
|
608
|
|
|
|
|
746
|
my $rd; |
507
|
|
|
|
|
|
|
|
508
|
608
|
100
|
|
|
|
1096
|
if ($text eq 'from') { |
509
|
603
|
|
|
|
|
758
|
$rd = shift; |
510
|
|
|
|
|
|
|
} else { |
511
|
|
|
|
|
|
|
# This fails iff perl was built without PerlIO, which is non-default. |
512
|
|
|
|
|
|
|
# uncoverable branch true |
513
|
5
|
50
|
|
|
|
70
|
open $rd, '<', \$text or die "failure opening stream on variable: $!"; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
608
|
|
|
|
|
987
|
my @names = (); |
517
|
608
|
|
|
|
|
726
|
my @values = (); |
518
|
608
|
|
|
|
|
812
|
my %idx = (); |
519
|
608
|
|
|
|
|
728
|
my $at_end = 0; |
520
|
|
|
|
|
|
|
|
521
|
608
|
|
|
|
|
1291
|
local *_; |
522
|
608
|
|
|
|
|
1836
|
while (<$rd>) { |
523
|
4570
|
|
|
|
|
50239
|
s/[\015\012]+$//; |
524
|
4570
|
100
|
|
|
|
14330
|
if (m/^:?($PARSE_RE__token):\s+(.*)$/o) # $1 -- name $2 -- value |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
525
|
3962
|
|
|
|
|
8688
|
{ push @names, $1; push @values, $2; push @{$idx{lc $1}}, $#names } |
|
3962
|
|
|
|
|
6379
|
|
|
3962
|
|
|
|
|
4238
|
|
|
3962
|
|
|
|
|
15477
|
|
526
|
|
|
|
|
|
|
elsif (m/^\s+(\S.*)$/) # $1 -- continued value |
527
|
1
|
|
|
|
|
7
|
{ $values[$#values] .= ' '.$1 } |
528
|
605
|
|
|
|
|
785
|
elsif (m/^$/) { $at_end = 1; last } |
|
605
|
|
|
|
|
967
|
|
529
|
2
|
|
|
|
|
380
|
else { croak "unrecognized input: $_" } |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
|
532
|
606
|
100
|
|
|
|
1359
|
carp "end-of-input before end marker" unless $at_end; |
533
|
|
|
|
|
|
|
|
534
|
606
|
|
|
|
|
1012
|
@values = map {Encode::decode_utf8($_)} @values; |
|
3962
|
|
|
|
|
48119
|
|
535
|
|
|
|
|
|
|
|
536
|
606
|
|
|
|
|
9139
|
my $ob = [OBJECT_INIT]; |
537
|
606
|
|
|
|
|
1019
|
$ob->[NAMES] = \@names; |
538
|
606
|
|
|
|
|
763
|
$ob->[VALUES] = \@values; |
539
|
606
|
|
|
|
|
790
|
$ob->[INDEX] = \%idx; |
540
|
|
|
|
|
|
|
|
541
|
606
|
|
|
|
|
759
|
{our $_total_newly_parsed; $_total_newly_parsed++} |
|
606
|
|
|
|
|
679
|
|
|
606
|
|
|
|
|
740
|
|
542
|
606
|
|
|
|
|
2279
|
bless \ $ob, $class; |
543
|
|
|
|
|
|
|
} |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=item $f-Eas_block |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=item $f-Eas_string |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
Return the contents as a formatted WARC header or application/warc-fields |
550
|
|
|
|
|
|
|
block. The C method uses network line endings and UTF-8 as |
551
|
|
|
|
|
|
|
specified for the WARC format, while the C method uses the local |
552
|
|
|
|
|
|
|
line endings and does not perform encoding. |
553
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
=cut |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
sub _as_text { |
557
|
48
|
|
|
48
|
|
84
|
my $self = shift; |
558
|
48
|
|
|
|
|
66
|
my $newline = shift; |
559
|
48
|
|
|
|
|
68
|
my $out = ''; |
560
|
|
|
|
|
|
|
|
561
|
48
|
|
|
|
|
79
|
for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) { |
|
258
|
|
|
|
|
643
|
|
562
|
210
|
100
|
|
|
|
396
|
next unless defined $$self->[VALUES][$i]; |
563
|
203
|
|
|
|
|
475
|
$out .= $$self->[NAMES][$i] . ': ' . $$self->[VALUES][$i] . $newline; |
564
|
|
|
|
|
|
|
} |
565
|
|
|
|
|
|
|
|
566
|
48
|
|
|
|
|
196
|
return $out; |
567
|
|
|
|
|
|
|
} |
568
|
|
|
|
|
|
|
|
569
|
1
|
|
|
1
|
1
|
4
|
sub as_block { Encode::encode('UTF-8', _as_text(shift, WARC::CRLF)) } |
570
|
47
|
|
|
47
|
1
|
5029
|
sub as_string { _as_text(shift, "\n") } |
571
|
|
|
|
|
|
|
|
572
|
|
|
|
|
|
|
=item $f-Eset_readonly |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
Mark a C object read-only. All methods that modify the |
575
|
|
|
|
|
|
|
object will croak() if called on a read-only object. |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
=cut |
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
sub set_readonly { |
580
|
732
|
|
|
732
|
1
|
1196
|
my $self = shift; |
581
|
|
|
|
|
|
|
|
582
|
732
|
|
|
|
|
1825
|
$$self->[IS_RO] = 1; |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
=back |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=head2 Tied Array Access |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
The order of fields can be fully controlled by tying an array to a |
590
|
|
|
|
|
|
|
C object and manipulating the array using ordinary Perl |
591
|
|
|
|
|
|
|
operations. The C and C functions are likely to be useful |
592
|
|
|
|
|
|
|
for reordering array elements if desired. |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
C will croak() if an attempt is made to set a field name with |
595
|
|
|
|
|
|
|
a leading ':' using the tied array interface. |
596
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
=cut |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
sub TIEARRAY { |
600
|
4
|
|
|
4
|
|
27
|
my $class = shift; |
601
|
4
|
|
|
|
|
7
|
my $ob = shift; |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
# This method must ignore the given class to allow the "empty subclass" |
604
|
|
|
|
|
|
|
# test to pass. If a subclass really wants, an override for TIEARRAY |
605
|
|
|
|
|
|
|
# itself can call SUPER::TIEARRAY and re-bless the returned reference |
606
|
|
|
|
|
|
|
# into the desired class. |
607
|
4
|
|
|
|
|
7
|
$WARC::Fields::TiedArray::_total_tied++; |
608
|
4
|
|
|
|
|
26
|
bless \ $ob, 'WARC::Fields::TiedArray'; |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
{ |
612
|
|
|
|
|
|
|
package WARC::Fields::TiedArray::Entry; |
613
|
|
|
|
|
|
|
|
614
|
26
|
|
|
26
|
|
243
|
use Carp; |
|
26
|
|
|
|
|
127
|
|
|
26
|
|
|
|
|
2339
|
|
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
BEGIN { $WARC::Fields::TiedArray::Entry::{$_} = $WARC::Fields::{$_} |
617
|
26
|
|
|
26
|
|
1199
|
for WARC::Fields::OBJECT_INDEX; } |
618
|
|
|
|
|
|
|
|
619
|
26
|
|
|
26
|
|
175
|
use constant { NAME => 0, VALUE => 1, TABLE => 2, ROW => 3 }; |
|
26
|
|
|
|
|
64
|
|
|
26
|
|
|
|
|
2458
|
|
620
|
|
|
|
|
|
|
|
621
|
26
|
|
|
26
|
|
179
|
use overload '""' => 'name', fallback => 1; |
|
26
|
|
|
|
|
58
|
|
|
26
|
|
|
|
|
154
|
|
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
=pod |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
The tied array interface accepts simple string values but returns objects |
626
|
|
|
|
|
|
|
with additional information. The returned object has an overloaded string |
627
|
|
|
|
|
|
|
conversion that yields the name for that entry but additionally has |
628
|
|
|
|
|
|
|
C and C methods. |
629
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
An entry object is bound to a slot in its parent C object, |
631
|
|
|
|
|
|
|
but will be copied if it is assigned to another slot in the same or another |
632
|
|
|
|
|
|
|
C object. |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
Due to complex aliasing rules necessary for array slice assignment to work |
635
|
|
|
|
|
|
|
for permuting rows in the table, entry objects must be short-lived. |
636
|
|
|
|
|
|
|
Storing the object read from a tied array and attempting to use it after |
637
|
|
|
|
|
|
|
modifying its parent C object produces unspecified results. |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=over |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
=item $entry = $array[$n] |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
=item $entry = $f-E[$n] |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
The tied array C method returns a "entry object" instead of the name |
646
|
|
|
|
|
|
|
itself. |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
=cut |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
sub _new { |
651
|
223
|
|
|
223
|
|
304
|
my $class = shift; |
652
|
223
|
|
|
|
|
278
|
my $table = shift; |
653
|
223
|
|
|
|
|
350
|
my $row = shift; |
654
|
|
|
|
|
|
|
|
655
|
223
|
|
|
|
|
1010
|
bless [$$table->[NAMES][$row], $$table->[VALUES][$row], |
656
|
|
|
|
|
|
|
$table, $row], $class; |
657
|
|
|
|
|
|
|
} |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
=item $name = "$entry" |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
=item $name = $entry-Ename |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
=item $name = "$f-E[$n]" |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
=item $name = $f-E[$n]-Ename |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
The C method on a entry object returns the field name. |
668
|
|
|
|
|
|
|
String conversion is overloaded to call this method. |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=cut |
671
|
|
|
|
|
|
|
|
672
|
62
|
|
|
62
|
|
9060
|
sub name { (shift)->[NAME] } |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
=item $value = $entry-Evalue |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=item $value = $array[$n]-Evalue |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
=item $value = $f-E[$n]-Evalue |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
=item $entry-Evalue( $new_value ) |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
=item $array[$n]-Evalue( $new_value ) |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
=item $f-E[$n]-Evalue( $new_value ) |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
The C method on a entry object returns the field value for this |
687
|
|
|
|
|
|
|
particular entry. Only a single scalar is returned, even if multiple |
688
|
|
|
|
|
|
|
entries share the same name. |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
If given an argument, the C method replaces the value for this |
691
|
|
|
|
|
|
|
particular entry. The argument will be coerced to a string. |
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
=cut |
694
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
sub value { |
696
|
154
|
|
|
154
|
|
245
|
my $self = shift; |
697
|
|
|
|
|
|
|
|
698
|
154
|
100
|
|
|
|
278
|
if (scalar @_ == 0) { # get this value |
699
|
34
|
|
|
|
|
154
|
return $self->[VALUE]; |
700
|
|
|
|
|
|
|
} else { # update this value |
701
|
120
|
100
|
|
|
|
137
|
croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO]; |
|
120
|
|
|
|
|
374
|
|
702
|
119
|
|
|
|
|
174
|
my $newval = shift; |
703
|
119
|
|
|
|
|
193
|
${$self->[TABLE]}->[VALUES]->[$self->[ROW]] = $self->[VALUE] = "$newval"; |
|
119
|
|
|
|
|
214
|
|
704
|
119
|
|
|
|
|
292
|
return (); # and return nothing |
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
} |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=item $offset = $entry-Eoffset |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=item $offset = $array[$n]-Eoffset |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
=item $offset = $f-E[$n]-Eoffset |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
The C method on a entry object returns the position of this entry |
715
|
|
|
|
|
|
|
amongst multiple entries with the same field name. These positions are |
716
|
|
|
|
|
|
|
numbered from zero and are identical to the positions in the array |
717
|
|
|
|
|
|
|
reference returned for this entry's field name from the C method or |
718
|
|
|
|
|
|
|
the tied hash interface. |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
=cut |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
sub offset { |
723
|
21
|
|
|
21
|
|
42
|
my $self = shift; |
724
|
21
|
100
|
|
|
|
28
|
$self->[TABLE]->_rebuild_MVOFF unless defined ${$self->[TABLE]}->[MVOFF]; |
|
21
|
|
|
|
|
80
|
|
725
|
21
|
|
|
|
|
32
|
return ${$self->[TABLE]}->[MVOFF]->[$self->[ROW]]; |
|
21
|
|
|
|
|
113
|
|
726
|
|
|
|
|
|
|
} |
727
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
=back |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=cut |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
} |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
{ |
735
|
|
|
|
|
|
|
package WARC::Fields::TiedArray::LooseEntry; |
736
|
|
|
|
|
|
|
|
737
|
26
|
|
|
26
|
|
9672
|
use Carp; |
|
26
|
|
|
|
|
68
|
|
|
26
|
|
|
|
|
2184
|
|
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
BEGIN { $WARC::Fields::TiedArray::LooseEntry::{$_} = $WARC::Fields::{$_} |
740
|
26
|
|
|
26
|
|
1018
|
for WARC::Fields::OBJECT_INDEX; } |
741
|
|
|
|
|
|
|
|
742
|
26
|
|
|
26
|
|
189
|
use constant { NAME => 0, VALUE => 1 }; |
|
26
|
|
|
|
|
63
|
|
|
26
|
|
|
|
|
2089
|
|
743
|
|
|
|
|
|
|
|
744
|
26
|
|
|
26
|
|
4934
|
BEGIN { our @ISA = qw(WARC::Fields::TiedArray::Entry) } |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
# This is a special type of "entry object" that is not associated with a |
747
|
|
|
|
|
|
|
# table row, returned from POP, SHIFT, and SPLICE when needed. |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
sub _new { |
750
|
24
|
|
|
24
|
|
401
|
my $class = shift; |
751
|
24
|
|
|
|
|
31
|
my $name = shift; |
752
|
24
|
|
|
|
|
34
|
my $value = shift; |
753
|
|
|
|
|
|
|
|
754
|
24
|
|
|
|
|
77
|
bless [$name, $value], $class; |
755
|
|
|
|
|
|
|
} |
756
|
|
|
|
|
|
|
|
757
|
18
|
|
|
18
|
|
832
|
sub name { return (shift)->[NAME] } |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
sub value { |
760
|
17
|
|
|
17
|
|
27
|
my $self = shift; |
761
|
|
|
|
|
|
|
|
762
|
17
|
100
|
|
|
|
34
|
if (scalar @_ == 0) # get |
763
|
16
|
|
|
|
|
38
|
{ return $self->[VALUE] } |
764
|
|
|
|
|
|
|
else # set |
765
|
1
|
|
|
|
|
119
|
{ croak "Loose array entries are read-only." } |
766
|
|
|
|
|
|
|
} |
767
|
|
|
|
|
|
|
|
768
|
1
|
|
|
1
|
|
5
|
sub offset { return undef } |
769
|
|
|
|
|
|
|
} |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
{ |
772
|
|
|
|
|
|
|
package WARC::Fields::TiedArray; |
773
|
|
|
|
|
|
|
|
774
|
26
|
|
|
26
|
|
199
|
use Carp; |
|
26
|
|
|
|
|
59
|
|
|
26
|
|
|
|
|
2192
|
|
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
BEGIN { $WARC::Fields::TiedArray::{$_} = $WARC::Fields::{$_} |
777
|
26
|
|
|
26
|
|
43650
|
for WARC::Fields::OBJECT_INDEX; } |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
# The underlying object is a reference to a WARC::Fields object. |
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
sub FETCH { |
782
|
223
|
|
|
223
|
|
8843
|
my $self = shift; |
783
|
223
|
|
|
|
|
268
|
my $row = shift; |
784
|
223
|
|
|
|
|
645
|
return (ref($self).'::Entry')->_new($$self, $row); |
785
|
|
|
|
|
|
|
} |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
sub STORE { |
788
|
13
|
|
|
13
|
|
420
|
my $self = shift; |
789
|
13
|
|
|
|
|
19
|
my $row = shift; |
790
|
13
|
|
|
|
|
21
|
my $name = shift; |
791
|
|
|
|
|
|
|
|
792
|
13
|
100
|
|
|
|
140
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
793
|
|
|
|
|
|
|
|
794
|
12
|
100
|
|
|
|
15
|
$self->STORESIZE($row + 1) if $#{$$$self->[NAMES]} < $row; |
|
12
|
|
|
|
|
37
|
|
795
|
|
|
|
|
|
|
|
796
|
12
|
100
|
|
|
|
64
|
if (UNIVERSAL::isa($name, ref($self).'::Entry')) { |
797
|
|
|
|
|
|
|
# copy entry |
798
|
8
|
100
|
|
|
|
21
|
croak "attempt to set invalid name" |
799
|
|
|
|
|
|
|
if $name->name !~ m/^$PARSE_RE__token$/o; |
800
|
7
|
|
|
|
|
18
|
$$$self->[NAMES]->[$row] = $name->name; |
801
|
7
|
|
|
|
|
11
|
$$$self->[VALUES]->[$row] = $name->value; |
802
|
|
|
|
|
|
|
} else { |
803
|
|
|
|
|
|
|
# set name |
804
|
4
|
100
|
|
|
|
369
|
croak "attempt to set invalid name" |
805
|
|
|
|
|
|
|
if "$name" !~ m/^$PARSE_RE__token$/o; |
806
|
2
|
|
|
|
|
10
|
$$$self->[NAMES]->[$row] = "$name"; |
807
|
|
|
|
|
|
|
} |
808
|
9
|
|
|
|
|
14
|
$$$self->[MVOFF] = undef; |
809
|
9
|
|
|
|
|
37
|
$$$self->[INDEX] = undef; |
810
|
|
|
|
|
|
|
} |
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
sub FETCHSIZE { |
813
|
38
|
|
|
38
|
|
4786
|
my $self = shift; |
814
|
38
|
|
|
|
|
50
|
return scalar @{$$$self->[NAMES]}; |
|
38
|
|
|
|
|
299
|
|
815
|
|
|
|
|
|
|
} |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
sub STORESIZE { |
818
|
5
|
|
|
5
|
|
10
|
my $self = shift; |
819
|
5
|
|
|
|
|
9
|
my $count = shift; |
820
|
|
|
|
|
|
|
|
821
|
5
|
100
|
100
|
|
|
18
|
croak "attempt to modify read-only object" |
822
|
|
|
|
|
|
|
if $$$self->[IS_RO] && $count != $self->FETCHSIZE(); |
823
|
|
|
|
|
|
|
|
824
|
4
|
100
|
|
|
|
12
|
if ($count > $self->FETCHSIZE()) { |
|
|
100
|
|
|
|
|
|
825
|
2
|
|
|
|
|
7
|
my $needed = $count - $self->FETCHSIZE(); |
826
|
2
|
|
|
|
|
5
|
push @{$$$self->[NAMES]}, ('X-Undefined-Field-Name') x $needed; |
|
2
|
|
|
|
|
8
|
|
827
|
2
|
|
|
|
|
3
|
push @{$$$self->[VALUES]}, (undef) x $needed; |
|
2
|
|
|
|
|
6
|
|
828
|
|
|
|
|
|
|
} elsif ($count < $self->FETCHSIZE()) { |
829
|
1
|
|
|
|
|
2
|
splice @{$$$self->[NAMES]}, $count; |
|
1
|
|
|
|
|
4
|
|
830
|
1
|
|
|
|
|
2
|
splice @{$$$self->[VALUES]}, $count; |
|
1
|
|
|
|
|
4
|
|
831
|
1
|
|
|
|
|
6
|
$$$self->[INDEX] = undef; |
832
|
1
|
|
|
|
|
4
|
} else { return } # no actual change |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
0
|
|
|
sub EXTEND { |
836
|
|
|
|
|
|
|
# do nothing |
837
|
|
|
|
|
|
|
} |
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
sub EXISTS { |
840
|
14
|
|
|
14
|
|
2278
|
my $self = shift; |
841
|
14
|
|
|
|
|
19
|
my $row = shift; |
842
|
14
|
|
|
|
|
90
|
return defined $$$self->[VALUES]->[$row]; |
843
|
|
|
|
|
|
|
} |
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
sub DELETE { |
846
|
2
|
|
|
2
|
|
6
|
my $self = shift; |
847
|
2
|
|
|
|
|
3
|
my $row = shift; |
848
|
|
|
|
|
|
|
|
849
|
2
|
100
|
|
|
|
152
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
850
|
|
|
|
|
|
|
|
851
|
1
|
|
|
|
|
3
|
my $old_value = $$$self->[VALUES]->[$row]; |
852
|
1
|
|
|
|
|
3
|
$$$self->[VALUES]->[$row] = undef; |
853
|
1
|
|
|
|
|
4
|
$$$self->[MVOFF] = undef; |
854
|
1
|
|
|
|
|
5
|
$$$self->[INDEX] = undef; |
855
|
1
|
|
|
|
|
5
|
return $old_value; |
856
|
|
|
|
|
|
|
} |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
sub CLEAR { |
859
|
3
|
|
|
3
|
|
10
|
my $self = shift; |
860
|
|
|
|
|
|
|
|
861
|
3
|
100
|
|
|
|
115
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
862
|
|
|
|
|
|
|
|
863
|
2
|
|
|
|
|
7
|
$$$self->[NAMES] = []; |
864
|
2
|
|
|
|
|
7
|
$$$self->[VALUES] = []; |
865
|
2
|
|
|
|
|
7
|
$$$self->[MVOFF] = undef; |
866
|
2
|
|
|
|
|
6
|
$$$self->[INDEX] = undef; |
867
|
2
|
|
|
|
|
5
|
return undef; |
868
|
|
|
|
|
|
|
} |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
sub PUSH { |
871
|
7
|
|
|
7
|
|
781
|
my $self = shift; |
872
|
|
|
|
|
|
|
|
873
|
7
|
100
|
100
|
|
|
137
|
croak "attempt to modify read-only object" |
874
|
|
|
|
|
|
|
if $$$self->[IS_RO] && scalar @_; |
875
|
|
|
|
|
|
|
|
876
|
6
|
|
|
|
|
15
|
foreach my $item (@_) { |
877
|
103
|
|
|
|
|
137
|
my $name; my $value; |
878
|
103
|
100
|
|
|
|
336
|
if (UNIVERSAL::isa($item, ref($self).'::Entry')) |
879
|
2
|
|
|
|
|
8
|
{ $name = $item->name; $value = $item->value } |
|
2
|
|
|
|
|
8
|
|
880
|
|
|
|
|
|
|
else |
881
|
101
|
|
|
|
|
150
|
{ $name = "$item"; $value = undef } |
|
101
|
|
|
|
|
128
|
|
882
|
103
|
100
|
|
|
|
562
|
croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o; |
883
|
101
|
|
|
|
|
137
|
push @{$$$self->[NAMES]}, $name; |
|
101
|
|
|
|
|
591
|
|
884
|
101
|
|
|
|
|
137
|
push @{$$$self->[VALUES]}, $value; |
|
101
|
|
|
|
|
294
|
|
885
|
|
|
|
|
|
|
} |
886
|
4
|
|
|
|
|
9
|
$$$self->[MVOFF] = undef; |
887
|
4
|
|
|
|
|
120
|
$$$self->[INDEX] = undef; |
888
|
4
|
|
|
|
|
8
|
return scalar @{$$$self->[NAMES]}; |
|
4
|
|
|
|
|
17
|
|
889
|
|
|
|
|
|
|
} |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
sub POP { |
892
|
5
|
|
|
5
|
|
11
|
my $self = shift; |
893
|
|
|
|
|
|
|
|
894
|
5
|
100
|
|
|
|
126
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
my $ret = WARC::Fields::TiedArray::LooseEntry->_new |
897
|
4
|
|
|
|
|
6
|
(pop @{$$$self->[NAMES]}, pop @{$$$self->[VALUES]}); |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
13
|
|
898
|
4
|
100
|
|
|
|
12
|
pop @{$$$self->[MVOFF]} if defined $$$self->[MVOFF]; |
|
2
|
|
|
|
|
5
|
|
899
|
4
|
|
|
|
|
10
|
$$$self->[INDEX] = undef; |
900
|
|
|
|
|
|
|
|
901
|
4
|
|
|
|
|
14
|
return $ret; |
902
|
|
|
|
|
|
|
} |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
sub SHIFT { |
905
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
906
|
|
|
|
|
|
|
|
907
|
2
|
100
|
|
|
|
115
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
my $ret = WARC::Fields::TiedArray::LooseEntry->_new |
910
|
1
|
|
|
|
|
3
|
(shift @{$$$self->[NAMES]}, shift @{$$$self->[VALUES]}); |
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
4
|
|
911
|
1
|
|
|
|
|
4
|
$$$self->[MVOFF] = undef; |
912
|
1
|
|
|
|
|
2
|
$$$self->[INDEX] = undef; |
913
|
|
|
|
|
|
|
|
914
|
1
|
|
|
|
|
4
|
return $ret; |
915
|
|
|
|
|
|
|
} |
916
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
sub UNSHIFT { |
918
|
6
|
|
|
6
|
|
722
|
my $self = shift; |
919
|
|
|
|
|
|
|
|
920
|
6
|
100
|
100
|
|
|
129
|
croak "attempt to modify read-only object" |
921
|
|
|
|
|
|
|
if $$$self->[IS_RO] && scalar @_; |
922
|
|
|
|
|
|
|
|
923
|
5
|
|
|
|
|
13
|
foreach my $item (@_) { |
924
|
4
|
|
|
|
|
6
|
my $name; my $value; |
925
|
4
|
100
|
|
|
|
24
|
if (UNIVERSAL::isa($item, ref($self).'::Entry')) |
926
|
2
|
|
|
|
|
6
|
{ $name = $item->name; $value = $item->value } |
|
2
|
|
|
|
|
6
|
|
927
|
|
|
|
|
|
|
else |
928
|
2
|
|
|
|
|
4
|
{ $name = "$item"; $value = undef } |
|
2
|
|
|
|
|
4
|
|
929
|
4
|
100
|
|
|
|
269
|
croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o; |
930
|
2
|
|
|
|
|
4
|
unshift @{$$$self->[NAMES]}, $name; |
|
2
|
|
|
|
|
7
|
|
931
|
2
|
|
|
|
|
4
|
unshift @{$$$self->[VALUES]}, $value; |
|
2
|
|
|
|
|
8
|
|
932
|
|
|
|
|
|
|
} |
933
|
3
|
|
|
|
|
8
|
$$$self->[MVOFF] = undef; |
934
|
3
|
|
|
|
|
7
|
$$$self->[INDEX] = undef; |
935
|
3
|
|
|
|
|
3
|
return scalar @{$$$self->[NAMES]}; |
|
3
|
|
|
|
|
9
|
|
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
sub SPLICE { |
939
|
10
|
|
|
10
|
|
773
|
my $self = shift; |
940
|
10
|
|
|
|
|
17
|
my $offset = shift; |
941
|
10
|
|
|
|
|
15
|
my $length = shift; |
942
|
|
|
|
|
|
|
|
943
|
10
|
100
|
|
|
|
27
|
$offset = 0 unless defined $offset; |
944
|
10
|
100
|
|
|
|
27
|
$length = $self->FETCHSIZE() - $offset unless defined $length; |
945
|
|
|
|
|
|
|
|
946
|
10
|
100
|
100
|
|
|
38
|
return () unless ($length != 0 || scalar @_); |
947
|
|
|
|
|
|
|
|
948
|
9
|
100
|
|
|
|
270
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
949
|
|
|
|
|
|
|
|
950
|
7
|
|
|
|
|
15
|
my @new_names = (); my @new_values = (); my @old_names; my @old_values; |
|
7
|
|
|
|
|
9
|
|
|
7
|
|
|
|
|
15
|
|
951
|
|
|
|
|
|
|
|
952
|
7
|
|
|
|
|
14
|
foreach my $item (@_) { |
953
|
17
|
100
|
|
|
|
61
|
if (UNIVERSAL::isa($item, ref($self).'::Entry')) { |
954
|
11
|
|
|
|
|
20
|
push @new_names, $item->name; |
955
|
11
|
|
|
|
|
19
|
push @new_values, $item->value; |
956
|
|
|
|
|
|
|
} else { |
957
|
6
|
|
|
|
|
13
|
push @new_names, "$item"; |
958
|
6
|
|
|
|
|
17
|
push @new_values, undef; |
959
|
|
|
|
|
|
|
} |
960
|
|
|
|
|
|
|
} |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
croak "attempt to set invalid name" |
963
|
7
|
100
|
|
|
|
12
|
if grep { $_ !~ m/^$PARSE_RE__token$/o } @new_names; |
|
17
|
|
|
|
|
318
|
|
964
|
|
|
|
|
|
|
|
965
|
5
|
|
|
|
|
8
|
@old_names = splice @{$$$self->[NAMES]}, $offset, $length, @new_names; |
|
5
|
|
|
|
|
20
|
|
966
|
5
|
|
|
|
|
10
|
@old_values = splice @{$$$self->[VALUES]}, $offset, $length, @new_values; |
|
5
|
|
|
|
|
15
|
|
967
|
|
|
|
|
|
|
|
968
|
5
|
|
|
|
|
11
|
my @ret = (); |
969
|
|
|
|
|
|
|
|
970
|
5
|
|
|
|
|
14
|
for (my $i = 0; $i < scalar @old_names; $i++) |
971
|
18
|
|
|
|
|
37
|
{ push @ret, WARC::Fields::TiedArray::LooseEntry->_new |
972
|
|
|
|
|
|
|
($old_names[$i], $old_values[$i]) } |
973
|
|
|
|
|
|
|
|
974
|
5
|
|
|
|
|
12
|
$$$self->[MVOFF] = undef; |
975
|
5
|
|
|
|
|
12
|
$$$self->[INDEX] = undef; |
976
|
5
|
|
|
|
|
26
|
return @ret; |
977
|
|
|
|
|
|
|
} |
978
|
|
|
|
|
|
|
|
979
|
4
|
|
|
4
|
|
10
|
sub UNTIE { our $_total_untied; $_total_untied++ } |
|
4
|
|
|
|
|
18
|
|
980
|
|
|
|
|
|
|
|
981
|
4
|
|
|
4
|
|
6
|
sub DESTROY { our $_total_destroyed; $_total_destroyed++ } |
|
4
|
|
|
|
|
17
|
|
982
|
|
|
|
|
|
|
} |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
=head2 Tied Hash Access |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
The contents of a C object can be easily examined by tying a |
987
|
|
|
|
|
|
|
hash to the object. Reading or setting a hash key is equivalent to the |
988
|
|
|
|
|
|
|
C method, but the tied hash will iterate keys and values in the |
989
|
|
|
|
|
|
|
order in which each key B appears in the internal table. |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
Like the tied array interface, the tied hash interface returns magical |
992
|
|
|
|
|
|
|
objects that internally refer back to the parent C object. |
993
|
|
|
|
|
|
|
These objects remain valid if the underlying C object is |
994
|
|
|
|
|
|
|
changed, but further use may produce surprising and unspecified results. |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
The use of magical objects enables the values in a tied hash to B |
997
|
|
|
|
|
|
|
be arrays, even for keys that do not exist (the array will have zero |
998
|
|
|
|
|
|
|
elements) or that have only one value (the array will have a string |
999
|
|
|
|
|
|
|
conversion that produces that one value). This allows a tied hash to |
1000
|
|
|
|
|
|
|
support autovivification of an array value just as Perl's own hashes do. |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
=cut |
1003
|
|
|
|
|
|
|
|
1004
|
|
|
|
|
|
|
sub TIEHASH { |
1005
|
68
|
|
|
68
|
|
122
|
my $class = shift; |
1006
|
68
|
|
|
|
|
88
|
my $ob = shift; |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
# This method must ignore the given class to allow the "empty subclass" |
1009
|
|
|
|
|
|
|
# test to pass. If a subclass really wants, an override for TIEHASH |
1010
|
|
|
|
|
|
|
# itself can call SUPER::TIEHASH and re-bless the returned reference |
1011
|
|
|
|
|
|
|
# into the desired class. |
1012
|
68
|
|
|
|
|
103
|
$WARC::Fields::TiedHash::_total_tied++; |
1013
|
68
|
|
|
|
|
359
|
bless \ $ob, 'WARC::Fields::TiedHash'; |
1014
|
|
|
|
|
|
|
} |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
{ |
1017
|
|
|
|
|
|
|
package WARC::Fields::TiedHash::ValueArray; |
1018
|
|
|
|
|
|
|
|
1019
|
26
|
|
|
26
|
|
276
|
use Carp; |
|
26
|
|
|
|
|
55
|
|
|
26
|
|
|
|
|
2202
|
|
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
BEGIN { $WARC::Fields::TiedHash::ValueArray::{$_} = $WARC::Fields::{$_} |
1022
|
26
|
|
|
26
|
|
1046
|
for WARC::Fields::OBJECT_INDEX; } |
1023
|
|
|
|
|
|
|
|
1024
|
26
|
|
|
26
|
|
194
|
use constant { TABLE => 0, KEY => 1, KEYc => 2 }; # KEYc -- canonical KEY |
|
26
|
|
|
|
|
69
|
|
|
26
|
|
|
|
|
55144
|
|
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
sub TIEARRAY { |
1027
|
458
|
|
|
458
|
|
671
|
my $class = shift; |
1028
|
458
|
|
|
|
|
626
|
my $table = shift; |
1029
|
458
|
100
|
|
|
|
996
|
$table->_rebuild_INDEX unless defined $$table->[INDEX]; |
1030
|
458
|
|
|
|
|
869
|
my $key = $table->_find_key(shift); # needs INDEX |
1031
|
|
|
|
|
|
|
|
1032
|
458
|
|
|
|
|
741
|
{ our $_total_tied; $_total_tied++ } |
|
458
|
|
|
|
|
552
|
|
|
458
|
|
|
|
|
619
|
|
1033
|
458
|
|
|
|
|
1569
|
bless [$table, $key, lc $key], $class; |
1034
|
|
|
|
|
|
|
} |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
sub FETCH { |
1037
|
768
|
|
|
768
|
|
1672
|
my $self = shift; |
1038
|
768
|
|
|
|
|
936
|
my $offset = shift; |
1039
|
|
|
|
|
|
|
|
1040
|
768
|
100
|
|
|
|
908
|
$self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX]; |
|
768
|
|
|
|
|
1669
|
|
1041
|
|
|
|
|
|
|
|
1042
|
768
|
|
|
|
|
990
|
my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset]; |
|
768
|
|
|
|
|
1442
|
|
1043
|
768
|
100
|
|
|
|
1279
|
return defined $row ? ${$self->[TABLE]}->[VALUES][$row] : undef; |
|
752
|
|
|
|
|
3449
|
|
1044
|
|
|
|
|
|
|
} |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
sub STORE { |
1047
|
10
|
|
|
10
|
|
543
|
my $self = shift; |
1048
|
10
|
|
|
|
|
14
|
my $offset = shift; |
1049
|
10
|
|
|
|
|
17
|
my $value = shift; |
1050
|
|
|
|
|
|
|
|
1051
|
10
|
|
|
|
|
13
|
my $T = $self->[TABLE]; |
1052
|
|
|
|
|
|
|
|
1053
|
10
|
100
|
|
|
|
154
|
croak "attempt to modify read-only object" if $$T->[IS_RO]; |
1054
|
|
|
|
|
|
|
|
1055
|
9
|
100
|
|
|
|
21
|
$T->_rebuild_INDEX unless defined $$T->[INDEX]; |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
$self->STORESIZE($offset + 1) |
1058
|
|
|
|
|
|
|
if not defined $$T->[INDEX]{$self->[KEYc]} |
1059
|
9
|
100
|
100
|
|
|
29
|
or $#{$$T->[INDEX]{$self->[KEYc]}} < $offset; |
|
8
|
|
|
|
|
30
|
|
1060
|
|
|
|
|
|
|
|
1061
|
9
|
|
|
|
|
40
|
$$T->[VALUES][$$T->[INDEX]{$self->[KEYc]}[$offset]] = "$value"; |
1062
|
|
|
|
|
|
|
} |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
sub FETCHSIZE { |
1065
|
792
|
|
|
792
|
|
1627
|
my $self = shift; |
1066
|
|
|
|
|
|
|
|
1067
|
792
|
100
|
|
|
|
1021
|
$self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX]; |
|
792
|
|
|
|
|
1942
|
|
1068
|
|
|
|
|
|
|
|
1069
|
782
|
|
|
|
|
995
|
return scalar @{${$self->[TABLE]}->[INDEX]{$self->[KEYc]}} |
|
782
|
|
|
|
|
3321
|
|
1070
|
792
|
100
|
|
|
|
1115
|
if defined ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}; |
|
792
|
|
|
|
|
1864
|
|
1071
|
10
|
|
|
|
|
77
|
return 0; # otherwise: key does not exist |
1072
|
|
|
|
|
|
|
} |
1073
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
sub STORESIZE { |
1075
|
8
|
|
|
8
|
|
14
|
my $self = shift; |
1076
|
8
|
|
|
|
|
13
|
my $count = shift; |
1077
|
|
|
|
|
|
|
|
1078
|
8
|
|
|
|
|
10
|
my $T = $self->[TABLE]; |
1079
|
|
|
|
|
|
|
|
1080
|
8
|
100
|
100
|
|
|
28
|
croak "attempt to modify read-only object" |
1081
|
|
|
|
|
|
|
if $$T->[IS_RO] && $count != $self->FETCHSIZE(); |
1082
|
|
|
|
|
|
|
|
1083
|
7
|
100
|
|
|
|
22
|
$T->_rebuild_INDEX unless defined $$T->[INDEX]; |
1084
|
|
|
|
|
|
|
|
1085
|
7
|
|
|
|
|
13
|
my @new = (); |
1086
|
6
|
|
|
|
|
21
|
@new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}] |
|
6
|
|
|
|
|
14
|
|
1087
|
7
|
100
|
|
|
|
17
|
if defined $$T->[INDEX]{$self->[KEYc]}; |
1088
|
7
|
100
|
|
|
|
16
|
if ($count > $self->FETCHSIZE()) |
|
|
100
|
|
|
|
|
|
1089
|
4
|
|
|
|
|
11
|
{ push @new, (undef) x ($count - $self->FETCHSIZE()) } |
1090
|
|
|
|
|
|
|
elsif ($count < $self->FETCHSIZE()) |
1091
|
2
|
|
|
|
|
10
|
{ @new = @new[0..($count-1)] } |
1092
|
1
|
|
|
|
|
4
|
else { return } # no actual change |
1093
|
6
|
|
|
|
|
19
|
$T->field($self->[KEY] => \@new); |
1094
|
|
|
|
|
|
|
} |
1095
|
|
|
|
|
|
|
|
1096
|
|
|
|
0
|
|
|
sub EXTEND { |
1097
|
|
|
|
|
|
|
# do nothing |
1098
|
|
|
|
|
|
|
} |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
sub EXISTS { |
1101
|
5
|
|
|
5
|
|
535
|
my $self = shift; |
1102
|
5
|
|
|
|
|
8
|
my $offset = shift; |
1103
|
|
|
|
|
|
|
|
1104
|
5
|
100
|
|
|
|
7
|
$self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX]; |
|
5
|
|
|
|
|
24
|
|
1105
|
|
|
|
|
|
|
|
1106
|
5
|
|
|
|
|
11
|
return exists ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset]; |
|
5
|
|
|
|
|
33
|
|
1107
|
|
|
|
|
|
|
} |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
sub DELETE { |
1110
|
4
|
|
|
4
|
|
528
|
my $self = shift; |
1111
|
4
|
|
|
|
|
8
|
my $offset = shift; |
1112
|
|
|
|
|
|
|
|
1113
|
4
|
100
|
|
|
|
6
|
croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO]; |
|
4
|
|
|
|
|
121
|
|
1114
|
|
|
|
|
|
|
|
1115
|
3
|
100
|
|
|
|
15
|
$self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX]; |
|
3
|
|
|
|
|
15
|
|
1116
|
|
|
|
|
|
|
|
1117
|
3
|
|
|
|
|
6
|
my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset]; |
|
3
|
|
|
|
|
10
|
|
1118
|
3
|
|
|
|
|
5
|
my $old_value = ${$self->[TABLE]}->[VALUES][$row]; |
|
3
|
|
|
|
|
10
|
|
1119
|
3
|
|
|
|
|
6
|
${$self->[TABLE]}->[VALUES][$row] = undef; |
|
3
|
|
|
|
|
7
|
|
1120
|
3
|
|
|
|
|
12
|
return $old_value; |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
sub CLEAR { |
1124
|
2
|
|
|
2
|
|
4
|
my $self = shift; |
1125
|
|
|
|
|
|
|
|
1126
|
2
|
100
|
|
|
|
5
|
croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO]; |
|
2
|
|
|
|
|
115
|
|
1127
|
|
|
|
|
|
|
|
1128
|
1
|
|
|
|
|
4
|
$self->[TABLE]->field($self->[KEY] => []); |
1129
|
|
|
|
|
|
|
} |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
sub PUSH { |
1132
|
11
|
|
|
11
|
|
24
|
my $self = shift; |
1133
|
|
|
|
|
|
|
|
1134
|
11
|
|
|
|
|
16
|
my $T = $self->[TABLE]; |
1135
|
|
|
|
|
|
|
|
1136
|
11
|
100
|
100
|
|
|
143
|
croak "attempt to modify read-only object" |
1137
|
|
|
|
|
|
|
if $$T->[IS_RO] && scalar @_; |
1138
|
|
|
|
|
|
|
|
1139
|
10
|
100
|
|
|
|
26
|
$T->_rebuild_INDEX unless defined $$T->[INDEX]; |
1140
|
|
|
|
|
|
|
|
1141
|
10
|
100
|
|
|
|
25
|
if (defined $$T->[INDEX]{$self->[KEYc]}) { |
1142
|
|
|
|
|
|
|
# key exists ==> extend table efficiently |
1143
|
5
|
|
|
|
|
31
|
my $last_row = $$T->[INDEX]{$self->[KEYc]}[-1]; |
1144
|
5
|
|
|
|
|
10
|
my $new_rows = scalar @_; |
1145
|
5
|
|
|
|
|
8
|
splice @{$$T->[NAMES]}, 1+$last_row, 0, |
|
5
|
|
|
|
|
21
|
|
1146
|
|
|
|
|
|
|
(($$T->[NAMES][$last_row]) x $new_rows); |
1147
|
5
|
|
|
|
|
7
|
splice @{$$T->[VALUES]}, 1+$last_row, 0, map {"$_"} @_; |
|
5
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
14
|
|
1148
|
5
|
|
|
|
|
19
|
$T->_update_INDEX($last_row, $new_rows); |
1149
|
5
|
|
|
|
|
6
|
push @{$$T->[INDEX]{$self->[KEYc]}}, 1+$last_row .. $last_row+$new_rows; |
|
5
|
|
|
|
|
19
|
|
1150
|
|
|
|
|
|
|
} else { |
1151
|
|
|
|
|
|
|
# make key ==> use existing setter |
1152
|
5
|
|
|
|
|
27
|
$T->_set_multiple_value($self->[KEY], [map {"$_"} @_]); |
|
9909
|
|
|
|
|
18988
|
|
1153
|
|
|
|
|
|
|
} |
1154
|
10
|
|
|
|
|
1320
|
$$T->[MVOFF] = undef; |
1155
|
|
|
|
|
|
|
} |
1156
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
sub POP { |
1158
|
7
|
|
|
7
|
|
566
|
my $self = shift; |
1159
|
|
|
|
|
|
|
|
1160
|
7
|
|
|
|
|
14
|
my $T = $self->[TABLE]; |
1161
|
|
|
|
|
|
|
|
1162
|
7
|
100
|
|
|
|
126
|
croak "attempt to modify read-only object" if $$T->[IS_RO]; |
1163
|
|
|
|
|
|
|
|
1164
|
6
|
100
|
|
|
|
16
|
$T->_rebuild_INDEX unless defined $$T->[INDEX]; |
1165
|
|
|
|
|
|
|
|
1166
|
6
|
100
|
|
|
|
21
|
return undef unless defined $$T->[INDEX]{$self->[KEYc]}; |
1167
|
|
|
|
|
|
|
|
1168
|
5
|
|
|
|
|
10
|
my $rem_row = $$T->[INDEX]{$self->[KEYc]}[-1]; |
1169
|
5
|
|
|
|
|
9
|
my $value = $$T->[VALUES][$rem_row]; |
1170
|
|
|
|
|
|
|
|
1171
|
5
|
|
|
|
|
8
|
splice @{$$T->[NAMES]}, $rem_row, 1; |
|
5
|
|
|
|
|
12
|
|
1172
|
5
|
|
|
|
|
7
|
splice @{$$T->[VALUES]}, $rem_row, 1; |
|
5
|
|
|
|
|
11
|
|
1173
|
5
|
100
|
|
|
|
15
|
splice @{$$T->[MVOFF]}, $rem_row, 1 if defined $$T->[MVOFF]; |
|
1
|
|
|
|
|
3
|
|
1174
|
5
|
|
|
|
|
15
|
$T->_update_INDEX($rem_row, -1); |
1175
|
5
|
|
|
|
|
8
|
pop @{$$T->[INDEX]{$self->[KEYc]}}; |
|
5
|
|
|
|
|
10
|
|
1176
|
|
|
|
|
|
|
# special case: popped last value |
1177
|
|
|
|
|
|
|
delete $$T->[INDEX]{$self->[KEYc]} |
1178
|
5
|
100
|
|
|
|
10
|
if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0; |
|
5
|
|
|
|
|
14
|
|
1179
|
|
|
|
|
|
|
|
1180
|
5
|
|
|
|
|
36
|
return $value; |
1181
|
|
|
|
|
|
|
} |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
sub SHIFT { |
1184
|
6
|
|
|
6
|
|
526
|
my $self = shift; |
1185
|
|
|
|
|
|
|
|
1186
|
6
|
|
|
|
|
12
|
my $T = $self->[TABLE]; |
1187
|
|
|
|
|
|
|
|
1188
|
6
|
100
|
|
|
|
123
|
croak "attempt to modify read-only object" if $$T->[IS_RO]; |
1189
|
|
|
|
|
|
|
|
1190
|
5
|
100
|
|
|
|
15
|
$T->_rebuild_INDEX unless defined $$T->[INDEX]; |
1191
|
|
|
|
|
|
|
|
1192
|
5
|
100
|
|
|
|
18
|
return undef unless defined $$T->[INDEX]{$self->[KEYc]}; |
1193
|
|
|
|
|
|
|
|
1194
|
4
|
|
|
|
|
10
|
my $rem_row = $$T->[INDEX]{$self->[KEYc]}[0]; |
1195
|
4
|
|
|
|
|
6
|
my $value = $$T->[VALUES][$rem_row]; |
1196
|
|
|
|
|
|
|
|
1197
|
4
|
|
|
|
|
6
|
splice @{$$T->[NAMES]}, $rem_row, 1; |
|
4
|
|
|
|
|
8
|
|
1198
|
4
|
|
|
|
|
8
|
splice @{$$T->[VALUES]}, $rem_row, 1; |
|
4
|
|
|
|
|
6
|
|
1199
|
4
|
|
|
|
|
9
|
$$T->[MVOFF] = undef; |
1200
|
4
|
|
|
|
|
11
|
$T->_update_INDEX($rem_row, -1); |
1201
|
4
|
|
|
|
|
8
|
shift @{$$T->[INDEX]{$self->[KEYc]}}; |
|
4
|
|
|
|
|
8
|
|
1202
|
|
|
|
|
|
|
# special case: shifted last value |
1203
|
|
|
|
|
|
|
delete $$T->[INDEX]{$self->[KEYc]} |
1204
|
4
|
100
|
|
|
|
6
|
if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0; |
|
4
|
|
|
|
|
13
|
|
1205
|
|
|
|
|
|
|
|
1206
|
4
|
|
|
|
|
20
|
return $value; |
1207
|
|
|
|
|
|
|
} |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
sub UNSHIFT { |
1210
|
5
|
|
|
5
|
|
10
|
my $self = shift; |
1211
|
|
|
|
|
|
|
|
1212
|
5
|
|
|
|
|
11
|
my $T = $self->[TABLE]; |
1213
|
|
|
|
|
|
|
|
1214
|
5
|
100
|
100
|
|
|
122
|
croak "attempt to modify read-only object" |
1215
|
|
|
|
|
|
|
if $$T->[IS_RO] && scalar @_; |
1216
|
|
|
|
|
|
|
|
1217
|
4
|
100
|
|
|
|
14
|
$T->_rebuild_INDEX unless defined $$T->[INDEX]; |
1218
|
|
|
|
|
|
|
|
1219
|
4
|
100
|
|
|
|
15
|
if (defined $$T->[INDEX]{$self->[KEYc]}) { |
1220
|
|
|
|
|
|
|
# key exists ==> extend table efficiently |
1221
|
3
|
|
|
|
|
9
|
my $first_row = $$T->[INDEX]{$self->[KEYc]}[0]; |
1222
|
3
|
|
|
|
|
6
|
my $new_rows = scalar @_; |
1223
|
3
|
|
|
|
|
4
|
splice @{$$T->[NAMES]}, $first_row, 0, |
|
3
|
|
|
|
|
13
|
|
1224
|
|
|
|
|
|
|
(($$T->[NAMES][$first_row]) x $new_rows); |
1225
|
3
|
|
|
|
|
6
|
splice @{$$T->[VALUES]}, $first_row, 0, map {"$_"} @_; |
|
3
|
|
|
|
|
10
|
|
|
2
|
|
|
|
|
9
|
|
1226
|
3
|
|
|
|
|
10
|
$T->_update_INDEX($first_row - 1, $new_rows); |
1227
|
3
|
|
|
|
|
6
|
unshift @{$$T->[INDEX]{$self->[KEYc]}}, |
|
3
|
|
|
|
|
11
|
|
1228
|
|
|
|
|
|
|
$first_row .. $first_row-1+$new_rows; |
1229
|
|
|
|
|
|
|
} else { |
1230
|
|
|
|
|
|
|
# make key ==> use existing setter |
1231
|
1
|
|
|
|
|
5
|
$T->_set_multiple_value($self->[KEY], [map {"$_"} @_]); |
|
4
|
|
|
|
|
11
|
|
1232
|
|
|
|
|
|
|
} |
1233
|
4
|
|
|
|
|
15
|
$$T->[MVOFF] = undef; |
1234
|
|
|
|
|
|
|
} |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
sub SPLICE { |
1237
|
9
|
|
|
9
|
|
18
|
my $self = shift; |
1238
|
9
|
|
|
|
|
12
|
my $offset = shift; |
1239
|
9
|
|
|
|
|
14
|
my $length = shift; |
1240
|
|
|
|
|
|
|
|
1241
|
9
|
100
|
|
|
|
22
|
$offset = 0 unless defined $offset; |
1242
|
9
|
100
|
|
|
|
19
|
$length = $self->FETCHSIZE() - $offset unless defined $length; |
1243
|
|
|
|
|
|
|
|
1244
|
9
|
100
|
100
|
|
|
36
|
return () unless ($length != 0 || scalar @_); |
1245
|
|
|
|
|
|
|
|
1246
|
8
|
|
|
|
|
14
|
my $T = $self->[TABLE]; |
1247
|
|
|
|
|
|
|
|
1248
|
8
|
100
|
|
|
|
225
|
croak "attempt to modify read-only object" if $$T->[IS_RO]; |
1249
|
|
|
|
|
|
|
|
1250
|
6
|
100
|
|
|
|
17
|
$T->_rebuild_INDEX unless defined $$T->[INDEX]; |
1251
|
|
|
|
|
|
|
|
1252
|
6
|
|
|
|
|
11
|
my @new = (); |
1253
|
5
|
|
|
|
|
16
|
@new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}] |
|
5
|
|
|
|
|
11
|
|
1254
|
6
|
100
|
|
|
|
17
|
if defined $$T->[INDEX]{$self->[KEYc]}; |
1255
|
6
|
|
|
|
|
16
|
my @old = splice @new, $offset, $length, map {"$_"} @_; |
|
3
|
|
|
|
|
11
|
|
1256
|
6
|
|
|
|
|
21
|
$self->[TABLE]->field($self->[KEY] => \@new); |
1257
|
|
|
|
|
|
|
|
1258
|
6
|
|
|
|
|
41
|
return @old; |
1259
|
|
|
|
|
|
|
} |
1260
|
|
|
|
|
|
|
|
1261
|
458
|
|
|
458
|
|
571
|
sub UNTIE { our $_total_untied; $_total_untied++ } |
|
458
|
|
|
|
|
851
|
|
1262
|
|
|
|
|
|
|
|
1263
|
458
|
|
|
458
|
|
559
|
sub DESTROY { our $_total_destroyed; $_total_destroyed++ } |
|
458
|
|
|
|
|
1461
|
|
1264
|
|
|
|
|
|
|
} |
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
{ |
1267
|
|
|
|
|
|
|
package WARC::Fields::TiedHash::Value; |
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
# This package is a magical array that appears to be a string if it has |
1270
|
|
|
|
|
|
|
# only one value, otherwise string conversion gives the usual |
1271
|
|
|
|
|
|
|
# nearly-useless debugging value. |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
# The actual underlying array is a tied array that forwards mutating |
1274
|
|
|
|
|
|
|
# operations to the original WARC::Fields object. |
1275
|
|
|
|
|
|
|
|
1276
|
26
|
|
|
26
|
|
235
|
use overload '""' => '_as_string', fallback => 1; |
|
26
|
|
|
|
|
122
|
|
|
26
|
|
|
|
|
177
|
|
1277
|
|
|
|
|
|
|
|
1278
|
26
|
|
|
26
|
|
2221
|
use Scalar::Util qw/refaddr reftype/; |
|
26
|
|
|
|
|
117
|
|
|
26
|
|
|
|
|
5308
|
|
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
sub _new { |
1281
|
458
|
|
|
458
|
|
651
|
my $class = shift; |
1282
|
458
|
|
|
|
|
661
|
my $parent = shift; |
1283
|
458
|
|
|
|
|
624
|
my $key = shift; |
1284
|
|
|
|
|
|
|
|
1285
|
458
|
|
|
|
|
631
|
my @values; |
1286
|
458
|
|
|
|
|
1342
|
tie @values, (ref($parent).'::ValueArray'), $$parent, $key; |
1287
|
|
|
|
|
|
|
|
1288
|
458
|
|
|
|
|
2492
|
bless \@values, $class; |
1289
|
|
|
|
|
|
|
} |
1290
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
sub _as_string { |
1292
|
680
|
|
|
680
|
|
12249
|
my $self = shift; |
1293
|
|
|
|
|
|
|
|
1294
|
680
|
100
|
|
|
|
1905
|
return scalar @$self == 1 |
1295
|
|
|
|
|
|
|
? $self->[0] : sprintf ('%s(0x%x)', reftype $self, refaddr $self); |
1296
|
|
|
|
|
|
|
} |
1297
|
|
|
|
|
|
|
|
1298
|
458
|
|
|
458
|
|
2844
|
sub DESTROY { untie @{(shift)} } |
|
458
|
|
|
|
|
1036
|
|
1299
|
|
|
|
|
|
|
} |
1300
|
|
|
|
|
|
|
|
1301
|
|
|
|
|
|
|
{ |
1302
|
|
|
|
|
|
|
package WARC::Fields::TiedHash; |
1303
|
|
|
|
|
|
|
|
1304
|
26
|
|
|
26
|
|
213
|
use Carp; |
|
26
|
|
|
|
|
192
|
|
|
26
|
|
|
|
|
2278
|
|
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
BEGIN { $WARC::Fields::TiedHash::{$_} = $WARC::Fields::{$_} |
1307
|
26
|
|
|
26
|
|
22599
|
for WARC::Fields::OBJECT_INDEX; } |
1308
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
# The underlying object is a reference to a WARC::Fields object. |
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
sub FETCH { |
1312
|
461
|
|
|
461
|
|
1971
|
my $self = shift; |
1313
|
461
|
|
|
|
|
656
|
my $key = shift; |
1314
|
461
|
100
|
|
|
|
2302
|
croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o; |
1315
|
458
|
|
|
|
|
1451
|
return (ref($self).'::Value')->_new($self, $key); |
1316
|
|
|
|
|
|
|
} |
1317
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
sub STORE { |
1319
|
11
|
|
|
11
|
|
385
|
my $self = shift; |
1320
|
11
|
|
|
|
|
18
|
my $key = shift; |
1321
|
11
|
|
|
|
|
17
|
my $value = shift; |
1322
|
|
|
|
|
|
|
|
1323
|
11
|
100
|
|
|
|
246
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
1324
|
|
|
|
|
|
|
|
1325
|
9
|
|
|
|
|
25
|
$$self->field($key => $value); |
1326
|
|
|
|
|
|
|
} |
1327
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
sub DELETE { |
1329
|
3
|
|
|
3
|
|
6
|
my $self = shift; |
1330
|
3
|
|
|
|
|
5
|
my $key = shift; |
1331
|
|
|
|
|
|
|
|
1332
|
3
|
100
|
|
|
|
120
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
1333
|
|
|
|
|
|
|
|
1334
|
2
|
|
|
|
|
7
|
$$self->field($key => []); |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
sub CLEAR { |
1338
|
2
|
|
|
2
|
|
5
|
my $self = shift; |
1339
|
|
|
|
|
|
|
|
1340
|
2
|
100
|
|
|
|
113
|
croak "attempt to modify read-only object" if $$$self->[IS_RO]; |
1341
|
|
|
|
|
|
|
|
1342
|
1
|
|
|
|
|
4
|
$$$self->[NAMES] = []; |
1343
|
1
|
|
|
|
|
14
|
$$$self->[VALUES] = []; |
1344
|
1
|
|
|
|
|
3
|
$$$self->[MVOFF] = undef; |
1345
|
1
|
|
|
|
|
4
|
$$$self->[INDEX] = undef; |
1346
|
1
|
|
|
|
|
4
|
return undef; |
1347
|
|
|
|
|
|
|
} |
1348
|
|
|
|
|
|
|
|
1349
|
|
|
|
|
|
|
sub EXISTS { |
1350
|
15
|
|
|
15
|
|
36
|
my $self = shift; |
1351
|
15
|
|
|
|
|
21
|
my $key = shift; |
1352
|
|
|
|
|
|
|
|
1353
|
15
|
100
|
|
|
|
49
|
$$self->_rebuild_INDEX unless defined $$$self->[INDEX]; |
1354
|
15
|
|
|
|
|
42
|
return exists $$$self->[INDEX]->{$$self->_find_key($key)}; |
1355
|
|
|
|
|
|
|
} |
1356
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
sub FIRSTKEY { |
1358
|
61
|
|
|
61
|
|
98
|
my $self = shift; |
1359
|
|
|
|
|
|
|
|
1360
|
61
|
|
|
|
|
219
|
return $$$self->[NAMES][0]; |
1361
|
|
|
|
|
|
|
} |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
sub NEXTKEY { |
1364
|
307
|
|
|
307
|
|
452
|
my $self = shift; |
1365
|
307
|
|
|
|
|
389
|
my $from_key = shift; |
1366
|
|
|
|
|
|
|
|
1367
|
307
|
100
|
|
|
|
633
|
$$self->_rebuild_INDEX unless defined $$$self->[INDEX]; |
1368
|
|
|
|
|
|
|
|
1369
|
307
|
|
|
|
|
369
|
my $i; |
1370
|
307
|
|
100
|
|
|
542
|
for ($i = $$$self->[INDEX]{$$self->_find_key($from_key)}[0] + 1; |
1371
|
|
|
|
|
|
|
defined $$$self->[NAMES][$i] and |
1372
|
|
|
|
|
|
|
$i != $$$self->[INDEX]{$$self->_find_key($$$self->[NAMES][$i])}[0]; |
1373
|
|
|
|
|
|
|
$i++) {} |
1374
|
307
|
|
|
|
|
1093
|
return $$$self->[NAMES][$i]; |
1375
|
|
|
|
|
|
|
} |
1376
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
sub SCALAR { |
1378
|
3
|
|
|
3
|
|
10
|
my $self = shift; |
1379
|
3
|
|
|
|
|
5
|
return scalar @{$$$self->[NAMES]}; |
|
3
|
|
|
|
|
17
|
|
1380
|
|
|
|
|
|
|
} |
1381
|
|
|
|
|
|
|
|
1382
|
68
|
|
|
68
|
|
87
|
sub UNTIE { our $_total_untied; $_total_untied++ } |
|
68
|
|
|
|
|
160
|
|
1383
|
|
|
|
|
|
|
|
1384
|
68
|
|
|
68
|
|
91
|
sub DESTROY { our $_total_destroyed; $_total_destroyed++ } |
|
68
|
|
|
|
|
132
|
|
1385
|
|
|
|
|
|
|
} |
1386
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
=head2 Overloaded Dereference Operators |
1388
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
The C class provides overloaded dereference operators for |
1390
|
|
|
|
|
|
|
array and hash dereferencing. The overloaded operators provide an |
1391
|
|
|
|
|
|
|
anonymous tied array or hash as needed, allowing the object itself to be |
1392
|
|
|
|
|
|
|
used as a reference to its tied array and hash interfaces. There is a |
1393
|
|
|
|
|
|
|
caveat, however, so read on. |
1394
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
=cut |
1396
|
|
|
|
|
|
|
|
1397
|
|
|
|
|
|
|
sub _as_tied_array { |
1398
|
|
|
|
|
|
|
# To avoid confusing bugs due to typos producing overloaded dereferences |
1399
|
|
|
|
|
|
|
# instead of intended accesses to the internal object, this feature |
1400
|
|
|
|
|
|
|
# cannot be used from within this module. |
1401
|
168
|
100
|
|
168
|
|
7436
|
if (scalar caller =~ m/^WARC::Fields/) { |
1402
|
1
|
|
|
|
|
4
|
local $Carp::CarpLevel = 1; |
1403
|
1
|
|
|
|
|
138
|
confess "overloaded array dereference in internal code" |
1404
|
|
|
|
|
|
|
} |
1405
|
|
|
|
|
|
|
|
1406
|
167
|
|
|
|
|
251
|
my $self = shift; |
1407
|
|
|
|
|
|
|
|
1408
|
167
|
100
|
|
|
|
696
|
return $$self->[C_TA] if defined $$self->[C_TA]; |
1409
|
|
|
|
|
|
|
|
1410
|
3
|
|
|
|
|
6
|
my @array; $$self->[C_TA] = \@array; |
|
3
|
|
|
|
|
7
|
|
1411
|
3
|
|
|
|
|
6
|
Scalar::Util::weaken ${tie @array, ref $self, $self}; |
|
3
|
|
|
|
|
14
|
|
1412
|
3
|
|
|
|
|
14
|
return $$self->[C_TA]; |
1413
|
|
|
|
|
|
|
} |
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
sub _as_tied_hash { |
1416
|
|
|
|
|
|
|
# To avoid confusing bugs due to typos producing overloaded dereferences |
1417
|
|
|
|
|
|
|
# instead of intended accesses to the internal object, this feature |
1418
|
|
|
|
|
|
|
# cannot be used from within this module. |
1419
|
266
|
100
|
|
266
|
|
8251
|
if (scalar caller =~ m/^WARC::Fields/) { |
1420
|
1
|
|
|
|
|
4
|
local $Carp::CarpLevel = 1; |
1421
|
1
|
|
|
|
|
71
|
confess "overloaded hash dereference in internal code" |
1422
|
|
|
|
|
|
|
} |
1423
|
|
|
|
|
|
|
|
1424
|
265
|
|
|
|
|
401
|
my $self = shift; |
1425
|
|
|
|
|
|
|
|
1426
|
265
|
100
|
|
|
|
1203
|
return $$self->[C_TH] if defined $$self->[C_TH]; |
1427
|
|
|
|
|
|
|
|
1428
|
61
|
|
|
|
|
91
|
my %hash; $$self->[C_TH] = \%hash; |
|
61
|
|
|
|
|
110
|
|
1429
|
61
|
|
|
|
|
92
|
Scalar::Util::weaken ${tie %hash, ref $self, $self}; |
|
61
|
|
|
|
|
256
|
|
1430
|
61
|
|
|
|
|
306
|
return $$self->[C_TH]; |
1431
|
|
|
|
|
|
|
} |
1432
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
=head3 Reference Count Trickery with Overloaded Dereference Operators |
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
To avoid problems, the underlying tied object is a reference to the parent |
1436
|
|
|
|
|
|
|
object. For ordinary use of C, this is a strong reference, however, |
1437
|
|
|
|
|
|
|
the anonymous tied array and hash are cached in the object to avoid having |
1438
|
|
|
|
|
|
|
to C a new object every time the dereference operators are used. |
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
To prevent memory leaks due to circular references, the overloaded |
1441
|
|
|
|
|
|
|
dereference operators tie a I reference to the parent object. The |
1442
|
|
|
|
|
|
|
tied aggregate always holds a strong reference to its object, but when the |
1443
|
|
|
|
|
|
|
dereference operators are used, that inner object is a I reference to |
1444
|
|
|
|
|
|
|
the actual C object. |
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
The caveat is thus: do not attempt to save a reference to the array or hash |
1447
|
|
|
|
|
|
|
produced by dereferencing a C object. The parent |
1448
|
|
|
|
|
|
|
C object must remain in scope for as long as any anonymous |
1449
|
|
|
|
|
|
|
tied aggregates exist. |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
=cut |
1452
|
|
|
|
|
|
|
|
1453
|
|
|
|
|
|
|
1; |
1454
|
|
|
|
|
|
|
__END__ |