File Coverage

blib/lib/WARC/Fields.pm
Criterion Covered Total %
statement 720 720 100.0
branch 250 252 100.0
condition 32 32 100.0
subroutine 99 101 98.0
pod 7 7 100.0
total 1108 1112 99.8


line stmt bran cond sub pod time code
1             package WARC::Fields; # -*- CPerl -*-
2              
3 26     26   70061 use strict;
  26         70  
  26         792  
4 26     26   138 use warnings;
  26         55  
  26         681  
5              
6 26     26   132 use Carp;
  26         52  
  26         1379  
7 26     26   14845 use Encode;
  26         250914  
  26         1822  
8 26     26   191 use Scalar::Util;
  26         59  
  26         1342  
9              
10             our @ISA = qw();
11              
12 26     26   964 use WARC; *WARC::Fields::VERSION = \$WARC::VERSION;
  26         58  
  26         1360  
13              
14             =head1 NAME
15              
16             WARC::Fields - WARC record headers and application/warc-fields
17              
18             =head1 SYNOPSIS
19              
20             require WARC::Fields;
21              
22             $f = new WARC::Fields;
23             $f = $record->fields; # get WARC record headers
24             $g = $f->clone; # make writable copy
25              
26             $g->set_readonly; # make read-only
27              
28             $f->field('WARC-Type' => 'metadata'); # set
29             $value = $f->field('WARC-Type'); # get
30              
31             $fields_text = $f->as_string; # get WARC header lines for display
32             $fields_block = $f->as_block; # format for WARC file
33              
34             tie @field_names, ref $f, $f; # bind ordered list of field names
35              
36             tie %fields, ref $f, $f; # bind hash of field names => values
37              
38             $entry = $f->[$num]; # tie an anonymous array and access it
39             $value = $f->{$name}; # likewise with an anonymous tied hash
40              
41             $name = "$entry"; # tied array returns objects
42             $value = $entry->value; # one specific value
43             $offset = $entry->offset; # N of M with same name
44              
45             foreach (keys %{$f}) { ... } # iterate over names, in order
46              
47             =cut
48              
49 26     26   2575 use overload '@{}' => \&_as_tied_array, '%{}' => \&_as_tied_hash;
  26         2035  
  26         266  
50 26     26   2035 use overload fallback => 1;
  26         50  
  26         93  
51              
52             # This implementation uses column-oriented storage, with an array as the
53             # underlying object and constants to select array offsets.
54             #
55             # The NAMES and VALUES columns are always valid, but the MVOFF and INDEX
56             # positions may be undefined and are lazily rebuilt when needed.
57              
58 26         2982 use constant { NAMES => 0, VALUES => 1, MVOFF => 2,
59 26     26   1694 INDEX => 3, IS_RO => 4, C_TA => 5, C_TH => 6 };
  26         53  
60 26     26   174 use constant OBJECT_INDEX => qw/NAMES VALUES MVOFF INDEX IS_RO/;
  26         52  
  26         1668  
61 26     26   149 use constant OBJECT_INIT => undef, undef, undef, undef, 0, undef, undef;
  26         54  
  26         82552  
62              
63 706     706   8032 sub DESTROY { my $ob = shift;
64 706 100       1803 untie @{$$ob->[C_TA]} if defined $$ob->[C_TA];
  3         22  
65 706 100       1283 untie %{$$ob->[C_TH]} if defined $$ob->[C_TH];
  61         188  
66 706         891 our $_total_destroyed; $_total_destroyed++ }
  706         6997  
67              
68             # NAMES: array of field names, exactly as written
69             # VALUES: array of field values
70             # MVOFF: array of offsets for multiple-valued fields
71             # INDEX: hash of case-folded field names to array of row numbers
72             # IS_RO: boolean: TRUE if this object is read-only
73              
74             # C_TA: cache: tied array for array dereference
75             # C_TH: cache: tide hash for hash dereference
76              
77             sub _rebuild_INDEX {
78 30     30   45 my $self = shift;
79 30         49 my %idx = ();
80              
81 30         50 for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) {
  113         243  
82 83         99 push @{$idx{lc $$self->[NAMES][$i]}}, $i;
  83         1199  
83             }
84              
85 30         66 $$self->[INDEX] = \%idx;
86             }
87              
88             sub _update_INDEX {
89 72     72   99 my $self = shift; # INDEX slot must be valid
90 72         99 my $base = shift; # row number where an insertion or removal was made
91 72         94 my $count = shift; # how many rows were inserted (+) or removed (-)
92              
93 72         109 my %done = ();
94              
95 72 100       169 for (my $i = ($base < 0) ? 0 : $base; $i < @{$$self->[NAMES]}; $i++) {
  186         440  
96 114         194 my $key = lc $$self->[NAMES][$i];
97 114 100       221 next if $done{$key};
98 72         159 @{$$self->[INDEX]{$key}} =
99 72 100       87 map { $_ + ($_ > $base ? $count : 0) } @{$$self->[INDEX]{$key}};
  118         270  
  72         149  
100 72         164 $done{$key}++;
101             }
102             }
103              
104             sub _rebuild_MVOFF {
105 6     6   11 my $self = shift;
106              
107 6         12 my @mvoff = ();#(undef) x scalar @{$$self->[NAMES]};
108 6         11 my %cidx = (); # counted index; references to unique entries
109              
110 6         11 foreach my $name (@{$$self->[NAMES]}) {
  6         18  
111 10028         12242 my $key = lc $name;
112 10028 100       18019 if (not defined $cidx{$key}) {
    100          
113             # first time this key is seen
114 26         54 push @mvoff, undef;
115 26         58 $cidx{$key} = \$mvoff[$#mvoff];
116             } elsif (ref $cidx{$key} eq 'SCALAR') {
117             # second time this key is seen
118 4         9 push @mvoff, 1;
119 4         7 ${$cidx{$key}} = 0; # replace undefined value
  4         10  
120 4         9 $cidx{$key} = 2; # prepare counter
121             } else {
122             # third or later time this key is seen
123 9998         13964 push @mvoff, $cidx{$key}++;
124             }
125             }
126              
127 6         31 $$self->[MVOFF] = \@mvoff;
128             }
129              
130             sub _dbg_dump {
131 51     51   5351 my $self = shift;
132              
133 51         82 my @mvoff = (' ') x @{$$self->[NAMES]};
  51         2499  
134 20016 100       29784 @mvoff = map { defined $_ ? $_ : 'U' }
135 51 100       174 @{$$self->[MVOFF]}[0 .. $#{$$self->[NAMES]}]
  5         206  
  5         280  
136             if defined $$self->[MVOFF];
137 51         511 my @widths = map {length} qw/ROW MVO NAME/;
  153         279  
138              
139 51         77 foreach my $row (0 .. $#{$$self->[NAMES]}) {
  51         174  
140 20233 100       30761 $widths[0] = length $row if length $row > $widths[0];
141 20233 100       33392 $widths[1] = length $mvoff[$row] if length $mvoff[$row] > $widths[1];
142 20233 100       34604 $widths[2] = length $$self->[NAMES][$row]
143             if length $$self->[NAMES][$row] > $widths[2];
144             }
145              
146 51         266 my $out = sprintf ' %4$*1$s %5$*2$s %6$*3$s %7$s', @widths,
147             qw/ ROW MVO NAME VALUE /;
148 51         182 $out .= "\n".('=' x length $out)."\n";
149             $out .= join "\n", map
150 20233 100       83612 { sprintf ' %4$*1$d %5$*2$s %6$*3$s%7$1s %8$s', @widths,
    100          
151             ($_, $mvoff[$_], $$self->[NAMES][$_],
152             (defined $$self->[VALUES][$_] ? ':' : ' '),
153             (defined $$self->[VALUES][$_] ? $$self->[VALUES][$_] : '*deleted*')) }
154 51         95 0 .. $#{$$self->[NAMES]};
  51         345  
155              
156 51         2384 return $out;
157             }
158              
159             # From RFC2616:
160             # CTL =
161             # (octets 0 - 31) and DEL (127)>
162             # LWS = [CRLF] 1*( SP | HT )
163             # separators = "(" | ")" | "<" | ">" | "@"
164             # | "," | ";" | ":" | "\" | <">
165             # | "/" | "[" | "]" | "?" | "="
166             # | "{" | "}" | SP | HT
167             my $PARSE_RE__LWS = qr/(?:\015\012)?[ \t]+/;
168             my $PARSE_RE__separator = qr[[][)(><}{@,;:/"\\?=[:space:]]];
169             my $PARSE_RE__not_separator = qr[[^][)(><}{@,;:/"\\?=[:space:]]];
170              
171             # From WARC specification:
172             # field-name = token
173             # token = 1*
174             # except CTLs or separators>
175             my $PARSE_RE__token = qr/[!#$%'*+-.0-9A-Z^_`a-z|~]+/;
176              
177             =head1 DESCRIPTION
178              
179             The C class encapsulates information in the
180             "application/warc-fields" format used for WARC record headers. This is a
181             simple key-value format closely analogous to HTTP headers, however
182             differences are significant enough that the C class cannot
183             be reliably reused for WARC fields.
184              
185             Instances of this class are usually created as member variables of the
186             C class, but can also be returned as the content of WARC
187             records with Content-Type "application/warc-fields".
188              
189             Instances of C retrieved from WARC files are read-only and
190             will croak() if any attempt is made to change their contents.
191              
192             This class strives to faithfully represent the contents of a WARC file,
193             while providing a simple interface to answer simple questions.
194              
195             =head2 Multiple Values
196              
197             Most WARC headers may only appear once and with a single value in valid
198             WARC records, with the notable exception of the WARC-Concurrent-To header.
199             C neither attempts to enforce nor relies upon this
200             constraint. Headers that appear multiple times are considered to have
201             multiple values. When iterating a tied hash, all values of a recurring
202             header are collected and returned with the B occurrence of its key.
203              
204             Multiple values are returned from the C method and tied hash
205             interface as array references, and are set by passing in an array
206             reference. Existing rows are reused where possible when updating a field
207             with multiple values. If the new array reference contains fewer items
208             (including the special case of replacing multiple values with a single
209             value) excess rows are deleted. If the new array reference requires
210             additional rows to be inserted, they are inserted immediately after the
211             last existing row for a field, with the same name case as that row.
212              
213             Precise control of the layout is available using the tied array interface,
214             but the ordering of the header rows is not constrained in the WARC
215             specification.
216              
217             =head2 Field Name Mangling
218              
219             As with C, the '_' character is converted to '-' in field
220             names unless the first character of the name is ':', which cannot itself
221             appear in a field name. Unlike C, the leading ':' is
222             stripped off immediately and the name stored otherwise exactly as given.
223             The C method and tied hash interface allow this convenience feature.
224             The field names exposed via the tied array interface are reported
225             B as they appear in the WARC file.
226              
227             Strictly, "X-Crazy-Header" and "X_Crazy_Header" are two B
228             headers that the above convenience mechanism conflates. The solution is
229             simple: if (and only if) a header field B with the B
230             name given, it is used, otherwise C occurs and the name is
231             rechecked for another exact match. If no match is found, case is folded
232             and a third check performed. If a match is found, the existing header is
233             updated, otherwise a new header is created with character case as given.
234              
235             The WARC specification specifically states that field names are
236             case-insensitive, accordingly, "X-Crazy-Header" and "X-CRAZY-HeAdEr" are
237             considered the same header for the C method and tied hash interface.
238             They will appear exactly as given in the tied array interface, however.
239              
240             =cut
241              
242             # This function handles two different canonicalizations:
243             # (1) case folding as required by the WARC specification
244             # (2) convenience translation s/_/-/g,
245             # (2a) suppressed if m/^:/, which is removed
246             # (2b) overridden by an exact match
247             # To make this work:
248             # --- all keys in INDEX are case-folded
249             # --- all keys in NAMES preserve case
250             # --- existing keys are case-folded by this function
251             # --- new keys translate s/_/-/g but preserve case
252             sub _find_key {
253 8444     8444   10542 my $self = shift; # INDEX slot must be valid
254 8444         10231 my $k = shift;
255 8444         9798 my $key; ($key = $k) =~ s/^://;
  8444         13290  
256 8444         10549 my $pad = $key;
257 8444         12086 my $is_quoted = ($k =~ m/^:/);
258              
259             # exact case-folded match?
260 8444 100       24258 return lc $key if defined $$self->[INDEX]{lc $key};
261              
262             # case-folded match after s/_/-/g?
263 3806         7365 $pad =~ s/_/-/g;
264 3806 100 100     10725 return lc $pad if defined $$self->[INDEX]{lc $pad} && !$is_quoted;
265              
266             # not found ==> a new key will be made
267 2817 100       5814 return $is_quoted ? $key : $pad;
268             }
269              
270             # called only if there is no or one current value
271             sub _set_single_value {
272 12     12   20 my $self = shift; # INDEX slot must be valid
273 12         21 my $key = shift; # as returned from _find_key
274 12         18 my $value = shift;
275              
276 12 100       221 croak "attempt to modify read-only object" if $$self->[IS_RO];
277              
278 11 100       32 unless (defined $$self->[INDEX]{lc $key}) {
279             # insert new key
280 6         10 push @{$$self->[NAMES]}, $key; # preserve original key
  6         16  
281 6         13 $key = lc $key; # fold key case
282 6         10 push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]};
  6         16  
  6         15  
283             }
284              
285 11         34 $$self->[VALUES][$$self->[INDEX]{$key}[0]] = $value;
286             }
287              
288             sub _key_multiple_value_p {
289 4584     4584   5768 my $self = shift; # INDEX slot must be valid
290 4584         5675 my $key = shift; # as returned from _find_key
291              
292             # For this to be true, the key must already exist, which means that
293             # _find_key has case-folded it already.
294             return (defined $$self->[INDEX]{$key}
295 4584   100     9876 && 1 < scalar @{$$self->[INDEX]{$key}})
296             }
297              
298             # called in all cases where multiple values are involved
299             sub _set_multiple_value {
300 56     56   87 my $self = shift; # INDEX slot must be valid
301 56         83 my $key = shift; # as returned from _find_key
302 56         104 my $value_aref = shift;
303              
304 56 100       353 croak "attempt to modify read-only object" if $$self->[IS_RO];
305              
306             my $cur_count = (defined $$self->[INDEX]{$key}
307 54   100     140 && scalar @{$$self->[INDEX]{$key}});
308 54         101 my $new_count = scalar @$value_aref;
309              
310 54 100       103 unless ($cur_count) {
311             # insert new key
312 16         27 push @{$$self->[NAMES]}, $key; # preserve original key
  16         44  
313 16         34 push @{$$self->[VALUES]}, undef; # prepare slot
  16         37  
314 16         32 $key = lc $key; # fold key case
315 16         23 push @{$$self->[INDEX]{$key}}, $#{$$self->[NAMES]};
  16         46  
  16         43  
316 16         29 $cur_count = 1; # account for the added slot
317             }
318             # $key is always case-folded at this point
319              
320             # adjust table to accommodate new number of values
321 54 100       131 if ($cur_count > $new_count) {
    100          
322             # remove extra rows
323 27         34 foreach my $extra_row (reverse sort
324 27         124 splice @{$$self->[INDEX]{$key}}, $new_count) {
325 40         63 splice @{$$self->[NAMES]}, $extra_row, 1;
  40         69  
326 40         64 splice @{$$self->[VALUES]}, $extra_row, 1;
  40         64  
327 40         83 _update_INDEX($self, $extra_row, -1);
328             }
329             # special case: removing a field entirely
330 27 100       72 if ($new_count == 0) {
331             # This is here to catch a hypothetical bug before data is corrupted.
332             die "stray INDEX entries left after removing field"
333             # uncoverable branch true
334 19 50       24 unless scalar @{$$self->[INDEX]{$key}} == 0;
  19         61  
335 19         51 delete $$self->[INDEX]{$key};
336             }
337             } elsif ($cur_count < $new_count) {
338             # add more rows
339 15         32 my $last_row = $$self->[INDEX]{$key}[-1];
340 15         26 my $new_rows = $new_count - $cur_count;
341 15         36 _update_INDEX($self, $last_row, $new_rows);
342 15         22 splice @{$$self->[NAMES]}, 1+$last_row, 0,
  15         2414  
343             (($$self->[NAMES][$last_row]) x $new_rows);
344 15         25 splice @{$$self->[VALUES]}, 1+$last_row, 0, ((undef) x $new_rows);
  15         580  
345 15         24 push @{$$self->[INDEX]{$key}}, 1+$last_row .. $last_row+$new_rows;
  15         1275  
346             } # otherwise, $cur_count == $new_count
347 54 100       298 $$self->[MVOFF] = undef unless $cur_count == $new_count;
348             # there are always $new_count rows with $key at this point
349              
350 54         128 for (my $i = 0; $i < $new_count; $i++)
351 9968         22195 { $$self->[VALUES][$$self->[INDEX]{$key}[$i]] = $value_aref->[$i] }
352             }
353              
354             =head2 Methods
355              
356             =over
357              
358             =item $f = WARC::Fields-Enew
359              
360             Construct a new C object. Initial contents can be passed as
361             key-value pairs to this constructor and will be added in the given order.
362              
363             Repeating a key or supplying an array reference as a value assigns multiple
364             values to a key. To reduce the risk of confusion, only quoting with a
365             leading ':' overrides the convenience feature of applying C when
366             constructing a C object. The exact match rules used when
367             setting values on an existing object do not apply here.
368              
369             Field names given when constructing a WARC::Fields object are otherwise
370             stored exactly as given, with case preserved, even when other names that
371             fold to the same string have been given earlier in the argument list.
372              
373             =cut
374              
375             sub new {
376 220     220 1 15411 my $class = shift;
377 220         505 my $ob = [OBJECT_INIT];
378 220         351 my $k; my $v;
379              
380             # explicitly initialize NAMES and VALUES to allow as_string and as_block
381             # methods to be called on empty objects
382 220         331 $ob->[NAMES] = [];
383 220         361 $ob->[VALUES] = [];
384              
385 220         633 while (($k, $v) = splice @_, 0, 2) {
386 792 100       1742 croak "key without value" unless defined $v;
387              
388 791 100       1422 if ($k =~ m/^:/) { $k =~ s/^:// } else { $k =~ s/_/-/g }
  7         19  
  784         1845  
389              
390 791 100       2346 croak "reference to field with no name" unless $k =~ m/./;
391 789 100       2238 croak "reference to invalid field name" if $k !~ m/^$PARSE_RE__token$/o;
392              
393 787 100       1309 if (ref $v eq 'ARRAY') {
394 3         9 foreach my $value (@$v) {
395 8         24 push @{$ob->[NAMES]}, $k;
  8         19  
396 8         14 push @{$ob->[VALUES]}, $value;
  8         13  
397 8         12 push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]};
  8         21  
  8         25  
398             }
399             } else {
400 784         914 push @{$ob->[NAMES]}, $k;
  784         1484  
401 784         1065 push @{$ob->[VALUES]}, $v;
  784         1295  
402 784         998 push @{$ob->[INDEX]{lc $k}}, $#{$ob->[NAMES]};
  784         1737  
  784         2666  
403             }
404             }
405              
406 215         302 {our $_total_newly_constructed; $_total_newly_constructed++}
  215         272  
  215         278  
407 215         1035 bless \ $ob, $class;
408             }
409              
410             =item $f-Eclone
411              
412             Copy a C object. A copy of a read-only object is writable.
413              
414             =cut
415              
416             sub clone {
417 8     8 1 14 my $self = shift;
418 8         22 my $new = [OBJECT_INIT];
419              
420 8         14 $new->[NAMES] = [@{$$self->[NAMES]}];
  8         663  
421 8         18 $new->[VALUES] = [@{$$self->[VALUES]}];
  8         1382  
422 8 100       38 $new->[MVOFF] = [@{$$self->[MVOFF]}] if defined $$self->[MVOFF];
  1         283  
423 30         43 $new->[INDEX] = {map {$_ => [@{$$self->[INDEX]{$_}}]}
  30         88  
424 8 100       40 keys %{$$self->[INDEX]}} if defined $$self->[INDEX];
  7         31  
425              
426 8         18 {our $_total_newly_cloned; $_total_newly_cloned++}
  8         12  
  8         16  
427 8         33 bless \ $new, ref $self;
428             }
429              
430             =item $f-Efield( $name )
431              
432             =item $f-Efield( $name =E $value )
433              
434             =item $f-Efield( $n1 =E $v1, $n2 =E $v2, ... )
435              
436             Get or set the value of one or more fields. The field name is not case
437             sensitive, but C will preserve its case if a new entry is
438             created.
439              
440             Setting a field to C effectively deletes that field, although it
441             remains visible in the tied array interface and will retain its position if
442             a new value is assigned. Setting a field to an empty array reference
443             removes that field entirely.
444              
445             =cut
446              
447             sub field {
448 7410     7410 1 37151 my $self = shift;
449              
450 7410 100       15093 _rebuild_INDEX($self) unless defined $$self->[INDEX];
451              
452 7410         10029 my $k; my $v; my $have_value_arg = scalar @_ > 1;
  7410         10746  
453 7410         16789 while (($k, $v) = splice @_, 0, 2) {
454 7410         11837 my $key = $self->_find_key($k);
455              
456 7410 100       20125 croak "reference to field with no name" unless $key =~ m/./;
457 7408 100       18389 croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o;
458              
459 7405 100       11997 if (not $have_value_arg) {
    100          
    100          
460             # get a value
461 7343 100       17634 return undef unless defined $$self->[INDEX]{$key};
462 4570 100       7597 return $$self->[VALUES][$$self->[INDEX]{$key}[0]]
463             unless $self->_key_multiple_value_p($key);
464 78         303 return [grep {defined $_}
465 25         48 map {$$self->[VALUES][$_]} @{$$self->[INDEX]{$key}}];
  78         163  
  25         59  
466             } # otherwise set a value
467             elsif (UNIVERSAL::isa($v, 'ARRAY'))
468 48         107 { $self->_set_multiple_value($key, $v) }
469             elsif ($self->_key_multiple_value_p($key))
470             # has multiple values, but now only setting a single value
471 2         12 { $self->_set_multiple_value($key, [$v]) }
472             else
473 12         31 { $self->_set_single_value($key, $v) }
474 59         198 $have_value_arg = scalar @_ > 1;
475             }
476 59         157 return (); # return nothing
477             # Note that setting one or more fields and then getting a field is
478             # possible as a side-effect of this organization, but is explicitly NOT
479             # supported. That trick is NOT part of the stable API.
480             }
481              
482             =item $f = WARC::Fields-Eparse( $text )
483              
484             =item $f = WARC::Fields-Eparse( from =E $fh )
485              
486             =item $f = parse WARC::Fields from =E $fh
487              
488             Construct a new C object, reading initial contents from the
489             provided text string or filehandle.
490              
491             The C method throws an exception if it encounters input that it does
492             not understand.
493              
494             If the C method encounters a field name with a leading ':', which
495             implies an empty name and is not allowed, the leading ':' is silently
496             dropped from the line and parsing retried. If the line is not valid after
497             this change, the C method throws an exception. This feature is in
498             keeping with the general principle of "be liberal in what you accept" and
499             is a preemptive workaround for a predicted bug in other implementations.
500              
501             =cut
502              
503             sub parse {
504 608     608 1 1941 my $class = shift;
505 608         755 my $text = shift;
506 608         746 my $rd;
507              
508 608 100       1096 if ($text eq 'from') {
509 603         758 $rd = shift;
510             } else {
511             # This fails iff perl was built without PerlIO, which is non-default.
512             # uncoverable branch true
513 5 50       70 open $rd, '<', \$text or die "failure opening stream on variable: $!";
514             }
515              
516 608         987 my @names = ();
517 608         726 my @values = ();
518 608         812 my %idx = ();
519 608         728 my $at_end = 0;
520              
521 608         1291 local *_;
522 608         1836 while (<$rd>) {
523 4570         50239 s/[\015\012]+$//;
524 4570 100       14330 if (m/^:?($PARSE_RE__token):\s+(.*)$/o) # $1 -- name $2 -- value
    100          
    100          
525 3962         8688 { push @names, $1; push @values, $2; push @{$idx{lc $1}}, $#names }
  3962         6379  
  3962         4238  
  3962         15477  
526             elsif (m/^\s+(\S.*)$/) # $1 -- continued value
527 1         7 { $values[$#values] .= ' '.$1 }
528 605         785 elsif (m/^$/) { $at_end = 1; last }
  605         967  
529 2         380 else { croak "unrecognized input: $_" }
530             }
531              
532 606 100       1359 carp "end-of-input before end marker" unless $at_end;
533              
534 606         1012 @values = map {Encode::decode_utf8($_)} @values;
  3962         48119  
535              
536 606         9139 my $ob = [OBJECT_INIT];
537 606         1019 $ob->[NAMES] = \@names;
538 606         763 $ob->[VALUES] = \@values;
539 606         790 $ob->[INDEX] = \%idx;
540              
541 606         759 {our $_total_newly_parsed; $_total_newly_parsed++}
  606         679  
  606         740  
542 606         2279 bless \ $ob, $class;
543             }
544              
545             =item $f-Eas_block
546              
547             =item $f-Eas_string
548              
549             Return the contents as a formatted WARC header or application/warc-fields
550             block. The C method uses network line endings and UTF-8 as
551             specified for the WARC format, while the C method uses the local
552             line endings and does not perform encoding.
553              
554             =cut
555              
556             sub _as_text {
557 48     48   84 my $self = shift;
558 48         66 my $newline = shift;
559 48         68 my $out = '';
560              
561 48         79 for (my $i = 0; $i < @{$$self->[NAMES]}; $i++) {
  258         643  
562 210 100       396 next unless defined $$self->[VALUES][$i];
563 203         475 $out .= $$self->[NAMES][$i] . ': ' . $$self->[VALUES][$i] . $newline;
564             }
565              
566 48         196 return $out;
567             }
568              
569 1     1 1 4 sub as_block { Encode::encode('UTF-8', _as_text(shift, WARC::CRLF)) }
570 47     47 1 5029 sub as_string { _as_text(shift, "\n") }
571              
572             =item $f-Eset_readonly
573              
574             Mark a C object read-only. All methods that modify the
575             object will croak() if called on a read-only object.
576              
577             =cut
578              
579             sub set_readonly {
580 732     732 1 1196 my $self = shift;
581              
582 732         1825 $$self->[IS_RO] = 1;
583             }
584              
585             =back
586              
587             =head2 Tied Array Access
588              
589             The order of fields can be fully controlled by tying an array to a
590             C object and manipulating the array using ordinary Perl
591             operations. The C and C functions are likely to be useful
592             for reordering array elements if desired.
593              
594             C will croak() if an attempt is made to set a field name with
595             a leading ':' using the tied array interface.
596              
597             =cut
598              
599             sub TIEARRAY {
600 4     4   27 my $class = shift;
601 4         7 my $ob = shift;
602              
603             # This method must ignore the given class to allow the "empty subclass"
604             # test to pass. If a subclass really wants, an override for TIEARRAY
605             # itself can call SUPER::TIEARRAY and re-bless the returned reference
606             # into the desired class.
607 4         7 $WARC::Fields::TiedArray::_total_tied++;
608 4         26 bless \ $ob, 'WARC::Fields::TiedArray';
609             }
610              
611             {
612             package WARC::Fields::TiedArray::Entry;
613              
614 26     26   243 use Carp;
  26         127  
  26         2339  
615              
616             BEGIN { $WARC::Fields::TiedArray::Entry::{$_} = $WARC::Fields::{$_}
617 26     26   1199 for WARC::Fields::OBJECT_INDEX; }
618              
619 26     26   175 use constant { NAME => 0, VALUE => 1, TABLE => 2, ROW => 3 };
  26         64  
  26         2458  
620              
621 26     26   179 use overload '""' => 'name', fallback => 1;
  26         58  
  26         154  
622              
623             =pod
624              
625             The tied array interface accepts simple string values but returns objects
626             with additional information. The returned object has an overloaded string
627             conversion that yields the name for that entry but additionally has
628             C and C methods.
629              
630             An entry object is bound to a slot in its parent C object,
631             but will be copied if it is assigned to another slot in the same or another
632             C object.
633              
634             Due to complex aliasing rules necessary for array slice assignment to work
635             for permuting rows in the table, entry objects must be short-lived.
636             Storing the object read from a tied array and attempting to use it after
637             modifying its parent C object produces unspecified results.
638              
639             =over
640              
641             =item $entry = $array[$n]
642              
643             =item $entry = $f-E[$n]
644              
645             The tied array C method returns a "entry object" instead of the name
646             itself.
647              
648             =cut
649              
650             sub _new {
651 223     223   304 my $class = shift;
652 223         278 my $table = shift;
653 223         350 my $row = shift;
654              
655 223         1010 bless [$$table->[NAMES][$row], $$table->[VALUES][$row],
656             $table, $row], $class;
657             }
658              
659             =item $name = "$entry"
660              
661             =item $name = $entry-Ename
662              
663             =item $name = "$f-E[$n]"
664              
665             =item $name = $f-E[$n]-Ename
666              
667             The C method on a entry object returns the field name.
668             String conversion is overloaded to call this method.
669              
670             =cut
671              
672 62     62   9060 sub name { (shift)->[NAME] }
673              
674             =item $value = $entry-Evalue
675              
676             =item $value = $array[$n]-Evalue
677              
678             =item $value = $f-E[$n]-Evalue
679              
680             =item $entry-Evalue( $new_value )
681              
682             =item $array[$n]-Evalue( $new_value )
683              
684             =item $f-E[$n]-Evalue( $new_value )
685              
686             The C method on a entry object returns the field value for this
687             particular entry. Only a single scalar is returned, even if multiple
688             entries share the same name.
689              
690             If given an argument, the C method replaces the value for this
691             particular entry. The argument will be coerced to a string.
692              
693             =cut
694              
695             sub value {
696 154     154   245 my $self = shift;
697              
698 154 100       278 if (scalar @_ == 0) { # get this value
699 34         154 return $self->[VALUE];
700             } else { # update this value
701 120 100       137 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  120         374  
702 119         174 my $newval = shift;
703 119         193 ${$self->[TABLE]}->[VALUES]->[$self->[ROW]] = $self->[VALUE] = "$newval";
  119         214  
704 119         292 return (); # and return nothing
705             }
706             }
707              
708             =item $offset = $entry-Eoffset
709              
710             =item $offset = $array[$n]-Eoffset
711              
712             =item $offset = $f-E[$n]-Eoffset
713              
714             The C method on a entry object returns the position of this entry
715             amongst multiple entries with the same field name. These positions are
716             numbered from zero and are identical to the positions in the array
717             reference returned for this entry's field name from the C method or
718             the tied hash interface.
719              
720             =cut
721              
722             sub offset {
723 21     21   42 my $self = shift;
724 21 100       28 $self->[TABLE]->_rebuild_MVOFF unless defined ${$self->[TABLE]}->[MVOFF];
  21         80  
725 21         32 return ${$self->[TABLE]}->[MVOFF]->[$self->[ROW]];
  21         113  
726             }
727              
728             =back
729              
730             =cut
731              
732             }
733              
734             {
735             package WARC::Fields::TiedArray::LooseEntry;
736              
737 26     26   9672 use Carp;
  26         68  
  26         2184  
738              
739             BEGIN { $WARC::Fields::TiedArray::LooseEntry::{$_} = $WARC::Fields::{$_}
740 26     26   1018 for WARC::Fields::OBJECT_INDEX; }
741              
742 26     26   189 use constant { NAME => 0, VALUE => 1 };
  26         63  
  26         2089  
743              
744 26     26   4934 BEGIN { our @ISA = qw(WARC::Fields::TiedArray::Entry) }
745              
746             # This is a special type of "entry object" that is not associated with a
747             # table row, returned from POP, SHIFT, and SPLICE when needed.
748              
749             sub _new {
750 24     24   401 my $class = shift;
751 24         31 my $name = shift;
752 24         34 my $value = shift;
753              
754 24         77 bless [$name, $value], $class;
755             }
756              
757 18     18   832 sub name { return (shift)->[NAME] }
758              
759             sub value {
760 17     17   27 my $self = shift;
761              
762 17 100       34 if (scalar @_ == 0) # get
763 16         38 { return $self->[VALUE] }
764             else # set
765 1         119 { croak "Loose array entries are read-only." }
766             }
767              
768 1     1   5 sub offset { return undef }
769             }
770              
771             {
772             package WARC::Fields::TiedArray;
773              
774 26     26   199 use Carp;
  26         59  
  26         2192  
775              
776             BEGIN { $WARC::Fields::TiedArray::{$_} = $WARC::Fields::{$_}
777 26     26   43650 for WARC::Fields::OBJECT_INDEX; }
778              
779             # The underlying object is a reference to a WARC::Fields object.
780              
781             sub FETCH {
782 223     223   8843 my $self = shift;
783 223         268 my $row = shift;
784 223         645 return (ref($self).'::Entry')->_new($$self, $row);
785             }
786              
787             sub STORE {
788 13     13   420 my $self = shift;
789 13         19 my $row = shift;
790 13         21 my $name = shift;
791              
792 13 100       140 croak "attempt to modify read-only object" if $$$self->[IS_RO];
793              
794 12 100       15 $self->STORESIZE($row + 1) if $#{$$$self->[NAMES]} < $row;
  12         37  
795              
796 12 100       64 if (UNIVERSAL::isa($name, ref($self).'::Entry')) {
797             # copy entry
798 8 100       21 croak "attempt to set invalid name"
799             if $name->name !~ m/^$PARSE_RE__token$/o;
800 7         18 $$$self->[NAMES]->[$row] = $name->name;
801 7         11 $$$self->[VALUES]->[$row] = $name->value;
802             } else {
803             # set name
804 4 100       369 croak "attempt to set invalid name"
805             if "$name" !~ m/^$PARSE_RE__token$/o;
806 2         10 $$$self->[NAMES]->[$row] = "$name";
807             }
808 9         14 $$$self->[MVOFF] = undef;
809 9         37 $$$self->[INDEX] = undef;
810             }
811              
812             sub FETCHSIZE {
813 38     38   4786 my $self = shift;
814 38         50 return scalar @{$$$self->[NAMES]};
  38         299  
815             }
816              
817             sub STORESIZE {
818 5     5   10 my $self = shift;
819 5         9 my $count = shift;
820              
821 5 100 100     18 croak "attempt to modify read-only object"
822             if $$$self->[IS_RO] && $count != $self->FETCHSIZE();
823              
824 4 100       12 if ($count > $self->FETCHSIZE()) {
    100          
825 2         7 my $needed = $count - $self->FETCHSIZE();
826 2         5 push @{$$$self->[NAMES]}, ('X-Undefined-Field-Name') x $needed;
  2         8  
827 2         3 push @{$$$self->[VALUES]}, (undef) x $needed;
  2         6  
828             } elsif ($count < $self->FETCHSIZE()) {
829 1         2 splice @{$$$self->[NAMES]}, $count;
  1         4  
830 1         2 splice @{$$$self->[VALUES]}, $count;
  1         4  
831 1         6 $$$self->[INDEX] = undef;
832 1         4 } else { return } # no actual change
833             }
834              
835       0     sub EXTEND {
836             # do nothing
837             }
838              
839             sub EXISTS {
840 14     14   2278 my $self = shift;
841 14         19 my $row = shift;
842 14         90 return defined $$$self->[VALUES]->[$row];
843             }
844              
845             sub DELETE {
846 2     2   6 my $self = shift;
847 2         3 my $row = shift;
848              
849 2 100       152 croak "attempt to modify read-only object" if $$$self->[IS_RO];
850              
851 1         3 my $old_value = $$$self->[VALUES]->[$row];
852 1         3 $$$self->[VALUES]->[$row] = undef;
853 1         4 $$$self->[MVOFF] = undef;
854 1         5 $$$self->[INDEX] = undef;
855 1         5 return $old_value;
856             }
857              
858             sub CLEAR {
859 3     3   10 my $self = shift;
860              
861 3 100       115 croak "attempt to modify read-only object" if $$$self->[IS_RO];
862              
863 2         7 $$$self->[NAMES] = [];
864 2         7 $$$self->[VALUES] = [];
865 2         7 $$$self->[MVOFF] = undef;
866 2         6 $$$self->[INDEX] = undef;
867 2         5 return undef;
868             }
869              
870             sub PUSH {
871 7     7   781 my $self = shift;
872              
873 7 100 100     137 croak "attempt to modify read-only object"
874             if $$$self->[IS_RO] && scalar @_;
875              
876 6         15 foreach my $item (@_) {
877 103         137 my $name; my $value;
878 103 100       336 if (UNIVERSAL::isa($item, ref($self).'::Entry'))
879 2         8 { $name = $item->name; $value = $item->value }
  2         8  
880             else
881 101         150 { $name = "$item"; $value = undef }
  101         128  
882 103 100       562 croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o;
883 101         137 push @{$$$self->[NAMES]}, $name;
  101         591  
884 101         137 push @{$$$self->[VALUES]}, $value;
  101         294  
885             }
886 4         9 $$$self->[MVOFF] = undef;
887 4         120 $$$self->[INDEX] = undef;
888 4         8 return scalar @{$$$self->[NAMES]};
  4         17  
889             }
890              
891             sub POP {
892 5     5   11 my $self = shift;
893              
894 5 100       126 croak "attempt to modify read-only object" if $$$self->[IS_RO];
895              
896             my $ret = WARC::Fields::TiedArray::LooseEntry->_new
897 4         6 (pop @{$$$self->[NAMES]}, pop @{$$$self->[VALUES]});
  4         9  
  4         13  
898 4 100       12 pop @{$$$self->[MVOFF]} if defined $$$self->[MVOFF];
  2         5  
899 4         10 $$$self->[INDEX] = undef;
900              
901 4         14 return $ret;
902             }
903              
904             sub SHIFT {
905 2     2   4 my $self = shift;
906              
907 2 100       115 croak "attempt to modify read-only object" if $$$self->[IS_RO];
908              
909             my $ret = WARC::Fields::TiedArray::LooseEntry->_new
910 1         3 (shift @{$$$self->[NAMES]}, shift @{$$$self->[VALUES]});
  1         4  
  1         4  
911 1         4 $$$self->[MVOFF] = undef;
912 1         2 $$$self->[INDEX] = undef;
913              
914 1         4 return $ret;
915             }
916              
917             sub UNSHIFT {
918 6     6   722 my $self = shift;
919              
920 6 100 100     129 croak "attempt to modify read-only object"
921             if $$$self->[IS_RO] && scalar @_;
922              
923 5         13 foreach my $item (@_) {
924 4         6 my $name; my $value;
925 4 100       24 if (UNIVERSAL::isa($item, ref($self).'::Entry'))
926 2         6 { $name = $item->name; $value = $item->value }
  2         6  
927             else
928 2         4 { $name = "$item"; $value = undef }
  2         4  
929 4 100       269 croak "attempt to set invalid name" if $name !~ m/^$PARSE_RE__token$/o;
930 2         4 unshift @{$$$self->[NAMES]}, $name;
  2         7  
931 2         4 unshift @{$$$self->[VALUES]}, $value;
  2         8  
932             }
933 3         8 $$$self->[MVOFF] = undef;
934 3         7 $$$self->[INDEX] = undef;
935 3         3 return scalar @{$$$self->[NAMES]};
  3         9  
936             }
937              
938             sub SPLICE {
939 10     10   773 my $self = shift;
940 10         17 my $offset = shift;
941 10         15 my $length = shift;
942              
943 10 100       27 $offset = 0 unless defined $offset;
944 10 100       27 $length = $self->FETCHSIZE() - $offset unless defined $length;
945              
946 10 100 100     38 return () unless ($length != 0 || scalar @_);
947              
948 9 100       270 croak "attempt to modify read-only object" if $$$self->[IS_RO];
949              
950 7         15 my @new_names = (); my @new_values = (); my @old_names; my @old_values;
  7         9  
  7         15  
951              
952 7         14 foreach my $item (@_) {
953 17 100       61 if (UNIVERSAL::isa($item, ref($self).'::Entry')) {
954 11         20 push @new_names, $item->name;
955 11         19 push @new_values, $item->value;
956             } else {
957 6         13 push @new_names, "$item";
958 6         17 push @new_values, undef;
959             }
960             }
961              
962             croak "attempt to set invalid name"
963 7 100       12 if grep { $_ !~ m/^$PARSE_RE__token$/o } @new_names;
  17         318  
964              
965 5         8 @old_names = splice @{$$$self->[NAMES]}, $offset, $length, @new_names;
  5         20  
966 5         10 @old_values = splice @{$$$self->[VALUES]}, $offset, $length, @new_values;
  5         15  
967              
968 5         11 my @ret = ();
969              
970 5         14 for (my $i = 0; $i < scalar @old_names; $i++)
971 18         37 { push @ret, WARC::Fields::TiedArray::LooseEntry->_new
972             ($old_names[$i], $old_values[$i]) }
973              
974 5         12 $$$self->[MVOFF] = undef;
975 5         12 $$$self->[INDEX] = undef;
976 5         26 return @ret;
977             }
978              
979 4     4   10 sub UNTIE { our $_total_untied; $_total_untied++ }
  4         18  
980              
981 4     4   6 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  4         17  
982             }
983              
984             =head2 Tied Hash Access
985              
986             The contents of a C object can be easily examined by tying a
987             hash to the object. Reading or setting a hash key is equivalent to the
988             C method, but the tied hash will iterate keys and values in the
989             order in which each key B appears in the internal table.
990              
991             Like the tied array interface, the tied hash interface returns magical
992             objects that internally refer back to the parent C object.
993             These objects remain valid if the underlying C object is
994             changed, but further use may produce surprising and unspecified results.
995              
996             The use of magical objects enables the values in a tied hash to B
997             be arrays, even for keys that do not exist (the array will have zero
998             elements) or that have only one value (the array will have a string
999             conversion that produces that one value). This allows a tied hash to
1000             support autovivification of an array value just as Perl's own hashes do.
1001              
1002             =cut
1003              
1004             sub TIEHASH {
1005 68     68   122 my $class = shift;
1006 68         88 my $ob = shift;
1007              
1008             # This method must ignore the given class to allow the "empty subclass"
1009             # test to pass. If a subclass really wants, an override for TIEHASH
1010             # itself can call SUPER::TIEHASH and re-bless the returned reference
1011             # into the desired class.
1012 68         103 $WARC::Fields::TiedHash::_total_tied++;
1013 68         359 bless \ $ob, 'WARC::Fields::TiedHash';
1014             }
1015              
1016             {
1017             package WARC::Fields::TiedHash::ValueArray;
1018              
1019 26     26   276 use Carp;
  26         55  
  26         2202  
1020              
1021             BEGIN { $WARC::Fields::TiedHash::ValueArray::{$_} = $WARC::Fields::{$_}
1022 26     26   1046 for WARC::Fields::OBJECT_INDEX; }
1023              
1024 26     26   194 use constant { TABLE => 0, KEY => 1, KEYc => 2 }; # KEYc -- canonical KEY
  26         69  
  26         55144  
1025              
1026             sub TIEARRAY {
1027 458     458   671 my $class = shift;
1028 458         626 my $table = shift;
1029 458 100       996 $table->_rebuild_INDEX unless defined $$table->[INDEX];
1030 458         869 my $key = $table->_find_key(shift); # needs INDEX
1031              
1032 458         741 { our $_total_tied; $_total_tied++ }
  458         552  
  458         619  
1033 458         1569 bless [$table, $key, lc $key], $class;
1034             }
1035              
1036             sub FETCH {
1037 768     768   1672 my $self = shift;
1038 768         936 my $offset = shift;
1039              
1040 768 100       908 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  768         1669  
1041              
1042 768         990 my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  768         1442  
1043 768 100       1279 return defined $row ? ${$self->[TABLE]}->[VALUES][$row] : undef;
  752         3449  
1044             }
1045              
1046             sub STORE {
1047 10     10   543 my $self = shift;
1048 10         14 my $offset = shift;
1049 10         17 my $value = shift;
1050              
1051 10         13 my $T = $self->[TABLE];
1052              
1053 10 100       154 croak "attempt to modify read-only object" if $$T->[IS_RO];
1054              
1055 9 100       21 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1056              
1057             $self->STORESIZE($offset + 1)
1058             if not defined $$T->[INDEX]{$self->[KEYc]}
1059 9 100 100     29 or $#{$$T->[INDEX]{$self->[KEYc]}} < $offset;
  8         30  
1060              
1061 9         40 $$T->[VALUES][$$T->[INDEX]{$self->[KEYc]}[$offset]] = "$value";
1062             }
1063              
1064             sub FETCHSIZE {
1065 792     792   1627 my $self = shift;
1066              
1067 792 100       1021 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  792         1942  
1068              
1069 782         995 return scalar @{${$self->[TABLE]}->[INDEX]{$self->[KEYc]}}
  782         3321  
1070 792 100       1115 if defined ${$self->[TABLE]}->[INDEX]{$self->[KEYc]};
  792         1864  
1071 10         77 return 0; # otherwise: key does not exist
1072             }
1073              
1074             sub STORESIZE {
1075 8     8   14 my $self = shift;
1076 8         13 my $count = shift;
1077              
1078 8         10 my $T = $self->[TABLE];
1079              
1080 8 100 100     28 croak "attempt to modify read-only object"
1081             if $$T->[IS_RO] && $count != $self->FETCHSIZE();
1082              
1083 7 100       22 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1084              
1085 7         13 my @new = ();
1086 6         21 @new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}]
  6         14  
1087 7 100       17 if defined $$T->[INDEX]{$self->[KEYc]};
1088 7 100       16 if ($count > $self->FETCHSIZE())
    100          
1089 4         11 { push @new, (undef) x ($count - $self->FETCHSIZE()) }
1090             elsif ($count < $self->FETCHSIZE())
1091 2         10 { @new = @new[0..($count-1)] }
1092 1         4 else { return } # no actual change
1093 6         19 $T->field($self->[KEY] => \@new);
1094             }
1095              
1096       0     sub EXTEND {
1097             # do nothing
1098             }
1099              
1100             sub EXISTS {
1101 5     5   535 my $self = shift;
1102 5         8 my $offset = shift;
1103              
1104 5 100       7 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  5         24  
1105              
1106 5         11 return exists ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  5         33  
1107             }
1108              
1109             sub DELETE {
1110 4     4   528 my $self = shift;
1111 4         8 my $offset = shift;
1112              
1113 4 100       6 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  4         121  
1114              
1115 3 100       15 $self->[TABLE]->_rebuild_INDEX unless defined ${$self->[TABLE]}->[INDEX];
  3         15  
1116              
1117 3         6 my $row = ${$self->[TABLE]}->[INDEX]{$self->[KEYc]}[$offset];
  3         10  
1118 3         5 my $old_value = ${$self->[TABLE]}->[VALUES][$row];
  3         10  
1119 3         6 ${$self->[TABLE]}->[VALUES][$row] = undef;
  3         7  
1120 3         12 return $old_value;
1121             }
1122              
1123             sub CLEAR {
1124 2     2   4 my $self = shift;
1125              
1126 2 100       5 croak "attempt to modify read-only object" if ${$self->[TABLE]}->[IS_RO];
  2         115  
1127              
1128 1         4 $self->[TABLE]->field($self->[KEY] => []);
1129             }
1130              
1131             sub PUSH {
1132 11     11   24 my $self = shift;
1133              
1134 11         16 my $T = $self->[TABLE];
1135              
1136 11 100 100     143 croak "attempt to modify read-only object"
1137             if $$T->[IS_RO] && scalar @_;
1138              
1139 10 100       26 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1140              
1141 10 100       25 if (defined $$T->[INDEX]{$self->[KEYc]}) {
1142             # key exists ==> extend table efficiently
1143 5         31 my $last_row = $$T->[INDEX]{$self->[KEYc]}[-1];
1144 5         10 my $new_rows = scalar @_;
1145 5         8 splice @{$$T->[NAMES]}, 1+$last_row, 0,
  5         21  
1146             (($$T->[NAMES][$last_row]) x $new_rows);
1147 5         7 splice @{$$T->[VALUES]}, 1+$last_row, 0, map {"$_"} @_;
  5         13  
  4         14  
1148 5         19 $T->_update_INDEX($last_row, $new_rows);
1149 5         6 push @{$$T->[INDEX]{$self->[KEYc]}}, 1+$last_row .. $last_row+$new_rows;
  5         19  
1150             } else {
1151             # make key ==> use existing setter
1152 5         27 $T->_set_multiple_value($self->[KEY], [map {"$_"} @_]);
  9909         18988  
1153             }
1154 10         1320 $$T->[MVOFF] = undef;
1155             }
1156              
1157             sub POP {
1158 7     7   566 my $self = shift;
1159              
1160 7         14 my $T = $self->[TABLE];
1161              
1162 7 100       126 croak "attempt to modify read-only object" if $$T->[IS_RO];
1163              
1164 6 100       16 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1165              
1166 6 100       21 return undef unless defined $$T->[INDEX]{$self->[KEYc]};
1167              
1168 5         10 my $rem_row = $$T->[INDEX]{$self->[KEYc]}[-1];
1169 5         9 my $value = $$T->[VALUES][$rem_row];
1170              
1171 5         8 splice @{$$T->[NAMES]}, $rem_row, 1;
  5         12  
1172 5         7 splice @{$$T->[VALUES]}, $rem_row, 1;
  5         11  
1173 5 100       15 splice @{$$T->[MVOFF]}, $rem_row, 1 if defined $$T->[MVOFF];
  1         3  
1174 5         15 $T->_update_INDEX($rem_row, -1);
1175 5         8 pop @{$$T->[INDEX]{$self->[KEYc]}};
  5         10  
1176             # special case: popped last value
1177             delete $$T->[INDEX]{$self->[KEYc]}
1178 5 100       10 if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0;
  5         14  
1179              
1180 5         36 return $value;
1181             }
1182              
1183             sub SHIFT {
1184 6     6   526 my $self = shift;
1185              
1186 6         12 my $T = $self->[TABLE];
1187              
1188 6 100       123 croak "attempt to modify read-only object" if $$T->[IS_RO];
1189              
1190 5 100       15 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1191              
1192 5 100       18 return undef unless defined $$T->[INDEX]{$self->[KEYc]};
1193              
1194 4         10 my $rem_row = $$T->[INDEX]{$self->[KEYc]}[0];
1195 4         6 my $value = $$T->[VALUES][$rem_row];
1196              
1197 4         6 splice @{$$T->[NAMES]}, $rem_row, 1;
  4         8  
1198 4         8 splice @{$$T->[VALUES]}, $rem_row, 1;
  4         6  
1199 4         9 $$T->[MVOFF] = undef;
1200 4         11 $T->_update_INDEX($rem_row, -1);
1201 4         8 shift @{$$T->[INDEX]{$self->[KEYc]}};
  4         8  
1202             # special case: shifted last value
1203             delete $$T->[INDEX]{$self->[KEYc]}
1204 4 100       6 if scalar @{$$T->[INDEX]{$self->[KEYc]}} == 0;
  4         13  
1205              
1206 4         20 return $value;
1207             }
1208              
1209             sub UNSHIFT {
1210 5     5   10 my $self = shift;
1211              
1212 5         11 my $T = $self->[TABLE];
1213              
1214 5 100 100     122 croak "attempt to modify read-only object"
1215             if $$T->[IS_RO] && scalar @_;
1216              
1217 4 100       14 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1218              
1219 4 100       15 if (defined $$T->[INDEX]{$self->[KEYc]}) {
1220             # key exists ==> extend table efficiently
1221 3         9 my $first_row = $$T->[INDEX]{$self->[KEYc]}[0];
1222 3         6 my $new_rows = scalar @_;
1223 3         4 splice @{$$T->[NAMES]}, $first_row, 0,
  3         13  
1224             (($$T->[NAMES][$first_row]) x $new_rows);
1225 3         6 splice @{$$T->[VALUES]}, $first_row, 0, map {"$_"} @_;
  3         10  
  2         9  
1226 3         10 $T->_update_INDEX($first_row - 1, $new_rows);
1227 3         6 unshift @{$$T->[INDEX]{$self->[KEYc]}},
  3         11  
1228             $first_row .. $first_row-1+$new_rows;
1229             } else {
1230             # make key ==> use existing setter
1231 1         5 $T->_set_multiple_value($self->[KEY], [map {"$_"} @_]);
  4         11  
1232             }
1233 4         15 $$T->[MVOFF] = undef;
1234             }
1235              
1236             sub SPLICE {
1237 9     9   18 my $self = shift;
1238 9         12 my $offset = shift;
1239 9         14 my $length = shift;
1240              
1241 9 100       22 $offset = 0 unless defined $offset;
1242 9 100       19 $length = $self->FETCHSIZE() - $offset unless defined $length;
1243              
1244 9 100 100     36 return () unless ($length != 0 || scalar @_);
1245              
1246 8         14 my $T = $self->[TABLE];
1247              
1248 8 100       225 croak "attempt to modify read-only object" if $$T->[IS_RO];
1249              
1250 6 100       17 $T->_rebuild_INDEX unless defined $$T->[INDEX];
1251              
1252 6         11 my @new = ();
1253 5         16 @new = @{$$T->[VALUES]}[@{$$T->[INDEX]{$self->[KEYc]}}]
  5         11  
1254 6 100       17 if defined $$T->[INDEX]{$self->[KEYc]};
1255 6         16 my @old = splice @new, $offset, $length, map {"$_"} @_;
  3         11  
1256 6         21 $self->[TABLE]->field($self->[KEY] => \@new);
1257              
1258 6         41 return @old;
1259             }
1260              
1261 458     458   571 sub UNTIE { our $_total_untied; $_total_untied++ }
  458         851  
1262              
1263 458     458   559 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  458         1461  
1264             }
1265              
1266             {
1267             package WARC::Fields::TiedHash::Value;
1268              
1269             # This package is a magical array that appears to be a string if it has
1270             # only one value, otherwise string conversion gives the usual
1271             # nearly-useless debugging value.
1272              
1273             # The actual underlying array is a tied array that forwards mutating
1274             # operations to the original WARC::Fields object.
1275              
1276 26     26   235 use overload '""' => '_as_string', fallback => 1;
  26         122  
  26         177  
1277              
1278 26     26   2221 use Scalar::Util qw/refaddr reftype/;
  26         117  
  26         5308  
1279              
1280             sub _new {
1281 458     458   651 my $class = shift;
1282 458         661 my $parent = shift;
1283 458         624 my $key = shift;
1284              
1285 458         631 my @values;
1286 458         1342 tie @values, (ref($parent).'::ValueArray'), $$parent, $key;
1287              
1288 458         2492 bless \@values, $class;
1289             }
1290              
1291             sub _as_string {
1292 680     680   12249 my $self = shift;
1293              
1294 680 100       1905 return scalar @$self == 1
1295             ? $self->[0] : sprintf ('%s(0x%x)', reftype $self, refaddr $self);
1296             }
1297              
1298 458     458   2844 sub DESTROY { untie @{(shift)} }
  458         1036  
1299             }
1300              
1301             {
1302             package WARC::Fields::TiedHash;
1303              
1304 26     26   213 use Carp;
  26         192  
  26         2278  
1305              
1306             BEGIN { $WARC::Fields::TiedHash::{$_} = $WARC::Fields::{$_}
1307 26     26   22599 for WARC::Fields::OBJECT_INDEX; }
1308              
1309             # The underlying object is a reference to a WARC::Fields object.
1310              
1311             sub FETCH {
1312 461     461   1971 my $self = shift;
1313 461         656 my $key = shift;
1314 461 100       2302 croak "reference to invalid field name" if $key !~ m/^$PARSE_RE__token$/o;
1315 458         1451 return (ref($self).'::Value')->_new($self, $key);
1316             }
1317              
1318             sub STORE {
1319 11     11   385 my $self = shift;
1320 11         18 my $key = shift;
1321 11         17 my $value = shift;
1322              
1323 11 100       246 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1324              
1325 9         25 $$self->field($key => $value);
1326             }
1327              
1328             sub DELETE {
1329 3     3   6 my $self = shift;
1330 3         5 my $key = shift;
1331              
1332 3 100       120 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1333              
1334 2         7 $$self->field($key => []);
1335             }
1336              
1337             sub CLEAR {
1338 2     2   5 my $self = shift;
1339              
1340 2 100       113 croak "attempt to modify read-only object" if $$$self->[IS_RO];
1341              
1342 1         4 $$$self->[NAMES] = [];
1343 1         14 $$$self->[VALUES] = [];
1344 1         3 $$$self->[MVOFF] = undef;
1345 1         4 $$$self->[INDEX] = undef;
1346 1         4 return undef;
1347             }
1348              
1349             sub EXISTS {
1350 15     15   36 my $self = shift;
1351 15         21 my $key = shift;
1352              
1353 15 100       49 $$self->_rebuild_INDEX unless defined $$$self->[INDEX];
1354 15         42 return exists $$$self->[INDEX]->{$$self->_find_key($key)};
1355             }
1356              
1357             sub FIRSTKEY {
1358 61     61   98 my $self = shift;
1359              
1360 61         219 return $$$self->[NAMES][0];
1361             }
1362              
1363             sub NEXTKEY {
1364 307     307   452 my $self = shift;
1365 307         389 my $from_key = shift;
1366              
1367 307 100       633 $$self->_rebuild_INDEX unless defined $$$self->[INDEX];
1368              
1369 307         369 my $i;
1370 307   100     542 for ($i = $$$self->[INDEX]{$$self->_find_key($from_key)}[0] + 1;
1371             defined $$$self->[NAMES][$i] and
1372             $i != $$$self->[INDEX]{$$self->_find_key($$$self->[NAMES][$i])}[0];
1373             $i++) {}
1374 307         1093 return $$$self->[NAMES][$i];
1375             }
1376              
1377             sub SCALAR {
1378 3     3   10 my $self = shift;
1379 3         5 return scalar @{$$$self->[NAMES]};
  3         17  
1380             }
1381              
1382 68     68   87 sub UNTIE { our $_total_untied; $_total_untied++ }
  68         160  
1383              
1384 68     68   91 sub DESTROY { our $_total_destroyed; $_total_destroyed++ }
  68         132  
1385             }
1386              
1387             =head2 Overloaded Dereference Operators
1388              
1389             The C class provides overloaded dereference operators for
1390             array and hash dereferencing. The overloaded operators provide an
1391             anonymous tied array or hash as needed, allowing the object itself to be
1392             used as a reference to its tied array and hash interfaces. There is a
1393             caveat, however, so read on.
1394              
1395             =cut
1396              
1397             sub _as_tied_array {
1398             # To avoid confusing bugs due to typos producing overloaded dereferences
1399             # instead of intended accesses to the internal object, this feature
1400             # cannot be used from within this module.
1401 168 100   168   7436 if (scalar caller =~ m/^WARC::Fields/) {
1402 1         4 local $Carp::CarpLevel = 1;
1403 1         138 confess "overloaded array dereference in internal code"
1404             }
1405              
1406 167         251 my $self = shift;
1407              
1408 167 100       696 return $$self->[C_TA] if defined $$self->[C_TA];
1409              
1410 3         6 my @array; $$self->[C_TA] = \@array;
  3         7  
1411 3         6 Scalar::Util::weaken ${tie @array, ref $self, $self};
  3         14  
1412 3         14 return $$self->[C_TA];
1413             }
1414              
1415             sub _as_tied_hash {
1416             # To avoid confusing bugs due to typos producing overloaded dereferences
1417             # instead of intended accesses to the internal object, this feature
1418             # cannot be used from within this module.
1419 266 100   266   8251 if (scalar caller =~ m/^WARC::Fields/) {
1420 1         4 local $Carp::CarpLevel = 1;
1421 1         71 confess "overloaded hash dereference in internal code"
1422             }
1423              
1424 265         401 my $self = shift;
1425              
1426 265 100       1203 return $$self->[C_TH] if defined $$self->[C_TH];
1427              
1428 61         91 my %hash; $$self->[C_TH] = \%hash;
  61         110  
1429 61         92 Scalar::Util::weaken ${tie %hash, ref $self, $self};
  61         256  
1430 61         306 return $$self->[C_TH];
1431             }
1432              
1433             =head3 Reference Count Trickery with Overloaded Dereference Operators
1434              
1435             To avoid problems, the underlying tied object is a reference to the parent
1436             object. For ordinary use of C, this is a strong reference, however,
1437             the anonymous tied array and hash are cached in the object to avoid having
1438             to C a new object every time the dereference operators are used.
1439              
1440             To prevent memory leaks due to circular references, the overloaded
1441             dereference operators tie a I reference to the parent object. The
1442             tied aggregate always holds a strong reference to its object, but when the
1443             dereference operators are used, that inner object is a I reference to
1444             the actual C object.
1445              
1446             The caveat is thus: do not attempt to save a reference to the array or hash
1447             produced by dereferencing a C object. The parent
1448             C object must remain in scope for as long as any anonymous
1449             tied aggregates exist.
1450              
1451             =cut
1452              
1453             1;
1454             __END__