File Coverage

blib/lib/File/ANVL.pm
Criterion Covered Total %
statement 181 274 66.0
branch 66 160 41.2
condition 25 120 20.8
subroutine 22 27 81.4
pod 0 17 0.0
total 294 598 49.1


line stmt bran cond sub pod time code
1             package File::ANVL;
2              
3             # XXXXXxxxx make adding a value policy-driven, eg,
4             # "add" could mean (a) replace, (b) push on end array,
5             # (c) push on start of array, (d) string concatenation,
6             # (d) error.
7              
8 4     4   139546 use 5.006;
  4         13  
  4         187  
9 4     4   24 use strict;
  4         6  
  4         144  
10 4     4   20 use warnings;
  4         14  
  4         152  
11              
12 4     4   18 use constant NL => "\n";
  4         7  
  4         345  
13              
14             # ANVL flavors
15             #
16 4     4   18 use constant ANVL => 1;
  4         8  
  4         159  
17 4     4   20 use constant ANVLR => 2;
  4         6  
  4         160  
18 4     4   18 use constant ANVLS => 3;
  4         7  
  4         14216  
19              
20             our $VERSION;
21             $VERSION = sprintf "%d.%02d", q$Name: Release-1-05 $ =~ /Release-(\d+)-(\d+)/;
22              
23             require Exporter;
24             our @ISA = qw(Exporter);
25              
26             our @EXPORT = qw();
27              
28             our @EXPORT_OK = qw(
29             anvl_recarray anvl_arrayhash
30             anvl_name_naturalize
31             anvl_rechash anvl_valsplit
32             erc_anvl_expand_array kernel_labels
33             xgetlines trimlines
34             make_get_anvl
35             anvl_opt_defaults anvl_decode anvl_om
36              
37             anvl_encode anvl_recsplit
38              
39             ANVL ANVLR ANVLS ANVLSH
40             );
41              
42             our %EXPORT_TAGS = (all => [ @EXPORT_OK ]);
43              
44             # All these symbols must be listed also in EXPORT_OK (?)
45             #
46             our @EXPORT_FAIL = qw(
47             ANVL ANVLR ANVLS ANVLSH
48             );
49              
50             our $anvl_mode = 'ANVL'; # default mode
51              
52             # This is a magic routine that the Exporter calls for any unknown symbols.
53             #
54 3     3 0 490 sub export_fail { my( $class, @symbols )=@_;
55              
56 3         14 $anvl_mode = $_ for (@symbols);
57 3         2767 return ();
58             }
59              
60             # Initialize or re-initialize options to factory defaults.
61             #
62             sub anvl_opt_defaults { return {
63              
64             # Input options
65             #
66 7     7 0 61 autoindent => 1, # yes, fix recoverably bad indention
67             comments => 0, # no, don't parse comments
68             elemsproc => # to expand short form ERCs (if any)
69             \&File::ANVL::erc_anvl_expand_array,
70             elemsprocpat => # no call from anvl_om if no match
71             qr/^erc:/m, # in rec; no call if set and matches
72             };
73             }
74              
75             # xxx decide on good name for short form and long form ERC
76              
77             # Returns a closure that calls an input reader with that's set to *ARGV
78             # by default. If $reader and $readee are defined, they are stored in the
79             # closure and all reads will be performed by calling &$reader($readee).
80             #
81             # The default reader collects text lines from a file and returns all the
82             # lines associated with the next "record", which is considered to start
83             # wherever the read pointer happens to be and continues to the first two
84             # blank lines encountered that occur after "substance" is detected.
85             # Substance is defined to be at least one non-whitespace character
86             # occurring on a non-comment line. Comment and blank lines that precede
87             # a record with substance are returned, but any such lines that follow
88             # that the final record are discarded.
89             #
90 1     1 0 389 sub make_get_anvl { my( $reader, $readee ) = shift;
91              
92 1 50       6 unless ($reader) {
93              
94 1         3 my $rec; # returned record
95             my $s; # next increment of input
96 0         0 my $substance; # boolean detecting substance
97              
98 4     4   1393 return sub { my( $filehandle ) = shift;
99              
100             # Returns a subroutine, call it get_anvl()
101             #
102             # Usage: $record = get_anvl( [$filehandle] );
103             #
104             # It reads ANVL input records as text lines from the
105             # file given by $filehandle (*ARGV by default, which can
106             # process multiple files via while loop magic). Usually,
107             # the closure holds enough state information, set up by
108             # make_get_anvl(), that get_anvl() can be called without
109             # arguments. get_anvl() returns the record read as a
110             # string, or returns undef on end of input or error.
111             #
112 4   33     18 $filehandle ||= *ARGV;
113 4         22 local $/ = NL.NL; # a kind of "paragraph" input mode
114             # $/ === $INPUT_RECORD_SEPARATOR
115 4         6 $rec = '';
116 4   66     118 1 while (
      100        
117             defined($s = <$filehandle>) and # read to eof and
118             ($rec .= $s), # save everything, but stop
119             $substance = # when we detect substance, ie,
120             $s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m,
121             ! $substance # non-comment with non-space
122             );
123 4 100       24 return $substance ?
124             $rec : # return either collected record or undef
125             undef; # any final blank or comment lines are tossed
126              
127             # yyy If more than one file, line numbers normally accumulate
128             # across files. Should we preserve line numbers within each
129             # files? (If so, use "close ARGV" (Perl idiom) to cause $.
130             # (linenum) to be reset between files.
131 1         9 };
132             }
133              
134             # If we get here, $reader should reference an input method and
135             # $readee is assumed to be any value (eg, BDB handle) that may
136             # permit &reader to get the next record. Any other arguments
137             # passed to the get_anvl() function below will be passed along
138             # too, ie, $reader($readee, @_).
139             #
140 0 0       0 ref($reader) eq "CODE" or return undef;
141              
142 0         0 my $rec; # returned record
143             my $s; # next increment of input
144 0         0 my $substance; # boolean detecting substance
145              
146             return sub {
147 0     0   0 $rec = '';
148 0   0     0 1 while (
      0        
149             # XXX should this accumulate in general??? or
150             # should we leave it to the definer of $reader?
151             defined($s = &reader($readee, @_)) and # read and
152             ($rec .= $s), # save everything, but stop
153             $substance = # when we detect substance, ie,
154             $s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m,
155             ! $substance # non-comment with non-space
156             );
157 0 0       0 return $substance ?
158             $rec : # return either collected record or undef
159             undef; # any final blank or comment lines are tossed
160 0         0 };
161             }
162              
163             # XXX deprecated! see sub make_get_anvl
164 0     0 0 0 sub xgetlines { my( $filehandle )=@_;
165              
166 0         0 my $rec = ''; # returned record
167 0         0 my $s; # next increment of input
168 0         0 local $/ = NL.NL; # a kind of "paragraph" input mode
169             # $/ === $INPUT_RECORD_SEPARATOR
170              
171             # If $filehandle is specified, use the Perl <$filehandle> idiom to
172             # return next unit of input (normally a line, but here a para).
173             #
174 0   0     0 $filehandle ||= *ARGV;
175 0   0     0 1 while ( defined( $s = <$filehandle> ) and # read up to two \n's
      0        
176              
177             # If we get here, $s now contains a block to save.
178             #
179             $rec .= $s,
180              
181             # We continue reading only if there's no substance,
182             # ie, no line read starts with a non-comment and no
183             # non-comment line read contains non-whitespace
184             #
185             #$s !~ /^[^#\s]/m and # if no line read starts with
186             # $s !~ /^[^#].*\S/m # or contains substance
187             #(! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m) and
188             # $rec .= "substance found in <$s>\n"),
189             ! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m)
190             );
191              
192             #$s !~ /^\s*[^#\s]/m || # match no line of susbstance
193             #$rec .= $s
194              
195             #($rec .= $s), # only "paragraphs"; save everything
196             #$s !~ /^\s*[^#\s]/m # but stop when substance seen
197              
198             #$s !~ /^\s*[^#\s]/m # but stop when substance seen
199             # and while $s matches no line starting with ^#
200             # while every line in $s is either all whitespace
201             # or all comment (ie, first non-ws char is #)
202             #$s =~ /\S/ # but stop when we see substance
203             #$s !~ /\S/ # but stop when we see substance
204             # substance means \S on a non-comment line
205             # $s !~ /^\S|[^#].*\S/m
206             # ! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m)
207             #
208             #); # /^\s*[^#\s]/m
209 0 0 0     0 defined($s) or
210             return $rec || undef; # almost eof or real eof
211 0         0 return $rec;
212              
213             # XXXX what happens when one file ends prematurely and
214             # another begins? does last record for first file get
215             # returned glued to beginning of first recond of 2nd file?
216             # If more than one file, line numbers normally just accumulate.
217             # We want to preserve line numbers within files, so we use this
218             # next Perl idiom to cause $. (linenum) to be reset between files.
219             #
220             #close ARGV if eof; # reset line numbers between files
221             }
222              
223             # args: record, reference to whitespace lines, reference to real record lines
224             # xxx replace \n with NL throughout
225             # returns undef when $rec trims to nothing (EOF)
226 5     5 0 1076 sub trimlines { my( $rec, $r_wslines, $r_rrlines )=@_;
227              
228             # $rec might legitimately be undefined if called as
229             # trimlines(getlines(), ...)
230             #
231 5   100     15 $rec ||= '';
232              
233 5         19 $rec =~ s/^(\s*)//; # '*' guarantees $1 will be defined
234 5         14 my $blanksection = $1;
235 5         7 my @newlines;
236              
237 5 50       32 ref($r_wslines) eq 'SCALAR' and # if given, define it
238             $$r_wslines = scalar(@newlines = $blanksection =~ /\n/g);
239              
240 5 50       42 ref($r_rrlines) eq 'SCALAR' and # if given, define it
241             $$r_rrlines = scalar(@newlines = $rec =~ /\n/g);
242              
243             #$$r_rrlines = scalar($rec =~ /$/gm); # xxx why doesn't this work?
244              
245             # At this point $r_wslines and $r_rrlines (if supplied) are safely
246             # defined and ready for return.
247             #
248 5 100       20 $rec or # empty record (but $r_wslines may be defined)
249             return undef; # signal eof-style return
250              
251             #$rec =~ /\n\n$/ and # ok record ending -- this is
252             # return $rec; # the usual return
253             #$rec =~ s/\n*$/\n\n/; # normalize premature eof ending
254 3         9 return $rec;
255             }
256              
257             # returns empty string on success or string beginning "warning:..."
258             # third arg (0 or 1) optional
259             # elems is returned array of name value pairs
260             #DEPRECATED:
261 1     1 0 1 sub anvl_recsplit { my( $record, $r_elems, $strict )=@_;
262              
263 1 50       3 ! defined($record) and
264             return "needs an ANVL record";
265 1 50       5 ref($r_elems) ne "ARRAY" and
266             return "2nd arg must reference an array";
267              
268 1         1 my $strict_default = 0;
269 1 50       3 ! defined($strict) and
270             $strict = $strict_default;
271              
272 1         2 local $_ = $record;
273 1         5 s/^\s*//; s/\s*$//; # trim both ends
  1         5  
274 1 50       5 /\n$/ or s/$/\n/; # normalize end of record to \n
275              
276 1 50       5 /\n\n/ and
277             return "record should have no internal blank line(s)";
278             # xxx adjust regexp for ANVLR
279 1 50       5 ! /^[^\s:][\w ]*:/ and # match against first element
280             return "well-formed record begins with a label and colon";
281              
282 1 50       5 $anvl_mode ne ANVLR and
283             s/^#.*\n//gm; # remove comments plus final \n
284              
285             # If we're not in strict parse mode, correct for common error
286             # where continued value is not indented. We can pretty safely
287             # assume a continued value if a line is flush left and contains
288             # no colon at all.
289             #
290             # This next substitution match needs to be multi-line to avoid
291             # more explicit looping.
292             #
293             # XXX there's probably a more efficient way to do this.
294 1         2 my $msg = "";
295 1         5 my $indented = s/^([^\s:][^:]*)$/ $1/gm;
296 1 50       3 if ($indented) {
297 0 0       0 $strict and
298             (@$r_elems = undef),
299             return "error: $indented unindented value line(s)";
300 0         0 $msg = "warning: indenting $indented value line(s)";
301             }
302             # if we get here, assume standard continuation lines, and join them
303             # (GRANVL style)
304             #
305 1         3 s/\n\s+/ /g;
306             # XXX should have a newline-preserving form of parse?
307              
308             # Split into array element pairs. Toss first "false" split.
309             # xxx buggy limited patterns, how not to match newline
310              
311             # This is the critical splitting step.
312             # splits line beginning ..... xxx
313             #
314 1         4 s/\n$//; # strip final \n
315 1         7 (undef, @$r_elems) = split /\n*^([^\s:][\w ]*):\s*/m;
316              
317 1         7 return $msg;
318             }
319              
320             # xxxxxxxx respond to 'comments' (def. off), 'autoindent' (def. on),
321             # 'anvlr' (def. off), 'granvl' ?
322              
323             # This is the closest thing to a reference implementation of an ANVL
324             # record parser.
325             # It returns "" on success, or "error: ..." or "warning: ..."
326              
327 10     10 0 6286 sub anvl_recarray { my( $record, $r_elems, $linenum, $o )=@_;
328              
329 10 50       31 ! defined($record) and
330             return "error: no input record";
331 10 100       32 ref($r_elems) ne "ARRAY" and
332             return "error: 2nd arg must reference an array";
333              
334             # Note: this input $linenum is pure digits, while $lineno on
335             # output is a combination of digits and type (':' or '#')
336             #
337 9 100       23 defined($linenum) or $linenum = 1;
338 9 50       30 $linenum =~ /\D/ and
339             return "error: 3rd arg ($linenum) must be a positive integer";
340             # XXX can't this be optimized a bit to keep defaults around?
341 9   66     34 $o ||= anvl_opt_defaults();
342 9 50       26 ref($o) ne "HASH" and
343             return "error: 4th arg must reference a hash";
344              
345 9         12 local $_ = $record; # localizing $_ prevents modifying global $_
346              
347 9         34 s/^\s*//; s/\s*$//; # trim both ends
  9         172  
348 9 50       42 /\n$/ or s/$/\n/; # normalize end of record to \n
349             #s/\n?$/\nEOR:/; # whether record ends in \n or not, normalize
350             # # end of record to \nEOR: (note no \n after \nEOR:)
351              
352             # Reject some malformed cases.
353             #
354             #/\n\n/ and
355             # return "error: record should have no internal blank line(s)";
356             # xxx adjust regexp for ANVLR
357             # XXX fix so record can consist of nothing but comments and/or whitespace;
358             # comments _may_ be recognized in regular records, but not in this kind
359             #/^[^\s:][\w ]*:/m or # match against first element
360             # return "error: record ($_) should begin with a label and colon";
361              
362             # Any other unindented line not containing a colon will either
363             # cause an error or will be automatically indented.
364              
365             # xxx what about $anvl_mode ne ANVLR and??
366              
367             # Now we synthesize stuff (line numbers and pseudo-element names for
368             # any comments) in order to create a uniform structure on each line,
369             # so that we can finally call 'split' to bust apart that structure
370             # into a Perl array in which every 3-element group corresponds to
371             # 1. a line number,
372             # 2. a label, and
373             # 3. a value.
374             #
375             # First insert a line number and ":" in front of each line.
376             #
377 9         13 my $num = $linenum;
378 9         55 s/^/ $num++ . ":" /gem; # put a line number on each line
  38         121  
379              
380             # Remove blank lines, now that line numbers have been preserved.
381             #
382 9         48 s/^\d+:[^\S\n]*\n//gm;
383              
384             # Now, if we're not deleting comments, insert a pseudo-element
385             # name '#:' in front of each comment while also changing the ':'
386             # after the line numer to '#'. This means that all lines will
387             # begin with a line number followed by ':' for real elements or
388             # by '#' for comment elements. Eg, '# foo' on line 3 becomes
389             # '3##:# foo', which conforms to the eventual split pattern we
390             # rely on (at end).
391             #
392             # xxx problem with line #K:value, which becomes, eg, 4##:K:value
393 9 100 66     61 $$o{comments} and # if we're keeping comments
394             s/^(\d+):#/$1##:/gm, 1
395             # ^^^
396             # 123
397             # 1=separator, 2=pseudo-name,
398             # 3=original value minus '#' starts after :
399              
400             or # else completely delete comments
401             s/^\d+:#.*\n//gm # up to and including final \n
402             ;
403            
404             # Return if nothing's left after deleting blank lines and comments.
405             #
406 9 50       31 /^\s*$/s and
407             return "warning: record at line $linenum has no content";
408              
409 9         14 my $msg = ""; # default return message
410              
411             # If we're not in strict parse mode, correct for common error
412             # where continued value is not indented. We can pretty safely
413             # assume a continued value if a line is flush left and contains
414             # no colon at all.
415             #
416             # This next substitution match is multi-line to avoid explicit
417             # looping (yyy is this an efficient way to do it?). It indents
418             # by one space any line starting without a space or colon and
419             # that has no instance of a colon until end of line.
420             #
421 9         57 my $indented = s/^(\d+:)([^\s:][^:]*)$/$1 $2/gm;
422 9 100       21 if ($indented) {
423 3 100       12 unless ($$o{autoindent}) {
424 1         6 @$r_elems = undef; # XXXXX isn't this too much?
425 1         7 return "error: $indented unindented value line(s)";
426             }
427 2         7 $msg = "warning: indenting $indented value line(s)";
428             }
429              
430             # Now we join the (normalized) continuation lines (GRANVL style)
431             # so each element-value pair is on one line. The + in [ \t]+ is
432             # very important; we can't use \s+ here because \s matches a \n.
433             #
434 8         41 s/\n\d+:[ \t]+/ /g;
435             #s/\n\d+:\s+/ /g;
436             # XXX should we have a newline-preserving form of parse?
437              
438             # Get rid of initial whitespace from all non-comment GRANVL values.
439 8         66 s/^(\d+:[^\s:][^:]*:)[ \t]+/$1/gm;
440             # xxx problem with line #K:value, which becomes, eg, 4##:K:value
441              
442             # Split into array of element pairs. Toss first "false" split.
443             # xxx buggy limited patterns, how not to match newline
444              
445             # This is the critical splitting step.
446             # splits line beginning ..... xxx
447             #
448 8         27 s/\n$//; # strip final \n
449 8         177 @$r_elems = ('', 'ANVL', # 3rd elem of 1st triple is
450             # provided by first element resulting from the split
451              
452             split /\n*^(\d+[:#])([^\s:][^:]*):/m
453              
454             # xxx problem with line #K:value, which becomes, eg, 4##:K:value
455             );
456              
457 8 50       33 defined($$r_elems[2]) or
458             return "error: split failed ($_) on '$record', " .
459             "record at line $linenum";
460              
461             # If there was a value with no label at the start of the record,
462             # we deem that interesting enough to keep even though it's not
463             # ANVL-compliant; the caller can prevent this by turning off
464             # 'autoindent', the processing for which will either flag this as
465             # an error or will have inserted one space in front of the value.
466             #
467 8 100       23 $$r_elems[2] =~ /^(\d+): (.*)/ and
468             ($$r_elems[0], $$r_elems[2]) = ($1, $2);
469              
470             #(undef, @$r_elems) = split /\n*^([^\s:][\w ]*):\s*/m;
471             # yyy an approach once considered but not used
472             # $num = $.; # linenum
473             # s/^/ $num++ . ":" /e while (/\n/g);
474             # /\G ($N\#.*\n)+ (?=$N[^\#]) /gx # comment block
475             # /\G ($N\S.*\n)+ (?=$N[^\S]) /gx # element on one or more lines
476             # /\G (#.*\n)+(?=[^#])/g
477             # /^#.*?\n[^#]/s # (?=lookahead)
478             #return "_=$_\n" . join(", ", @$r_elems); # to check results
479              
480 8         51 return $msg;
481             }
482              
483             # XXXXXX for consolidating a:b and a:c into a:b;c, MAJOR constraint
484             # is that b and c CANNOT contain '|' or we refuse...
485              
486 3     3 0 2181 sub anvl_arrayhash { my( $r_elems, $r_hash, $first_only )=@_;
487              
488 3 100       13 ref($r_elems) ne "ARRAY" and
489             return "error: 1st arg must reference an array";
490 2 100       7 ref($r_hash) ne "HASH" and
491             return "error: 2nd arg must reference a hash";
492 1 50       3 defined($first_only) or $first_only = 0;
493              
494 1         3 my $num_elems = scalar @$r_elems;
495 1 50       4 $num_elems % 3 != 0 and
496             return "error: input array length must be a multiple of 3";
497              
498 1 50       3 $num_elems < 1 and return ""; # no elements, we're done
499            
500 1         3 my $msg = ''; # xxx needed?
501 1         2 my ($name, $value, $n, $v);
502              
503             # We know there must be at least 3 elements, so it's safe to check
504             # the special first triple (index 2 is the only one we look at now)
505             # for an initial unlabeled record element (non-standard ANVL).
506             # If we find something, we make up the name, '_'.
507             #
508 1 50       4 if ($$r_elems[2]) { # first triple is special
509 1         2 $name = '_';
510 0         0 ! defined $$r_hash{$name} and
511             $$r_hash{$name} = [ 0 ] # initialize array
512             or
513 1 50 50     12 push @{ $$r_hash{$name} }, 0 # add to array
514             }
515              
516 1         5 for ($n = 3; $n < $num_elems; $n += 3) {
517              
518 4         7 $name = $$r_elems[$n + 1];
519 1         4 ! defined $$r_hash{$name} and
520             $$r_hash{$name} = [ $n ] # initialize array
521             or
522 4 100 100     24 push @{ $$r_hash{$name} }, $n # add to array
523             ;
524             }
525 1         3 return $msg;
526             }
527              
528             # ANVL value split
529             # xxx rename to anvl_valarray?
530             # returns empty string on success or string beginning "warning:..."
531             # r_svals is reference to an array that will be filled upon return
532 5     5 0 5052 sub anvl_valsplit { my( $value, $r_svals )=@_;
533              
534 5 50       13 ! defined($value) and
535             return "needs an ANVL value";
536 5 100       16 ref($r_svals) ne "ARRAY" and
537             return "2nd arg must reference an array";
538 4         5 local $_;
539              
540             #xxx print "r_svals=$r_svals\n";
541             #xxx print "value=$value\n";
542 4         7 my $warning = ""; # xxx used?
543             #my $ret_subvalues = \$_[1];
544              
545             # Assume value is all on one line and split it.
546             #my @svals = split /\|/, $value;
547 4         23 @$r_svals = split /\|/, $value;
548             #$_[1] = \@svals;
549             $_ = [ split(/;/, $_) ] # create array of arrays
550 4         41 for (@$r_svals);
551             #xxxprint("svals=", join(", ", @$_), "\n") for (@$r_svals);
552              
553             # xxxx need to look for all 3 levels: (change spec)
554             # XXXXXXX value ::= one or more svals (sval1 | sval2 | ...)
555             # XXXXXXX sval ::= one or more rvals (rval1 ; rval2 ; ...)
556             # XXXXXXX rval ::= one or more qvals (qval1 (=) qval2 (=) ...)
557             # where s=sub, r=repeated, q=equivalent
558             # XXXXXXX or ?? rval ::= one or more avals (aval1 (=) aval2 (=) ...)
559 4 50       19 return $warning ? "warning: $warning" : "";
560             }
561              
562             # Create record hash, elem is key, value is value
563             #
564 2     2 0 2113 sub anvl_rechash { my( $record, $r_hash, $strict )=@_;
565              
566 2 50       8 ! defined($record) and
567             return "needs an ANVL record";
568 2 100       6 ref($r_hash) ne "HASH" and
569             return "2nd arg must reference a hash";
570              
571 1         3 my $msg = "";
572 1         1 my @elems;
573 1 50       5 ($msg = anvl_recsplit($record, \@elems, $strict)) and
574             return "anvl_recsplit: $msg";
575              
576 1         3 my ($name, $value);
577 1         2 while (1) {
578 2         4 $name = shift @elems;
579 2 100       6 last unless defined $name; # nothing left
580 1         2 $value = shift @elems;
581 1 50       4 if (! defined $$r_hash{$name}) {
582             # Nothing there, so store scalar and continue.
583 1         3 $$r_hash{$name} = $value; # 1st value (non-array)
584 1         2 next;
585             }
586             # If we get here there's something's already there.
587             # Don't overwrite if we're in $strict mode.
588             # xxx document this
589             #
590 0 0       0 $strict and next; # don't overwrite
591              
592             # XXXXXxxxx make adding a value policy-driven, eg,
593             # "add" could mean (a) replace, (b) push on end array,
594             # (c) push on start of array, (d) string concatenation,
595             # (d) error.
596             # xxx should anvl_rechash save line numbers?
597             # xxx should anvl_recsplit save line numbers?
598              
599             # Whatever is there could be a scalar or an array reference.
600             # If not a reference, create an anonymous array, put a
601             # scalar into it, and refer to the array.
602             #
603 0         0 my $v = $$r_hash{$name}; # add to current
604 0 0       0 $v = [ $v ] # make an array if currently
605             unless ref $v; # there's only one value
606              
607             # If we get here, we have a reference to an array,
608             # possibly empty. Either way, we can push onto it.
609             #
610 0         0 push @$v, $value;
611             }
612 1         3 return $msg;
613             }
614              
615             # [ !"#\$%&'\(\)\*\+,/:;<=>\?@\[\\\]\|\0]
616             our %anvl_decoding = (
617              
618             'sp' => ' ', # decodes to space (0x20)
619             'ex' => '!', # decodes to ! (0x21)
620             'dq' => '"', # decodes to " (0x22)
621             'ns' => '#', # decodes to # (0x23)
622             'do' => '$', # decodes to $ (0x24)
623             'pe' => '%', # decodes to % (0x25)
624             'am' => '&', # decodes to & (0x26)
625             'sq' => "'", # decodes to ' (0x27)
626             'op' => '(', # decodes to ( (0x28)
627             'cp' => ')', # decodes to ) (0x29)
628             'as' => '*', # decodes to * (0x2a)
629             'pl' => '+', # decodes to + (0x2b)
630             'co' => ',', # decodes to , (0x2c)
631             'sl' => '/', # decodes to / (0x2f)
632             'cn' => ':', # decodes to : (0x3a)
633             'sc' => ';', # decodes to ; (0x3b)
634             'lt' => '<', # decodes to < (0x3c)
635             'eq' => '=', # decodes to = (0x3d)
636             'gt' => '>', # decodes to > (0x3e)
637             'qu' => '?', # decodes to ? (0x3f)
638             'at' => '@', # decodes to @ (0x40)
639             'ox' => '[', # decodes to [ (0x5b)
640             'ls' => '\\', # decodes to \ (0x5c)
641             'cx' => ']', # decodes to ] (0x5d)
642             'vb' => '|', # decodes to | (0x7c)
643             'nu' => "\0", # decodes to null (0x00)
644             );
645             # XXXXXXX need way to encode newlines (using '\n' in interim)
646              
647             our %anvl_encoding;
648              
649             #%cn :
650             #%sc ;
651              
652             # xxxxx handle these separately
653             # # XXXX remove %% from erc/anvlspec?
654             # '%' => '%pe', # decodes to % (0x25) xxxx do this first?
655             # '_' => '', # a non-character used as a syntax shim
656             # '{' => '', # a non-character that begins an expansion block
657             # '}' => '', # a non-character that ends an expansion block
658              
659             # Takes a single arg.
660             sub anvl_decode {
661              
662 5   50 5 0 18 local $_ = shift(@_) || '';
663              
664 5         18 pos() = 0; # reset \G for $_ just to be safe
665 5         29 while (/(?=\%\{)/g) { # lookahead; \G matches just before
666 7         11 my $p = pos(); # note \G position before it changes
667 7 100       41 s/\G \%\{ (.*?) \%\}//xs # 's' modifier makes . match \n
668             or last; # if no closing brace, skip match
669 6         14 my $exp_block = $1; # save removed expansion block
670 6         33 $exp_block =~ s/\s+//g; # strip it of all whitespace
671 6         11 pos() = $p; # revert \G to where we started and
672 6         34 s/\G/$exp_block/; # re-insert changed expansion block
673             }
674 5         14 s/\%[}{]//g; # remove any remaining unmatched
675 5         8 s/\%_//g; # xxx %_ -> ''
676 5         11 s/\%\%/\%pe/g; # xxx ??? xxxx???
677             # decode %XY where XY together don't form a valid pair of hex digits
678 5         17 s/\%([g-z][a-z]|[a-z][g-z])/$anvl_decoding{$1}/g;
679 5         25 return $_;
680             }
681              
682             # xxx encoding should be context-sensitive, eg, name, value
683 1     1 0 5515 sub anvl_encode { my( $s )=@_;
684            
685             # XXXX just define this in the module??
686 1 50       7 unless (%anvl_encoding) { # one-time definition
687             # This just defines an inverse mapping so we can encode.
688             $anvl_encoding{$anvl_decoding{$_}} = $_
689 1         149 for (keys %anvl_decoding);
690             }
691             $s =~
692 1         14 s/([ !\"#\$\%&'\(\)\*\+,\/:;<=>\?@\[\\\]\|\0])/\%$anvl_encoding{$1}/g;
693 1         10 return $s;
694             }
695              
696             # return $name in natural word order, using ANVL inversion points
697             # repeat for each final comma present
698 6     6 0 21 sub anvl_name_naturalize { my( $name )=@_;
699              
700 6   50     14 $name ||= '';
701 6 50       38 $name =~ /^\s*$/ and return $name; # empty
702              
703             # "McCartney, Paul, Sir,,"
704             # a, b, c, d, e,,, -> e d c a, b
705 6         16 my $prefix = '';
706 6         42 while ($name =~ s/,\s*$//) {
707 9 100       106 $name =~ s/^(.*),\s*([^,]+)(,*$)/$1$3/ and
708             $prefix .= $2 . ' ';
709             }
710 6         43 return $prefix . $name;
711             }
712              
713 0     0 0 0 sub anvl_summarize { my( @nodes )=@_; }
714              
715             # XXXXX doesn't this really belong in an ERC.pm module?
716             #
717             # ordered list of kernel element names
718             our @kernel_labels = qw(
719             who
720             what
721             when
722             where
723             how
724             why
725             huh
726             );
727             #
728             # This routine inspects and possibly modifies in place the kind of element
729             # array resulting from a call to anvl_recarray(), which splits and ANVL
730             # record. It is useful for transforming short form ERC elements into full
731             # form elements, for example, to expand "erc:a|b|c|d" into the equivalent,
732             # "erc:\nwho:a\nwhat:b\nwhen:c\nwhere:d".
733             # It returns the empty string on success, otherwise an error message.
734             #
735 4     4 0 1636 sub erc_anvl_expand_array { my( $r_elems )=@_;
736              
737 4     4   60 use File::ANVL;
  4         7  
  4         5380  
738 4         4 my ($lineno, $name, $value, $msg, @svals, $sval);
739 4         5 my $me = 'erc_anvl_expand_array';
740 4         5 my $i = 3; # skip first 3 elems (anvl array preamble)
741 4         7 while (1) {
742 21         23 $lineno = $$r_elems[$i++];
743 21   100     45 $name = $$r_elems[$i++] || '';
744 21   100     41 $value = $$r_elems[$i++] || '';
745 21 100       32 last unless defined $lineno; # end of record
746             next # skip unless we have erc-type thing
747 17 100 100     53 if ($name ne 'erc' || $value =~ /^\s*$/);
748             #if ($name !~ /^erc\b/ || $value =~ /^\s*$/);
749             # xxx should do this for full generality
750              
751             # If here, we have an erc-type thing with a non-empty value.
752             #
753 2 50       8 ($msg = anvl_valsplit($value, \@svals)) and
754             return "error: $me: anvl_valsplit: $msg";
755            
756             # XXXX only doing straight "erc" (eg, not erc-about)
757 2         3 my $j = 0;
758 2         4 my @extras = ();
759             # If we exceed known labels, we'll re-use last known label.
760 2         4 my $unknown = $kernel_labels[$#kernel_labels];
761 2         3 foreach $sval (@svals) {
762              
763             # xxx not (yet) tranferring subvalue structure
764             # to anvl_om or other conversion
765             # Recall that each $sval is itself a reference to
766             # an array of subvalues (often just one element).
767             #
768 8   33     65 push @extras, # trust kernel_labels order
769             $lineno,
770             $kernel_labels[$j++] || $unknown,
771             join('; ', # trim ends of subvalues
772             map(m/^\s*(.*?)\s*$/, @$sval)
773             );
774             }
775             # Finally, replace our $value element with '' and append
776             # the new extra values we've just expanded.
777 2         15 splice @$r_elems, $i-1, 1,
778             '', # replaces $value we just used up
779             @extras; # adds new elements from $value
780             }
781 4         12 return ''; # success
782             }
783              
784             # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX=============
785             # xxx checkm _in_ obj1 obj2 ... --> returns noids
786             # xxx checkm _out_ id1 id2 ... --> returns objects
787              
788             # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX=============
789             # xxx do metadata scan of object before ingest and confirm with user that
790             # the object is correctly identified. This could even be done remotely.
791             # Start with informal staff service for depositing objects, returning a
792             # short url to a stable object, and not clogging up allstaff inboxes with
793             # huge attachments. Also applies to any number of draft docs for review
794             # but in temporary storage (but stable).
795              
796             # XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX=============
797             # xxx do id generator service with 'expiring' ids. To mint, you tell us
798             # who you are first. To get a perm. id, you agree to use your minted id
799             # and bind it within N months. We track, and warn you several times
800             # until N months as elapsed and then reclaim/recycle the id.
801              
802             ############################################
803             # Output Multiplexer routines
804             ############################################
805              
806             # #$erc = "erc: Smith, J.|The Whole Truth|2004|http://example.com/foo/bar";
807             # $errmsg = File::ERC::erc_anvl2erc_turtle ($erc, $rec);
808             # $errmsg and
809             # print("$errmsg\n")
810             # or
811             # print("turtle record:\n$rec\n")
812             # ;
813              
814             # xxx anvl_fmt not consistent with om_anvl!
815              
816             # Input file(s) from ARGV.
817              
818 0     0 0   sub anvl_om { my( $om, $o, $get_anvl ) = (shift, shift, shift);
819              
820 0 0         return "anvl_om: 1st arg not an OM object"
821             if ref($om) !~ /^File::OM::/;
822 0           my $p = $om->{outhandle}; # whether 'print' status or small
823 0   0       $o ||= anvl_opt_defaults();
824 0   0       $get_anvl ||= File::ANVL::make_get_anvl(); # xxx set input here?
825             # XXX test return value!
826              
827 0           my $s = ''; # output strings are returned to $s
828 0 0         my $st = $p ? 1 : ''; # returns (stati or strings) accumulate
829 0           my ($msg, $allmsgs, $anvlrec, $lineno, $name, $value, $pat, $n, $nmax);
830 0           my (%rechash, $ne, $nemax, $elem_name); # for alt. element ordering
831 0           my $r_elem_order = $$o{elem_order};
832              
833 0           $s = $om->ostream(); # open stream
834              
835             # This next line is a fast and compact (if cryptic) way to
836             # accumulate $om->method calls. Used after each method call, it
837             # concatenates strings or ANDs up print statuses, depending on the
838             # outhandle setting. It makes several appearances in this routine.
839             #
840 0 0 0       $p and ($st &&= $s), 1 or ($st .= $s); # accumulate method returns
      0        
841              
842             # Numbers: record, element in record, and start line
843             #
844 0           my ($startline, $recnum, $elemnum) = (1, 0, 0);
845 0           my ($wslines, $rrlines);
846 0           my $r_elems = $om->{elemsref}; # abbreviation
847             # xxx is that reference kosher?
848              
849 0           while (1) {
850              
851             # Get an ANVL record and count lines therein. ANVL
852             # records can come from anywhere, but typically from
853             # a file (read in "paragraph" mode) or a BDB database.
854             #
855 0           $anvlrec = trimlines(&$get_anvl(), \$wslines, \$rrlines);
856 0           $startline += $wslines;
857 0 0         last unless $anvlrec;
858              
859 0           $recnum++; # increment record counter
860             =for later
861             # XXX anvl_recarray is expensive, do we _need_ to do it if the output is
862             # also in anvl? Maybe call modified [2] here so "find" can work?
863             if (ref($om) eq 'File::OM::ANVL' and ! $r_elem_order) {
864             # xxx do quick expand (short->long erc) here?
865             # xxx _will_ disturb input line numbering
866             $$o{find} and ($anvlrec !~ /$$o{find}/m) and
867             next; # no output has occurred
868             # xxx do quick check for 'show' and next
869             # XXXXXXXX must define lineno for verbose case
870             $s = $om->anvl_rec($anvlrec, $startline, $rrlines);
871             $p and ($st &&= $s), 1 or ($st .= $s);
872             next;
873             }
874             =cut
875 0           $msg = anvl_recarray($anvlrec, $r_elems, $startline, $o);
876 0 0         $msg =~ /^error/ and return "anvl_recarray: $msg";
877 0 0         $msg eq "" or
878             #print $msg, "\n";
879             #$o->{verbose} && print $msg, "\n";
880             $allmsgs .= $msg . "\n"; # save other message
881              
882             # NB: apply 'find' here before possible expansion, which
883             # means that a pattern like "who:\s*smith" won't work on
884             # on a short form ANVL record.
885             #
886 0 0 0       $$o{find} and ($anvlrec !~ /$$o{find}/m) and
887             next; # no output has occurred
888              
889             # If caller has set $$o{elemsproc} to a code reference,
890             # it is called to process the element array just returned
891             # from anvl_recarray. Typically this is used to convert
892             # (with erc_anvl_expand_array) short form ERCs to long
893             # form ERCs. As an optimization, the code is not called
894             # if $$o{elemsprocpat} (typically, "erc") is set and
895             # doesn't match the raw record string.
896             #
897 0 0 0       if (ref($$o{elemsproc}) eq "CODE" and # if code and either
      0        
898             (! ($pat = $$o{elemsprocpat})) # no pattern or
899             || $anvlrec =~ $pat) { # the pattern matches
900              
901             # [2] XXX can we call elemsproc directly on the $anvlrec? so we don't need
902             # to call expensive anvl_recarray first?
903 0 0         ($msg = &{$$o{elemsproc}}($r_elems)) and
  0            
904             return "File::ANVL::elemsproc: $msg";
905             }
906 0 0         ref($om) eq 'File::OM::Turtle' and
907             turtle_set_subject($om, $anvlrec);
908              
909             # The orec method is given first crack at a new record.
910             # It sets and/or clears a number of values for keys (eg,
911             # for turtle, $$o{subject}). $recnum is useful for
912             # outputting json separators (eg, no comma if $recnum eq 1)
913             # or record numbers in comments (eg, if $$o{verbose}).
914             # $startline is useful for parser diagnostics (eg, "error
915             # on line 5"). $r_elems and $r_elem_order are needed for
916             # discovering what elements will populate CSV/PSV records.
917             #
918             # XXX this next isn't needed if output is anvl ?! (assuming final NL is
919             # written when closing the record
920 0           $s = $om->orec($recnum, $startline, $r_elems, $r_elem_order);
921 0 0 0       $p and ($st &&= $s), 1 or ($st .= $s);
      0        
922              
923 0 0         if ($r_elem_order) {
924 0           undef %rechash; # don't want prior indices
925 0 0         ($msg = anvl_arrayhash($r_elems, \%rechash)) and
926             return "anvl_arrayhash: $msg";
927 0           $ne = -1; # index into $$r_elem_order
928 0           $nemax = scalar @$r_elem_order;
929             } else {
930 0 0         $n = # index into $$r_elems
931             # XXX don't reference r_elems if we haven't called anvl_recarray
932             $$r_elems[2] # if a no-label value starts
933             ? -3 # rec, make sure to output it,
934             : 0; # else skip it (normal)
935 0           $nmax = scalar @$r_elems;
936             }
937              
938             # XXX if output is to anvl, can we not skip the entire loop below? but
939             # not if it's possible to output anvl _and_ to care about element order
940             # XXX but still perform $show check and skip not-shown elems
941             # XXX and still perform value inversion if {invert} options
942 0           $elemnum = 0; # true elements, not comments
943 0           undef $name;
944 0           while (1) {
945              
946             # Select next candidate element. If we need to
947             # output elements in a certain order, consult the
948             # hash; otherwise, just use "found" order.
949             #
950 0 0         if ($r_elem_order) { # use specified order
951              
952 0           $ne++;
953 0 0         $ne >= $nemax and last;
954              
955             # For CSV and PSV, the element name at this
956             # position may be deliberately undefined, or
957             # may correspond to a named element missing
958             # in this record, in which case we skip it.
959             #
960 0           $elem_name = $r_elem_order->[$ne];
961 0 0 0       ! defined($elem_name) || ! defined(
962             #XXX ignore multiple instances for now
963             $n = $rechash{$elem_name}->[0]
964             ) and
965             # for CSV/PSV, output an empty element
966             next;
967              
968             } else { # use natural array order
969 0           $n += 3;
970 0 0         $n >= $nmax and last;
971             }
972             # If we get here, $n is defined.
973              
974 0           $lineno = $$r_elems[$n];
975 0 0         $name = $n < 3 # for special first triple
976             ? '_' # use synthesized name '_'
977             : $$r_elems[$n + 1]; # else real name
978 0   0       $value = $$r_elems[$n + 2] || "";
979              
980 0 0         $elemnum++ unless $name eq '#';
981              
982             # Skip if 'show' given and not requested.
983 0 0 0       $$o{show} and ("$name: $value" !~ /$$o{show}/m) and
984             (undef $name), # cause elem to be skipped
985             next;
986              
987             # Instead of $om->oelem, $om->celem, $om->contelem,
988             # combine open and close into one, but first
989             # naturalize values if called upon.
990             #
991 0 0 0       $$o{invert} and $value =~ /,\s*$/ and
992             $value = anvl_name_naturalize($value);
993             }
994             continue {
995 0           $s = $om->elem($name, $value, $lineno);
996 0 0 0       $p and ($st &&= $s), 1 or ($st .= $s);
      0        
997 0           undef $name; # clean the slate
998             }
999 0           $s = $om->crec($recnum);
1000 0 0 0       $p and ($st &&= $s), 1 or ($st .= $s);
      0        
1001             }
1002             continue {
1003 0           $startline += $rrlines;
1004             }
1005             # XXX currently doing nothing with $allmsgs warnings!
1006             # should probably print if verbose mode on
1007 0           $s = $om->cstream();
1008 0 0 0       $p and ($st &&= $s), 1 or ($st .= $s);
      0        
1009              
1010 0           return $st;
1011             }
1012              
1013             # xxx document all om options
1014             # xxx should om also have a recstring slot (for anvlrec)?
1015             # xxx pass in turtle_nosubject (default)?
1016             sub turtle_set_subject {
1017              
1018 0     0 0   my ($om, $anvlrec) = (shift, shift);
1019 0           my $r_elems = $om->{elemsref};
1020              
1021             # In order to find the subject element for Turtle/RDF
1022             # assertions, we need an element name pattern. If one is
1023             # defined in $om->{turtle_subjelpat}, use it. If it's undefined,
1024             # per-record code will use 'where' if it thinks the record
1025             # is an ERC, or use 'identifier|subject' as a last resort.
1026             # If no element matching subjelpat is found, $om->{subject}
1027             # will default to $om->{turtle_nosubject}.
1028             #
1029 0   0       my $subjpat = $om->{turtle_subjelpat} ||
1030             ($anvlrec =~ /^erc\s*:/m
1031             ? "^where\$" : # 1st where in an 'erc', or
1032             ($anvlrec =~ /^(identifier|subject)\s*:/m
1033             ? "^$1\$" : # 1st identifier or subject,
1034             ($anvlrec =~ /^(.+)\s*:\s*(\n\s+)*\w/
1035             ? "^$1\$" : # or 1st non-empty element
1036             ''))); # or nothing (always matches)
1037              
1038             # Now find a 'subject' for our Turtle/RDF assertions.
1039             #
1040 0           my $j = 1; # element names in positions 1, 4, 7, ...
1041 0   0       1 while ($j < $#$r_elems and # quickly find it
      0        
1042             @$r_elems[$j] !~ $subjpat and ($j += 3));
1043 0 0 0       $om->{subject} = $j < $#$r_elems && $subjpat ? # if found,
1044             @$r_elems[$j + 1] : # use associated value
1045             $om->{turtle_nosubject}; # else use default
1046 0           return $om->{subject};
1047             }
1048              
1049             1;
1050              
1051             __END__