File Coverage

blib/lib/File/ANVL.pm

Criterion	Covered	Total	%
statement	181	274	66.0
branch	66	160	41.2
condition	25	120	20.8
subroutine	22	27	81.4
pod	0	17	0.0
total	294	598	49.1

line	stmt	bran	cond	sub	pod	time	code
1							package File::ANVL;
2
3							# XXXXXxxxx make adding a value policy-driven, eg,
4							# "add" could mean (a) replace, (b) push on end array,
5							# (c) push on start of array, (d) string concatenation,
6							# (d) error.
7
8	4			4		139546	use 5.006;
	4					13
	4					187
9	4			4		24	use strict;
	4					6
	4					144
10	4			4		20	use warnings;
	4					14
	4					152
11
12	4			4		18	use constant NL => "\n";
	4					7
	4					345
13
14							# ANVL flavors
15							#
16	4			4		18	use constant ANVL => 1;
	4					8
	4					159
17	4			4		20	use constant ANVLR => 2;
	4					6
	4					160
18	4			4		18	use constant ANVLS => 3;
	4					7
	4					14216
19
20							our $VERSION;
21							$VERSION = sprintf "%d.%02d", q$Name: Release-1-05 $ =~ /Release-(\d+)-(\d+)/;
22
23							require Exporter;
24							our @ISA = qw(Exporter);
25
26							our @EXPORT = qw();
27
28							our @EXPORT_OK = qw(
29							anvl_recarray anvl_arrayhash
30							anvl_name_naturalize
31							anvl_rechash anvl_valsplit
32							erc_anvl_expand_array kernel_labels
33							xgetlines trimlines
34							make_get_anvl
35							anvl_opt_defaults anvl_decode anvl_om
36
37							anvl_encode anvl_recsplit
38
39							ANVL ANVLR ANVLS ANVLSH
40							);
41
42							our %EXPORT_TAGS = (all => [ @EXPORT_OK ]);
43
44							# All these symbols must be listed also in EXPORT_OK (?)
45							#
46							our @EXPORT_FAIL = qw(
47							ANVL ANVLR ANVLS ANVLSH
48							);
49
50							our $anvl_mode = 'ANVL'; # default mode
51
52							# This is a magic routine that the Exporter calls for any unknown symbols.
53							#
54	3			3	0	490	sub export_fail { my( $class, @symbols )=@_;
55
56	3					14	$anvl_mode = $_ for (@symbols);
57	3					2767	return ();
58							}
59
60							# Initialize or re-initialize options to factory defaults.
61							#
62							sub anvl_opt_defaults { return {
63
64							# Input options
65							#
66	7			7	0	61	autoindent => 1, # yes, fix recoverably bad indention
67							comments => 0, # no, don't parse comments
68							elemsproc => # to expand short form ERCs (if any)
69							\&File::ANVL::erc_anvl_expand_array,
70							elemsprocpat => # no call from anvl_om if no match
71							qr/^erc:/m, # in rec; no call if set and matches
72							};
73							}
74
75							# xxx decide on good name for short form and long form ERC
76
77							# Returns a closure that calls an input reader with that's set to *ARGV
78							# by default. If $reader and $readee are defined, they are stored in the
79							# closure and all reads will be performed by calling &$reader($readee).
80							#
81							# The default reader collects text lines from a file and returns all the
82							# lines associated with the next "record", which is considered to start
83							# wherever the read pointer happens to be and continues to the first two
84							# blank lines encountered that occur after "substance" is detected.
85							# Substance is defined to be at least one non-whitespace character
86							# occurring on a non-comment line. Comment and blank lines that precede
87							# a record with substance are returned, but any such lines that follow
88							# that the final record are discarded.
89							#
90	1			1	0	389	sub make_get_anvl { my( $reader, $readee ) = shift;
91
92	1	50				6	unless ($reader) {
93
94	1					3	my $rec; # returned record
95							my $s; # next increment of input
96	0					0	my $substance; # boolean detecting substance
97
98	4			4		1393	return sub { my( $filehandle ) = shift;
99
100							# Returns a subroutine, call it get_anvl()
101							#
102							# Usage: $record = get_anvl( [$filehandle] );
103							#
104							# It reads ANVL input records as text lines from the
105							# file given by $filehandle (*ARGV by default, which can
106							# process multiple files via while loop magic). Usually,
107							# the closure holds enough state information, set up by
108							# make_get_anvl(), that get_anvl() can be called without
109							# arguments. get_anvl() returns the record read as a
110							# string, or returns undef on end of input or error.
111							#
112	4		33			18	$filehandle \|\|= *ARGV;
113	4					22	local $/ = NL.NL; # a kind of "paragraph" input mode
114							# $/ === $INPUT_RECORD_SEPARATOR
115	4					6	$rec = '';
116	4		66			118	1 while (
			100
117							defined($s = <$filehandle>) and # read to eof and
118							($rec .= $s), # save everything, but stop
119							$substance = # when we detect substance, ie,
120							$s =~ /^[^#\s]/m \|\| $s =~ /^[^#].*\S/m,
121							! $substance # non-comment with non-space
122							);
123	4	100				24	return $substance ?
124							$rec : # return either collected record or undef
125							undef; # any final blank or comment lines are tossed
126
127							# yyy If more than one file, line numbers normally accumulate
128							# across files. Should we preserve line numbers within each
129							# files? (If so, use "close ARGV" (Perl idiom) to cause $.
130							# (linenum) to be reset between files.
131	1					9	};
132							}
133
134							# If we get here, $reader should reference an input method and
135							# $readee is assumed to be any value (eg, BDB handle) that may
136							# permit &reader to get the next record. Any other arguments
137							# passed to the get_anvl() function below will be passed along
138							# too, ie, $reader($readee, @_).
139							#
140	0	0				0	ref($reader) eq "CODE" or return undef;
141
142	0					0	my $rec; # returned record
143							my $s; # next increment of input
144	0					0	my $substance; # boolean detecting substance
145
146							return sub {
147	0			0		0	$rec = '';
148	0		0			0	1 while (
			0
149							# XXX should this accumulate in general??? or
150							# should we leave it to the definer of $reader?
151							defined($s = &reader($readee, @_)) and # read and
152							($rec .= $s), # save everything, but stop
153							$substance = # when we detect substance, ie,
154							$s =~ /^[^#\s]/m \|\| $s =~ /^[^#].*\S/m,
155							! $substance # non-comment with non-space
156							);
157	0	0				0	return $substance ?
158							$rec : # return either collected record or undef
159							undef; # any final blank or comment lines are tossed
160	0					0	};
161							}
162
163							# XXX deprecated! see sub make_get_anvl
164	0			0	0	0	sub xgetlines { my( $filehandle )=@_;
165
166	0					0	my $rec = ''; # returned record
167	0					0	my $s; # next increment of input
168	0					0	local $/ = NL.NL; # a kind of "paragraph" input mode
169							# $/ === $INPUT_RECORD_SEPARATOR
170
171							# If $filehandle is specified, use the Perl <$filehandle> idiom to
172							# return next unit of input (normally a line, but here a para).
173							#
174	0		0			0	$filehandle \|\|= *ARGV;
175	0		0			0	1 while ( defined( $s = <$filehandle> ) and # read up to two \n's
			0
176
177							# If we get here, $s now contains a block to save.
178							#
179							$rec .= $s,
180
181							# We continue reading only if there's no substance,
182							# ie, no line read starts with a non-comment and no
183							# non-comment line read contains non-whitespace
184							#
185							#$s !~ /^[^#\s]/m and # if no line read starts with
186							# $s !~ /^[^#].*\S/m # or contains substance
187							#(! ($s =~ /^[^#\s]/m \|\| $s =~ /^[^#].*\S/m) and
188							# $rec .= "substance found in <$s>\n"),
189							! ($s =~ /^[^#\s]/m \|\| $s =~ /^[^#].*\S/m)
190							);
191
192							#$s !~ /^\s*[^#\s]/m \|\| # match no line of susbstance
193							#$rec .= $s
194
195							#($rec .= $s), # only "paragraphs"; save everything
196							#$s !~ /^\s*[^#\s]/m # but stop when substance seen
197
198							#$s !~ /^\s*[^#\s]/m # but stop when substance seen
199							# and while $s matches no line starting with ^#
200							# while every line in $s is either all whitespace
201							# or all comment (ie, first non-ws char is #)
202							#$s =~ /\S/ # but stop when we see substance
203							#$s !~ /\S/ # but stop when we see substance
204							# substance means \S on a non-comment line
205							# $s !~ /^\S\|[^#].*\S/m
206							# ! ($s =~ /^[^#\s]/m \|\| $s =~ /^[^#].*\S/m)
207							#
208							#); # /^\s*[^#\s]/m
209	0	0	0			0	defined($s) or
210							return $rec \|\| undef; # almost eof or real eof
211	0					0	return $rec;
212
213							# XXXX what happens when one file ends prematurely and
214							# another begins? does last record for first file get
215							# returned glued to beginning of first recond of 2nd file?
216							# If more than one file, line numbers normally just accumulate.
217							# We want to preserve line numbers within files, so we use this
218							# next Perl idiom to cause $. (linenum) to be reset between files.
219							#
220							#close ARGV if eof; # reset line numbers between files
221							}
222
223							# args: record, reference to whitespace lines, reference to real record lines
224							# xxx replace \n with NL throughout
225							# returns undef when $rec trims to nothing (EOF)
226	5			5	0	1076	sub trimlines { my( $rec, $r_wslines, $r_rrlines )=@_;
227
228							# $rec might legitimately be undefined if called as
229							# trimlines(getlines(), ...)
230							#
231	5		100			15	$rec \|\|= '';
232
233	5					19	$rec =~ s/^(\s)//; # '' guarantees $1 will be defined
234	5					14	my $blanksection = $1;
235	5					7	my @newlines;
236
237	5	50				32	ref($r_wslines) eq 'SCALAR' and # if given, define it
238							$$r_wslines = scalar(@newlines = $blanksection =~ /\n/g);
239
240	5	50				42	ref($r_rrlines) eq 'SCALAR' and # if given, define it
241							$$r_rrlines = scalar(@newlines = $rec =~ /\n/g);
242
243							#$$r_rrlines = scalar($rec =~ /$/gm); # xxx why doesn't this work?
244
245							# At this point $r_wslines and $r_rrlines (if supplied) are safely
246							# defined and ready for return.
247							#
248	5	100				20	$rec or # empty record (but $r_wslines may be defined)
249							return undef; # signal eof-style return
250
251							#$rec =~ /\n\n$/ and # ok record ending -- this is
252							# return $rec; # the usual return
253							#$rec =~ s/\n*$/\n\n/; # normalize premature eof ending
254	3					9	return $rec;
255							}
256
257							# returns empty string on success or string beginning "warning:..."
258							# third arg (0 or 1) optional
259							# elems is returned array of name value pairs
260							#DEPRECATED:
261	1			1	0	1	sub anvl_recsplit { my( $record, $r_elems, $strict )=@_;
262
263	1	50				3	! defined($record) and
264							return "needs an ANVL record";
265	1	50				5	ref($r_elems) ne "ARRAY" and
266							return "2nd arg must reference an array";
267
268	1					1	my $strict_default = 0;
269	1	50				3	! defined($strict) and
270							$strict = $strict_default;
271
272	1					2	local $_ = $record;
273	1					5	s/^\s//; s/\s$//; # trim both ends
	1					5
274	1	50				5	/\n$/ or s/$/\n/; # normalize end of record to \n
275
276	1	50				5	/\n\n/ and
277							return "record should have no internal blank line(s)";
278							# xxx adjust regexp for ANVLR
279	1	50				5	! /^[^\s:][\w ]*:/ and # match against first element
280							return "well-formed record begins with a label and colon";
281
282	1	50				5	$anvl_mode ne ANVLR and
283							s/^#.*\n//gm; # remove comments plus final \n
284
285							# If we're not in strict parse mode, correct for common error
286							# where continued value is not indented. We can pretty safely
287							# assume a continued value if a line is flush left and contains
288							# no colon at all.
289							#
290							# This next substitution match needs to be multi-line to avoid
291							# more explicit looping.
292							#
293							# XXX there's probably a more efficient way to do this.
294	1					2	my $msg = "";
295	1					5	my $indented = s/^([^\s:][^:]*)$/ $1/gm;
296	1	50				3	if ($indented) {
297	0	0				0	$strict and
298							(@$r_elems = undef),
299							return "error: $indented unindented value line(s)";
300	0					0	$msg = "warning: indenting $indented value line(s)";
301							}
302							# if we get here, assume standard continuation lines, and join them
303							# (GRANVL style)
304							#
305	1					3	s/\n\s+/ /g;
306							# XXX should have a newline-preserving form of parse?
307
308							# Split into array element pairs. Toss first "false" split.
309							# xxx buggy limited patterns, how not to match newline
310
311							# This is the critical splitting step.
312							# splits line beginning ..... xxx
313							#
314	1					4	s/\n$//; # strip final \n
315	1					7	(undef, @$r_elems) = split /\n^([^\s:][\w ]):\s*/m;
316
317	1					7	return $msg;
318							}
319
320							# xxxxxxxx respond to 'comments' (def. off), 'autoindent' (def. on),
321							# 'anvlr' (def. off), 'granvl' ?
322
323							# This is the closest thing to a reference implementation of an ANVL
324							# record parser.
325							# It returns "" on success, or "error: ..." or "warning: ..."
326
327	10			10	0	6286	sub anvl_recarray { my( $record, $r_elems, $linenum, $o )=@_;
328
329	10	50				31	! defined($record) and
330							return "error: no input record";
331	10	100				32	ref($r_elems) ne "ARRAY" and
332							return "error: 2nd arg must reference an array";
333
334							# Note: this input $linenum is pure digits, while $lineno on
335							# output is a combination of digits and type (':' or '#')
336							#
337	9	100				23	defined($linenum) or $linenum = 1;
338	9	50				30	$linenum =~ /\D/ and
339							return "error: 3rd arg ($linenum) must be a positive integer";
340							# XXX can't this be optimized a bit to keep defaults around?
341	9		66			34	$o \|\|= anvl_opt_defaults();
342	9	50				26	ref($o) ne "HASH" and
343							return "error: 4th arg must reference a hash";
344
345	9					12	local $_ = $record; # localizing $_ prevents modifying global $_
346
347	9					34	s/^\s//; s/\s$//; # trim both ends
	9					172
348	9	50				42	/\n$/ or s/$/\n/; # normalize end of record to \n
349							#s/\n?$/\nEOR:/; # whether record ends in \n or not, normalize
350							# # end of record to \nEOR: (note no \n after \nEOR:)
351
352							# Reject some malformed cases.
353							#
354							#/\n\n/ and
355							# return "error: record should have no internal blank line(s)";
356							# xxx adjust regexp for ANVLR
357							# XXX fix so record can consist of nothing but comments and/or whitespace;
358							# comments _may_ be recognized in regular records, but not in this kind
359							#/^[^\s:][\w ]*:/m or # match against first element
360							# return "error: record ($_) should begin with a label and colon";
361
362							# Any other unindented line not containing a colon will either
363							# cause an error or will be automatically indented.
364
365							# xxx what about $anvl_mode ne ANVLR and??
366
367							# Now we synthesize stuff (line numbers and pseudo-element names for
368							# any comments) in order to create a uniform structure on each line,
369							# so that we can finally call 'split' to bust apart that structure
370							# into a Perl array in which every 3-element group corresponds to
371							# 1. a line number,
372							# 2. a label, and
373							# 3. a value.
374							#
375							# First insert a line number and ":" in front of each line.
376							#
377	9					13	my $num = $linenum;
378	9					55	s/^/ $num++ . ":" /gem; # put a line number on each line
	38					121
379
380							# Remove blank lines, now that line numbers have been preserved.
381							#
382	9					48	s/^\d+:[^\S\n]*\n//gm;
383
384							# Now, if we're not deleting comments, insert a pseudo-element
385							# name '#:' in front of each comment while also changing the ':'
386							# after the line numer to '#'. This means that all lines will
387							# begin with a line number followed by ':' for real elements or
388							# by '#' for comment elements. Eg, '# foo' on line 3 becomes
389							# '3##:# foo', which conforms to the eventual split pattern we
390							# rely on (at end).
391							#
392							# xxx problem with line #K:value, which becomes, eg, 4##:K:value
393	9	100	66			61	$$o{comments} and # if we're keeping comments
394							s/^(\d+):#/$1##:/gm, 1
395							# ^^^
396							# 123
397							# 1=separator, 2=pseudo-name,
398							# 3=original value minus '#' starts after :
399
400							or # else completely delete comments
401							s/^\d+:#.*\n//gm # up to and including final \n
402							;
403
404							# Return if nothing's left after deleting blank lines and comments.
405							#
406	9	50				31	/^\s*$/s and
407							return "warning: record at line $linenum has no content";
408
409	9					14	my $msg = ""; # default return message
410
411							# If we're not in strict parse mode, correct for common error
412							# where continued value is not indented. We can pretty safely
413							# assume a continued value if a line is flush left and contains
414							# no colon at all.
415							#
416							# This next substitution match is multi-line to avoid explicit
417							# looping (yyy is this an efficient way to do it?). It indents
418							# by one space any line starting without a space or colon and
419							# that has no instance of a colon until end of line.
420							#
421	9					57	my $indented = s/^(\d+:)([^\s:][^:]*)$/$1 $2/gm;
422	9	100				21	if ($indented) {
423	3	100				12	unless ($$o{autoindent}) {
424	1					6	@$r_elems = undef; # XXXXX isn't this too much?
425	1					7	return "error: $indented unindented value line(s)";
426							}
427	2					7	$msg = "warning: indenting $indented value line(s)";
428							}
429
430							# Now we join the (normalized) continuation lines (GRANVL style)
431							# so each element-value pair is on one line. The + in [ \t]+ is
432							# very important; we can't use \s+ here because \s matches a \n.
433							#
434	8					41	s/\n\d+:[ \t]+/ /g;
435							#s/\n\d+:\s+/ /g;
436							# XXX should we have a newline-preserving form of parse?
437
438							# Get rid of initial whitespace from all non-comment GRANVL values.
439	8					66	s/^(\d+:[^\s:][^:]*:)[ \t]+/$1/gm;
440							# xxx problem with line #K:value, which becomes, eg, 4##:K:value
441
442							# Split into array of element pairs. Toss first "false" split.
443							# xxx buggy limited patterns, how not to match newline
444
445							# This is the critical splitting step.
446							# splits line beginning ..... xxx
447							#
448	8					27	s/\n$//; # strip final \n
449	8					177	@$r_elems = ('', 'ANVL', # 3rd elem of 1st triple is
450							# provided by first element resulting from the split
451
452							split /\n^(\d+[:#])([^\s:][^:]):/m
453
454							# xxx problem with line #K:value, which becomes, eg, 4##:K:value
455							);
456
457	8	50				33	defined($$r_elems[2]) or
458							return "error: split failed ($_) on '$record', " .
459							"record at line $linenum";
460
461							# If there was a value with no label at the start of the record,
462							# we deem that interesting enough to keep even though it's not
463							# ANVL-compliant; the caller can prevent this by turning off
464							# 'autoindent', the processing for which will either flag this as
465							# an error or will have inserted one space in front of the value.
466							#
467	8	100				23	$$r_elems[2] =~ /^(\d+): (.*)/ and
468							($$r_elems[0], $$r_elems[2]) = ($1, $2);
469
470							#(undef, @$r_elems) = split /\n^([^\s:][\w ]):\s*/m;
471							# yyy an approach once considered but not used
472							# $num = $.; # linenum
473							# s/^/ $num++ . ":" /e while (/\n/g);
474							# /\G ($N\#.*\n)+ (?=$N[^\#]) /gx # comment block
475							# /\G ($N\S.*\n)+ (?=$N[^\S]) /gx # element on one or more lines
476							# /\G (#.*\n)+(?=[^#])/g
477							# /^#.*?\n[^#]/s # (?=lookahead)
478							#return "_=$_\n" . join(", ", @$r_elems); # to check results
479
480	8					51	return $msg;
481							}
482
483							# XXXXXX for consolidating a:b and a:c into a:b;c, MAJOR constraint
484							# is that b and c CANNOT contain '\|' or we refuse...
485
486	3			3	0	2181	sub anvl_arrayhash { my( $r_elems, $r_hash, $first_only )=@_;
487
488	3	100				13	ref($r_elems) ne "ARRAY" and
489							return "error: 1st arg must reference an array";
490	2	100				7	ref($r_hash) ne "HASH" and
491							return "error: 2nd arg must reference a hash";
492	1	50				3	defined($first_only) or $first_only = 0;
493
494	1					3	my $num_elems = scalar @$r_elems;
495	1	50				4	$num_elems % 3 != 0 and
496							return "error: input array length must be a multiple of 3";
497
498	1	50				3	$num_elems < 1 and return ""; # no elements, we're done
499
500	1					3	my $msg = ''; # xxx needed?
501	1					2	my ($name, $value, $n, $v);
502
503							# We know there must be at least 3 elements, so it's safe to check
504							# the special first triple (index 2 is the only one we look at now)
505							# for an initial unlabeled record element (non-standard ANVL).
506							# If we find something, we make up the name, '_'.
507							#
508	1	50				4	if ($$r_elems[2]) { # first triple is special
509	1					2	$name = '_';
510	0					0	! defined $$r_hash{$name} and
511							$$r_hash{$name} = [ 0 ] # initialize array
512							or
513	1	50	50			12	push @{ $$r_hash{$name} }, 0 # add to array
514							}
515
516	1					5	for ($n = 3; $n < $num_elems; $n += 3) {
517
518	4					7	$name = $$r_elems[$n + 1];
519	1					4	! defined $$r_hash{$name} and
520							$$r_hash{$name} = [ $n ] # initialize array
521							or
522	4	100	100			24	push @{ $$r_hash{$name} }, $n # add to array
523							;
524							}
525	1					3	return $msg;
526							}
527
528							# ANVL value split
529							# xxx rename to anvl_valarray?
530							# returns empty string on success or string beginning "warning:..."
531							# r_svals is reference to an array that will be filled upon return
532	5			5	0	5052	sub anvl_valsplit { my( $value, $r_svals )=@_;
533
534	5	50				13	! defined($value) and
535							return "needs an ANVL value";
536	5	100				16	ref($r_svals) ne "ARRAY" and
537							return "2nd arg must reference an array";
538	4					5	local $_;
539
540							#xxx print "r_svals=$r_svals\n";
541							#xxx print "value=$value\n";
542	4					7	my $warning = ""; # xxx used?
543							#my $ret_subvalues = \$_[1];
544
545							# Assume value is all on one line and split it.
546							#my @svals = split /\\|/, $value;
547	4					23	@$r_svals = split /\\|/, $value;
548							#$_[1] = \@svals;
549							$_ = [ split(/;/, $_) ] # create array of arrays
550	4					41	for (@$r_svals);
551							#xxxprint("svals=", join(", ", @$_), "\n") for (@$r_svals);
552
553							# xxxx need to look for all 3 levels: (change spec)
554							# XXXXXXX value ::= one or more svals (sval1 \| sval2 \| ...)
555							# XXXXXXX sval ::= one or more rvals (rval1 ; rval2 ; ...)
556							# XXXXXXX rval ::= one or more qvals (qval1 (=) qval2 (=) ...)
557							# where s=sub, r=repeated, q=equivalent
558							# XXXXXXX or ?? rval ::= one or more avals (aval1 (=) aval2 (=) ...)
559	4	50				19	return $warning ? "warning: $warning" : "";
560							}
561
562							# Create record hash, elem is key, value is value
563							#
564	2			2	0	2113	sub anvl_rechash { my( $record, $r_hash, $strict )=@_;
565
566	2	50				8	! defined($record) and
567							return "needs an ANVL record";
568	2	100				6	ref($r_hash) ne "HASH" and
569							return "2nd arg must reference a hash";
570
571	1					3	my $msg = "";
572	1					1	my @elems;
573	1	50				5	($msg = anvl_recsplit($record, \@elems, $strict)) and
574							return "anvl_recsplit: $msg";
575
576	1					3	my ($name, $value);
577	1					2	while (1) {
578	2					4	$name = shift @elems;
579	2	100				6	last unless defined $name; # nothing left
580	1					2	$value = shift @elems;
581	1	50				4	if (! defined $$r_hash{$name}) {
582							# Nothing there, so store scalar and continue.
583	1					3	$$r_hash{$name} = $value; # 1st value (non-array)
584	1					2	next;
585							}
586							# If we get here there's something's already there.
587							# Don't overwrite if we're in $strict mode.
588							# xxx document this
589							#
590	0	0				0	$strict and next; # don't overwrite
591
592							# XXXXXxxxx make adding a value policy-driven, eg,
593							# "add" could mean (a) replace, (b) push on end array,
594							# (c) push on start of array, (d) string concatenation,
595							# (d) error.
596							# xxx should anvl_rechash save line numbers?
597							# xxx should anvl_recsplit save line numbers?
598
599							# Whatever is there could be a scalar or an array reference.
600							# If not a reference, create an anonymous array, put a
601							# scalar into it, and refer to the array.
602							#
603	0					0	my $v = $$r_hash{$name}; # add to current
604	0	0				0	$v = [ $v ] # make an array if currently
605							unless ref $v; # there's only one value
606
607							# If we get here, we have a reference to an array,
608							# possibly empty. Either way, we can push onto it.
609							#
610	0					0	push @$v, $value;
611							}
612	1					3	return $msg;
613							}
614
615							# [ !"#\$%&'\*\+,/:;<=>\?@\[\\\]\\|\0]
616							our %anvl_decoding = (
617
618							'sp' => ' ', # decodes to space (0x20)
619							'ex' => '!', # decodes to ! (0x21)
620							'dq' => '"', # decodes to " (0x22)
621							'ns' => '#', # decodes to # (0x23)
622							'do' => '$', # decodes to $ (0x24)
623							'pe' => '%', # decodes to % (0x25)
624							'am' => '&', # decodes to & (0x26)
625							'sq' => "'", # decodes to ' (0x27)
626							'op' => '(', # decodes to ( (0x28)
627							'cp' => ')', # decodes to ) (0x29)
628							'as' => '', # decodes to (0x2a)
629							'pl' => '+', # decodes to + (0x2b)
630							'co' => ',', # decodes to , (0x2c)
631							'sl' => '/', # decodes to / (0x2f)
632							'cn' => ':', # decodes to : (0x3a)
633							'sc' => ';', # decodes to ; (0x3b)
634							'lt' => '<', # decodes to < (0x3c)
635							'eq' => '=', # decodes to = (0x3d)
636							'gt' => '>', # decodes to > (0x3e)
637							'qu' => '?', # decodes to ? (0x3f)
638							'at' => '@', # decodes to @ (0x40)
639							'ox' => '[', # decodes to [ (0x5b)
640							'ls' => '\\', # decodes to \ (0x5c)
641							'cx' => ']', # decodes to ] (0x5d)
642							'vb' => '\|', # decodes to \| (0x7c)
643							'nu' => "\0", # decodes to null (0x00)
644							);
645							# XXXXXXX need way to encode newlines (using '\n' in interim)
646
647							our %anvl_encoding;
648
649							#%cn :
650							#%sc ;
651
652							# xxxxx handle these separately
653							# # XXXX remove %% from erc/anvlspec?
654							# '%' => '%pe', # decodes to % (0x25) xxxx do this first?
655							# '_' => '', # a non-character used as a syntax shim
656							# '{' => '', # a non-character that begins an expansion block
657							# '}' => '', # a non-character that ends an expansion block
658
659							# Takes a single arg.
660							sub anvl_decode {
661
662	5		50	5	0	18	local $_ = shift(@_) \|\| '';
663
664	5					18	pos() = 0; # reset \G for $_ just to be safe
665	5					29	while (/(?=\%\{)/g) { # lookahead; \G matches just before
666	7					11	my $p = pos(); # note \G position before it changes
667	7	100				41	s/\G \%\{ (.*?) \%\}//xs # 's' modifier makes . match \n
668							or last; # if no closing brace, skip match
669	6					14	my $exp_block = $1; # save removed expansion block
670	6					33	$exp_block =~ s/\s+//g; # strip it of all whitespace
671	6					11	pos() = $p; # revert \G to where we started and
672	6					34	s/\G/$exp_block/; # re-insert changed expansion block
673							}
674	5					14	s/\%[}{]//g; # remove any remaining unmatched
675	5					8	s/\%_//g; # xxx %_ -> ''
676	5					11	s/\%\%/\%pe/g; # xxx ??? xxxx???
677							# decode %XY where XY together don't form a valid pair of hex digits
678	5					17	s/\%([g-z][a-z]\|[a-z][g-z])/$anvl_decoding{$1}/g;
679	5					25	return $_;
680							}
681
682							# xxx encoding should be context-sensitive, eg, name, value
683	1			1	0	5515	sub anvl_encode { my( $s )=@_;
684
685							# XXXX just define this in the module??
686	1	50				7	unless (%anvl_encoding) { # one-time definition
687							# This just defines an inverse mapping so we can encode.
688							$anvl_encoding{$anvl_decoding{$_}} = $_
689	1					149	for (keys %anvl_decoding);
690							}
691							$s =~
692	1					14	s/([ !\"#\$\%&'\*\+,\/:;<=>\?@\[\\\]\\|\0])/\%$anvl_encoding{$1}/g;
693	1					10	return $s;
694							}
695
696							# return $name in natural word order, using ANVL inversion points
697							# repeat for each final comma present
698	6			6	0	21	sub anvl_name_naturalize { my( $name )=@_;
699
700	6		50			14	$name \|\|= '';
701	6	50				38	$name =~ /^\s*$/ and return $name; # empty
702
703							# "McCartney, Paul, Sir,,"
704							# a, b, c, d, e,,, -> e d c a, b
705	6					16	my $prefix = '';
706	6					42	while ($name =~ s/,\s*$//) {
707	9	100				106	$name =~ s/^(.),\s([^,]+)(,*$)/$1$3/ and
708							$prefix .= $2 . ' ';
709							}
710	6					43	return $prefix . $name;
711							}
712
713	0			0	0	0	sub anvl_summarize { my( @nodes )=@_; }
714
715							# XXXXX doesn't this really belong in an ERC.pm module?
716							#
717							# ordered list of kernel element names
718							our @kernel_labels = qw(
719							who
720							what
721							when
722							where
723							how
724							why
725							huh
726							);
727							#
728							# This routine inspects and possibly modifies in place the kind of element
729							# array resulting from a call to anvl_recarray(), which splits and ANVL
730							# record. It is useful for transforming short form ERC elements into full
731							# form elements, for example, to expand "erc:a\|b\|c\|d" into the equivalent,
732							# "erc:\nwho:a\nwhat:b\nwhen:c\nwhere:d".
733							# It returns the empty string on success, otherwise an error message.
734							#
735	4			4	0	1636	sub erc_anvl_expand_array { my( $r_elems )=@_;
736
737	4			4		60	use File::ANVL;
	4					7
	4					5380
738	4					4	my ($lineno, $name, $value, $msg, @svals, $sval);
739	4					5	my $me = 'erc_anvl_expand_array';
740	4					5	my $i = 3; # skip first 3 elems (anvl array preamble)
741	4					7	while (1) {
742	21					23	$lineno = $$r_elems[$i++];
743	21		100			45	$name = $$r_elems[$i++] \|\| '';
744	21		100			41	$value = $$r_elems[$i++] \|\| '';
745	21	100				32	last unless defined $lineno; # end of record
746							next # skip unless we have erc-type thing
747	17	100	100			53	if ($name ne 'erc' \|\| $value =~ /^\s*$/);
748							#if ($name !~ /^erc\b/ \|\| $value =~ /^\s*$/);
749							# xxx should do this for full generality
750
751							# If here, we have an erc-type thing with a non-empty value.
752							#
753	2	50				8	($msg = anvl_valsplit($value, \@svals)) and
754							return "error: $me: anvl_valsplit: $msg";
755
756							# XXXX only doing straight "erc" (eg, not erc-about)
757	2					3	my $j = 0;
758	2					4	my @extras = ();
759							# If we exceed known labels, we'll re-use last known label.
760	2					4	my $unknown = $kernel_labels[$#kernel_labels];
761	2					3	foreach $sval (@svals) {
762
763							# xxx not (yet) tranferring subvalue structure
764							# to anvl_om or other conversion
765							# Recall that each $sval is itself a reference to
766							# an array of subvalues (often just one element).
767							#
768	8		33			65	push @extras, # trust kernel_labels order
769							$lineno,
770							$kernel_labels[$j++] \|\| $unknown,
771							join('; ', # trim ends of subvalues
772							map(m/^\s(.?)\s*$/, @$sval)
773							);
774							}
775							# Finally, replace our $value element with '' and append
776							# the new extra values we've just expanded.
777	2					15	splice @$r_elems, $i-1, 1,
778							'', # replaces $value we just used up
779							@extras; # adds new elements from $value
780							}
781	4					12	return ''; # success
782							}
783
784							# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX=============
785							# xxx checkm _in_ obj1 obj2 ... --> returns noids
786							# xxx checkm _out_ id1 id2 ... --> returns objects
787
788							# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX=============
789							# xxx do metadata scan of object before ingest and confirm with user that
790							# the object is correctly identified. This could even be done remotely.
791							# Start with informal staff service for depositing objects, returning a
792							# short url to a stable object, and not clogging up allstaff inboxes with
793							# huge attachments. Also applies to any number of draft docs for review
794							# but in temporary storage (but stable).
795
796							# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX=============
797							# xxx do id generator service with 'expiring' ids. To mint, you tell us
798							# who you are first. To get a perm. id, you agree to use your minted id
799							# and bind it within N months. We track, and warn you several times
800							# until N months as elapsed and then reclaim/recycle the id.
801
802							############################################
803							# Output Multiplexer routines
804							############################################
805
806							# #$erc = "erc: Smith, J.\|The Whole Truth\|2004\|http://example.com/foo/bar";
807							# $errmsg = File::ERC::erc_anvl2erc_turtle ($erc, $rec);
808							# $errmsg and
809							# print("$errmsg\n")
810							# or
811							# print("turtle record:\n$rec\n")
812							# ;
813
814							# xxx anvl_fmt not consistent with om_anvl!
815
816							# Input file(s) from ARGV.
817
818	0			0	0		sub anvl_om { my( $om, $o, $get_anvl ) = (shift, shift, shift);
819
820	0	0					return "anvl_om: 1st arg not an OM object"
821							if ref($om) !~ /^File::OM::/;
822	0						my $p = $om->{outhandle}; # whether 'print' status or small
823	0		0				$o \|\|= anvl_opt_defaults();
824	0		0				$get_anvl \|\|= File::ANVL::make_get_anvl(); # xxx set input here?
825							# XXX test return value!
826
827	0						my $s = ''; # output strings are returned to $s
828	0	0					my $st = $p ? 1 : ''; # returns (stati or strings) accumulate
829	0						my ($msg, $allmsgs, $anvlrec, $lineno, $name, $value, $pat, $n, $nmax);
830	0						my (%rechash, $ne, $nemax, $elem_name); # for alt. element ordering
831	0						my $r_elem_order = $$o{elem_order};
832
833	0						$s = $om->ostream(); # open stream
834
835							# This next line is a fast and compact (if cryptic) way to
836							# accumulate $om->method calls. Used after each method call, it
837							# concatenates strings or ANDs up print statuses, depending on the
838							# outhandle setting. It makes several appearances in this routine.
839							#
840	0	0	0				$p and ($st &&= $s), 1 or ($st .= $s); # accumulate method returns
			0
841
842							# Numbers: record, element in record, and start line
843							#
844	0						my ($startline, $recnum, $elemnum) = (1, 0, 0);
845	0						my ($wslines, $rrlines);
846	0						my $r_elems = $om->{elemsref}; # abbreviation
847							# xxx is that reference kosher?
848
849	0						while (1) {
850
851							# Get an ANVL record and count lines therein. ANVL
852							# records can come from anywhere, but typically from
853							# a file (read in "paragraph" mode) or a BDB database.
854							#
855	0						$anvlrec = trimlines(&$get_anvl(), \$wslines, \$rrlines);
856	0						$startline += $wslines;
857	0	0					last unless $anvlrec;
858
859	0						$recnum++; # increment record counter
860							=for later
861							# XXX anvl_recarray is expensive, do we _need_ to do it if the output is
862							# also in anvl? Maybe call modified [2] here so "find" can work?
863							if (ref($om) eq 'File::OM::ANVL' and ! $r_elem_order) {
864							# xxx do quick expand (short->long erc) here?
865							# xxx _will_ disturb input line numbering
866							$$o{find} and ($anvlrec !~ /$$o{find}/m) and
867							next; # no output has occurred
868							# xxx do quick check for 'show' and next
869							# XXXXXXXX must define lineno for verbose case
870							$s = $om->anvl_rec($anvlrec, $startline, $rrlines);
871							$p and ($st &&= $s), 1 or ($st .= $s);
872							next;
873							}
874							=cut
875	0						$msg = anvl_recarray($anvlrec, $r_elems, $startline, $o);
876	0	0					$msg =~ /^error/ and return "anvl_recarray: $msg";
877	0	0					$msg eq "" or
878							#print $msg, "\n";
879							#$o->{verbose} && print $msg, "\n";
880							$allmsgs .= $msg . "\n"; # save other message
881
882							# NB: apply 'find' here before possible expansion, which
883							# means that a pattern like "who:\s*smith" won't work on
884							# on a short form ANVL record.
885							#
886	0	0	0				$$o{find} and ($anvlrec !~ /$$o{find}/m) and
887							next; # no output has occurred
888
889							# If caller has set $$o{elemsproc} to a code reference,
890							# it is called to process the element array just returned
891							# from anvl_recarray. Typically this is used to convert
892							# (with erc_anvl_expand_array) short form ERCs to long
893							# form ERCs. As an optimization, the code is not called
894							# if $$o{elemsprocpat} (typically, "erc") is set and
895							# doesn't match the raw record string.
896							#
897	0	0	0				if (ref($$o{elemsproc}) eq "CODE" and # if code and either
			0
898							(! ($pat = $$o{elemsprocpat})) # no pattern or
899							\|\| $anvlrec =~ $pat) { # the pattern matches
900
901							# [2] XXX can we call elemsproc directly on the $anvlrec? so we don't need
902							# to call expensive anvl_recarray first?
903	0	0					($msg = &{$$o{elemsproc}}($r_elems)) and
	0
904							return "File::ANVL::elemsproc: $msg";
905							}
906	0	0					ref($om) eq 'File::OM::Turtle' and
907							turtle_set_subject($om, $anvlrec);
908
909							# The orec method is given first crack at a new record.
910							# It sets and/or clears a number of values for keys (eg,
911							# for turtle, $$o{subject}). $recnum is useful for
912							# outputting json separators (eg, no comma if $recnum eq 1)
913							# or record numbers in comments (eg, if $$o{verbose}).
914							# $startline is useful for parser diagnostics (eg, "error
915							# on line 5"). $r_elems and $r_elem_order are needed for
916							# discovering what elements will populate CSV/PSV records.
917							#
918							# XXX this next isn't needed if output is anvl ?! (assuming final NL is
919							# written when closing the record
920	0						$s = $om->orec($recnum, $startline, $r_elems, $r_elem_order);
921	0	0	0				$p and ($st &&= $s), 1 or ($st .= $s);
			0
922
923	0	0					if ($r_elem_order) {
924	0						undef %rechash; # don't want prior indices
925	0	0					($msg = anvl_arrayhash($r_elems, \%rechash)) and
926							return "anvl_arrayhash: $msg";
927	0						$ne = -1; # index into $$r_elem_order
928	0						$nemax = scalar @$r_elem_order;
929							} else {
930	0	0					$n = # index into $$r_elems
931							# XXX don't reference r_elems if we haven't called anvl_recarray
932							$$r_elems[2] # if a no-label value starts
933							? -3 # rec, make sure to output it,
934							: 0; # else skip it (normal)
935	0						$nmax = scalar @$r_elems;
936							}
937
938							# XXX if output is to anvl, can we not skip the entire loop below? but
939							# not if it's possible to output anvl _and_ to care about element order
940							# XXX but still perform $show check and skip not-shown elems
941							# XXX and still perform value inversion if {invert} options
942	0						$elemnum = 0; # true elements, not comments
943	0						undef $name;
944	0						while (1) {
945
946							# Select next candidate element. If we need to
947							# output elements in a certain order, consult the
948							# hash; otherwise, just use "found" order.
949							#
950	0	0					if ($r_elem_order) { # use specified order
951
952	0						$ne++;
953	0	0					$ne >= $nemax and last;
954
955							# For CSV and PSV, the element name at this
956							# position may be deliberately undefined, or
957							# may correspond to a named element missing
958							# in this record, in which case we skip it.
959							#
960	0						$elem_name = $r_elem_order->[$ne];
961	0	0	0				! defined($elem_name) \|\| ! defined(
962							#XXX ignore multiple instances for now
963							$n = $rechash{$elem_name}->[0]
964							) and
965							# for CSV/PSV, output an empty element
966							next;
967
968							} else { # use natural array order
969	0						$n += 3;
970	0	0					$n >= $nmax and last;
971							}
972							# If we get here, $n is defined.
973
974	0						$lineno = $$r_elems[$n];
975	0	0					$name = $n < 3 # for special first triple
976							? '_' # use synthesized name '_'
977							: $$r_elems[$n + 1]; # else real name
978	0		0				$value = $$r_elems[$n + 2] \|\| "";
979
980	0	0					$elemnum++ unless $name eq '#';
981
982							# Skip if 'show' given and not requested.
983	0	0	0				$$o{show} and ("$name: $value" !~ /$$o{show}/m) and
984							(undef $name), # cause elem to be skipped
985							next;
986
987							# Instead of $om->oelem, $om->celem, $om->contelem,
988							# combine open and close into one, but first
989							# naturalize values if called upon.
990							#
991	0	0	0				$$o{invert} and $value =~ /,\s*$/ and
992							$value = anvl_name_naturalize($value);
993							}
994							continue {
995	0						$s = $om->elem($name, $value, $lineno);
996	0	0	0				$p and ($st &&= $s), 1 or ($st .= $s);
			0
997	0						undef $name; # clean the slate
998							}
999	0						$s = $om->crec($recnum);
1000	0	0	0				$p and ($st &&= $s), 1 or ($st .= $s);
			0
1001							}
1002							continue {
1003	0						$startline += $rrlines;
1004							}
1005							# XXX currently doing nothing with $allmsgs warnings!
1006							# should probably print if verbose mode on
1007	0						$s = $om->cstream();
1008	0	0	0				$p and ($st &&= $s), 1 or ($st .= $s);
			0
1009
1010	0						return $st;
1011							}
1012
1013							# xxx document all om options
1014							# xxx should om also have a recstring slot (for anvlrec)?
1015							# xxx pass in turtle_nosubject (default)?
1016							sub turtle_set_subject {
1017
1018	0			0	0		my ($om, $anvlrec) = (shift, shift);
1019	0						my $r_elems = $om->{elemsref};
1020
1021							# In order to find the subject element for Turtle/RDF
1022							# assertions, we need an element name pattern. If one is
1023							# defined in $om->{turtle_subjelpat}, use it. If it's undefined,
1024							# per-record code will use 'where' if it thinks the record
1025							# is an ERC, or use 'identifier\|subject' as a last resort.
1026							# If no element matching subjelpat is found, $om->{subject}
1027							# will default to $om->{turtle_nosubject}.
1028							#
1029	0		0				my $subjpat = $om->{turtle_subjelpat} \|\|
1030							($anvlrec =~ /^erc\s*:/m
1031							? "^where\$" : # 1st where in an 'erc', or
1032							($anvlrec =~ /^(identifier\|subject)\s*:/m
1033							? "^$1\$" : # 1st identifier or subject,
1034							($anvlrec =~ /^(.+)\s:\s(\n\s+)*\w/
1035							? "^$1\$" : # or 1st non-empty element
1036							''))); # or nothing (always matches)
1037
1038							# Now find a 'subject' for our Turtle/RDF assertions.
1039							#
1040	0						my $j = 1; # element names in positions 1, 4, 7, ...
1041	0		0				1 while ($j < $#$r_elems and # quickly find it
			0
1042							@$r_elems[$j] !~ $subjpat and ($j += 3));
1043	0	0	0				$om->{subject} = $j < $#$r_elems && $subjpat ? # if found,
1044							@$r_elems[$j + 1] : # use associated value
1045							$om->{turtle_nosubject}; # else use default
1046	0						return $om->{subject};
1047							}
1048
1049							1;
1050
1051							__END__