line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package File::ANVL; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# XXXXXxxxx make adding a value policy-driven, eg, |
4
|
|
|
|
|
|
|
# "add" could mean (a) replace, (b) push on end array, |
5
|
|
|
|
|
|
|
# (c) push on start of array, (d) string concatenation, |
6
|
|
|
|
|
|
|
# (d) error. |
7
|
|
|
|
|
|
|
|
8
|
4
|
|
|
4
|
|
139546
|
use 5.006; |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
187
|
|
9
|
4
|
|
|
4
|
|
24
|
use strict; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
144
|
|
10
|
4
|
|
|
4
|
|
20
|
use warnings; |
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
152
|
|
11
|
|
|
|
|
|
|
|
12
|
4
|
|
|
4
|
|
18
|
use constant NL => "\n"; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
345
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# ANVL flavors |
15
|
|
|
|
|
|
|
# |
16
|
4
|
|
|
4
|
|
18
|
use constant ANVL => 1; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
159
|
|
17
|
4
|
|
|
4
|
|
20
|
use constant ANVLR => 2; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
160
|
|
18
|
4
|
|
|
4
|
|
18
|
use constant ANVLS => 3; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
14216
|
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our $VERSION; |
21
|
|
|
|
|
|
|
$VERSION = sprintf "%d.%02d", q$Name: Release-1-05 $ =~ /Release-(\d+)-(\d+)/; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
require Exporter; |
24
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our @EXPORT = qw(); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
29
|
|
|
|
|
|
|
anvl_recarray anvl_arrayhash |
30
|
|
|
|
|
|
|
anvl_name_naturalize |
31
|
|
|
|
|
|
|
anvl_rechash anvl_valsplit |
32
|
|
|
|
|
|
|
erc_anvl_expand_array kernel_labels |
33
|
|
|
|
|
|
|
xgetlines trimlines |
34
|
|
|
|
|
|
|
make_get_anvl |
35
|
|
|
|
|
|
|
anvl_opt_defaults anvl_decode anvl_om |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
anvl_encode anvl_recsplit |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
ANVL ANVLR ANVLS ANVLSH |
40
|
|
|
|
|
|
|
); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
our %EXPORT_TAGS = (all => [ @EXPORT_OK ]); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# All these symbols must be listed also in EXPORT_OK (?) |
45
|
|
|
|
|
|
|
# |
46
|
|
|
|
|
|
|
our @EXPORT_FAIL = qw( |
47
|
|
|
|
|
|
|
ANVL ANVLR ANVLS ANVLSH |
48
|
|
|
|
|
|
|
); |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
our $anvl_mode = 'ANVL'; # default mode |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# This is a magic routine that the Exporter calls for any unknown symbols. |
53
|
|
|
|
|
|
|
# |
54
|
3
|
|
|
3
|
0
|
490
|
sub export_fail { my( $class, @symbols )=@_; |
55
|
|
|
|
|
|
|
|
56
|
3
|
|
|
|
|
14
|
$anvl_mode = $_ for (@symbols); |
57
|
3
|
|
|
|
|
2767
|
return (); |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Initialize or re-initialize options to factory defaults. |
61
|
|
|
|
|
|
|
# |
62
|
|
|
|
|
|
|
sub anvl_opt_defaults { return { |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Input options |
65
|
|
|
|
|
|
|
# |
66
|
7
|
|
|
7
|
0
|
61
|
autoindent => 1, # yes, fix recoverably bad indention |
67
|
|
|
|
|
|
|
comments => 0, # no, don't parse comments |
68
|
|
|
|
|
|
|
elemsproc => # to expand short form ERCs (if any) |
69
|
|
|
|
|
|
|
\&File::ANVL::erc_anvl_expand_array, |
70
|
|
|
|
|
|
|
elemsprocpat => # no call from anvl_om if no match |
71
|
|
|
|
|
|
|
qr/^erc:/m, # in rec; no call if set and matches |
72
|
|
|
|
|
|
|
}; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# xxx decide on good name for short form and long form ERC |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Returns a closure that calls an input reader with that's set to *ARGV |
78
|
|
|
|
|
|
|
# by default. If $reader and $readee are defined, they are stored in the |
79
|
|
|
|
|
|
|
# closure and all reads will be performed by calling &$reader($readee). |
80
|
|
|
|
|
|
|
# |
81
|
|
|
|
|
|
|
# The default reader collects text lines from a file and returns all the |
82
|
|
|
|
|
|
|
# lines associated with the next "record", which is considered to start |
83
|
|
|
|
|
|
|
# wherever the read pointer happens to be and continues to the first two |
84
|
|
|
|
|
|
|
# blank lines encountered that occur after "substance" is detected. |
85
|
|
|
|
|
|
|
# Substance is defined to be at least one non-whitespace character |
86
|
|
|
|
|
|
|
# occurring on a non-comment line. Comment and blank lines that precede |
87
|
|
|
|
|
|
|
# a record with substance are returned, but any such lines that follow |
88
|
|
|
|
|
|
|
# that the final record are discarded. |
89
|
|
|
|
|
|
|
# |
90
|
1
|
|
|
1
|
0
|
389
|
sub make_get_anvl { my( $reader, $readee ) = shift; |
91
|
|
|
|
|
|
|
|
92
|
1
|
50
|
|
|
|
6
|
unless ($reader) { |
93
|
|
|
|
|
|
|
|
94
|
1
|
|
|
|
|
3
|
my $rec; # returned record |
95
|
|
|
|
|
|
|
my $s; # next increment of input |
96
|
0
|
|
|
|
|
0
|
my $substance; # boolean detecting substance |
97
|
|
|
|
|
|
|
|
98
|
4
|
|
|
4
|
|
1393
|
return sub { my( $filehandle ) = shift; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Returns a subroutine, call it get_anvl() |
101
|
|
|
|
|
|
|
# |
102
|
|
|
|
|
|
|
# Usage: $record = get_anvl( [$filehandle] ); |
103
|
|
|
|
|
|
|
# |
104
|
|
|
|
|
|
|
# It reads ANVL input records as text lines from the |
105
|
|
|
|
|
|
|
# file given by $filehandle (*ARGV by default, which can |
106
|
|
|
|
|
|
|
# process multiple files via while loop magic). Usually, |
107
|
|
|
|
|
|
|
# the closure holds enough state information, set up by |
108
|
|
|
|
|
|
|
# make_get_anvl(), that get_anvl() can be called without |
109
|
|
|
|
|
|
|
# arguments. get_anvl() returns the record read as a |
110
|
|
|
|
|
|
|
# string, or returns undef on end of input or error. |
111
|
|
|
|
|
|
|
# |
112
|
4
|
|
33
|
|
|
18
|
$filehandle ||= *ARGV; |
113
|
4
|
|
|
|
|
22
|
local $/ = NL.NL; # a kind of "paragraph" input mode |
114
|
|
|
|
|
|
|
# $/ === $INPUT_RECORD_SEPARATOR |
115
|
4
|
|
|
|
|
6
|
$rec = ''; |
116
|
4
|
|
66
|
|
|
118
|
1 while ( |
|
|
|
100
|
|
|
|
|
117
|
|
|
|
|
|
|
defined($s = <$filehandle>) and # read to eof and |
118
|
|
|
|
|
|
|
($rec .= $s), # save everything, but stop |
119
|
|
|
|
|
|
|
$substance = # when we detect substance, ie, |
120
|
|
|
|
|
|
|
$s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m, |
121
|
|
|
|
|
|
|
! $substance # non-comment with non-space |
122
|
|
|
|
|
|
|
); |
123
|
4
|
100
|
|
|
|
24
|
return $substance ? |
124
|
|
|
|
|
|
|
$rec : # return either collected record or undef |
125
|
|
|
|
|
|
|
undef; # any final blank or comment lines are tossed |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# yyy If more than one file, line numbers normally accumulate |
128
|
|
|
|
|
|
|
# across files. Should we preserve line numbers within each |
129
|
|
|
|
|
|
|
# files? (If so, use "close ARGV" (Perl idiom) to cause $. |
130
|
|
|
|
|
|
|
# (linenum) to be reset between files. |
131
|
1
|
|
|
|
|
9
|
}; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# If we get here, $reader should reference an input method and |
135
|
|
|
|
|
|
|
# $readee is assumed to be any value (eg, BDB handle) that may |
136
|
|
|
|
|
|
|
# permit &reader to get the next record. Any other arguments |
137
|
|
|
|
|
|
|
# passed to the get_anvl() function below will be passed along |
138
|
|
|
|
|
|
|
# too, ie, $reader($readee, @_). |
139
|
|
|
|
|
|
|
# |
140
|
0
|
0
|
|
|
|
0
|
ref($reader) eq "CODE" or return undef; |
141
|
|
|
|
|
|
|
|
142
|
0
|
|
|
|
|
0
|
my $rec; # returned record |
143
|
|
|
|
|
|
|
my $s; # next increment of input |
144
|
0
|
|
|
|
|
0
|
my $substance; # boolean detecting substance |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
return sub { |
147
|
0
|
|
|
0
|
|
0
|
$rec = ''; |
148
|
0
|
|
0
|
|
|
0
|
1 while ( |
|
|
|
0
|
|
|
|
|
149
|
|
|
|
|
|
|
# XXX should this accumulate in general??? or |
150
|
|
|
|
|
|
|
# should we leave it to the definer of $reader? |
151
|
|
|
|
|
|
|
defined($s = &reader($readee, @_)) and # read and |
152
|
|
|
|
|
|
|
($rec .= $s), # save everything, but stop |
153
|
|
|
|
|
|
|
$substance = # when we detect substance, ie, |
154
|
|
|
|
|
|
|
$s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m, |
155
|
|
|
|
|
|
|
! $substance # non-comment with non-space |
156
|
|
|
|
|
|
|
); |
157
|
0
|
0
|
|
|
|
0
|
return $substance ? |
158
|
|
|
|
|
|
|
$rec : # return either collected record or undef |
159
|
|
|
|
|
|
|
undef; # any final blank or comment lines are tossed |
160
|
0
|
|
|
|
|
0
|
}; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# XXX deprecated! see sub make_get_anvl |
164
|
0
|
|
|
0
|
0
|
0
|
sub xgetlines { my( $filehandle )=@_; |
165
|
|
|
|
|
|
|
|
166
|
0
|
|
|
|
|
0
|
my $rec = ''; # returned record |
167
|
0
|
|
|
|
|
0
|
my $s; # next increment of input |
168
|
0
|
|
|
|
|
0
|
local $/ = NL.NL; # a kind of "paragraph" input mode |
169
|
|
|
|
|
|
|
# $/ === $INPUT_RECORD_SEPARATOR |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# If $filehandle is specified, use the Perl <$filehandle> idiom to |
172
|
|
|
|
|
|
|
# return next unit of input (normally a line, but here a para). |
173
|
|
|
|
|
|
|
# |
174
|
0
|
|
0
|
|
|
0
|
$filehandle ||= *ARGV; |
175
|
0
|
|
0
|
|
|
0
|
1 while ( defined( $s = <$filehandle> ) and # read up to two \n's |
|
|
|
0
|
|
|
|
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# If we get here, $s now contains a block to save. |
178
|
|
|
|
|
|
|
# |
179
|
|
|
|
|
|
|
$rec .= $s, |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# We continue reading only if there's no substance, |
182
|
|
|
|
|
|
|
# ie, no line read starts with a non-comment and no |
183
|
|
|
|
|
|
|
# non-comment line read contains non-whitespace |
184
|
|
|
|
|
|
|
# |
185
|
|
|
|
|
|
|
#$s !~ /^[^#\s]/m and # if no line read starts with |
186
|
|
|
|
|
|
|
# $s !~ /^[^#].*\S/m # or contains substance |
187
|
|
|
|
|
|
|
#(! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m) and |
188
|
|
|
|
|
|
|
# $rec .= "substance found in <$s>\n"), |
189
|
|
|
|
|
|
|
! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m) |
190
|
|
|
|
|
|
|
); |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
#$s !~ /^\s*[^#\s]/m || # match no line of susbstance |
193
|
|
|
|
|
|
|
#$rec .= $s |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
#($rec .= $s), # only "paragraphs"; save everything |
196
|
|
|
|
|
|
|
#$s !~ /^\s*[^#\s]/m # but stop when substance seen |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
#$s !~ /^\s*[^#\s]/m # but stop when substance seen |
199
|
|
|
|
|
|
|
# and while $s matches no line starting with ^# |
200
|
|
|
|
|
|
|
# while every line in $s is either all whitespace |
201
|
|
|
|
|
|
|
# or all comment (ie, first non-ws char is #) |
202
|
|
|
|
|
|
|
#$s =~ /\S/ # but stop when we see substance |
203
|
|
|
|
|
|
|
#$s !~ /\S/ # but stop when we see substance |
204
|
|
|
|
|
|
|
# substance means \S on a non-comment line |
205
|
|
|
|
|
|
|
# $s !~ /^\S|[^#].*\S/m |
206
|
|
|
|
|
|
|
# ! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m) |
207
|
|
|
|
|
|
|
# |
208
|
|
|
|
|
|
|
#); # /^\s*[^#\s]/m |
209
|
0
|
0
|
0
|
|
|
0
|
defined($s) or |
210
|
|
|
|
|
|
|
return $rec || undef; # almost eof or real eof |
211
|
0
|
|
|
|
|
0
|
return $rec; |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# XXXX what happens when one file ends prematurely and |
214
|
|
|
|
|
|
|
# another begins? does last record for first file get |
215
|
|
|
|
|
|
|
# returned glued to beginning of first recond of 2nd file? |
216
|
|
|
|
|
|
|
# If more than one file, line numbers normally just accumulate. |
217
|
|
|
|
|
|
|
# We want to preserve line numbers within files, so we use this |
218
|
|
|
|
|
|
|
# next Perl idiom to cause $. (linenum) to be reset between files. |
219
|
|
|
|
|
|
|
# |
220
|
|
|
|
|
|
|
#close ARGV if eof; # reset line numbers between files |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# args: record, reference to whitespace lines, reference to real record lines |
224
|
|
|
|
|
|
|
# xxx replace \n with NL throughout |
225
|
|
|
|
|
|
|
# returns undef when $rec trims to nothing (EOF) |
226
|
5
|
|
|
5
|
0
|
1076
|
sub trimlines { my( $rec, $r_wslines, $r_rrlines )=@_; |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# $rec might legitimately be undefined if called as |
229
|
|
|
|
|
|
|
# trimlines(getlines(), ...) |
230
|
|
|
|
|
|
|
# |
231
|
5
|
|
100
|
|
|
15
|
$rec ||= ''; |
232
|
|
|
|
|
|
|
|
233
|
5
|
|
|
|
|
19
|
$rec =~ s/^(\s*)//; # '*' guarantees $1 will be defined |
234
|
5
|
|
|
|
|
14
|
my $blanksection = $1; |
235
|
5
|
|
|
|
|
7
|
my @newlines; |
236
|
|
|
|
|
|
|
|
237
|
5
|
50
|
|
|
|
32
|
ref($r_wslines) eq 'SCALAR' and # if given, define it |
238
|
|
|
|
|
|
|
$$r_wslines = scalar(@newlines = $blanksection =~ /\n/g); |
239
|
|
|
|
|
|
|
|
240
|
5
|
50
|
|
|
|
42
|
ref($r_rrlines) eq 'SCALAR' and # if given, define it |
241
|
|
|
|
|
|
|
$$r_rrlines = scalar(@newlines = $rec =~ /\n/g); |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
#$$r_rrlines = scalar($rec =~ /$/gm); # xxx why doesn't this work? |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# At this point $r_wslines and $r_rrlines (if supplied) are safely |
246
|
|
|
|
|
|
|
# defined and ready for return. |
247
|
|
|
|
|
|
|
# |
248
|
5
|
100
|
|
|
|
20
|
$rec or # empty record (but $r_wslines may be defined) |
249
|
|
|
|
|
|
|
return undef; # signal eof-style return |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
#$rec =~ /\n\n$/ and # ok record ending -- this is |
252
|
|
|
|
|
|
|
# return $rec; # the usual return |
253
|
|
|
|
|
|
|
#$rec =~ s/\n*$/\n\n/; # normalize premature eof ending |
254
|
3
|
|
|
|
|
9
|
return $rec; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# returns empty string on success or string beginning "warning:..." |
258
|
|
|
|
|
|
|
# third arg (0 or 1) optional |
259
|
|
|
|
|
|
|
# elems is returned array of name value pairs |
260
|
|
|
|
|
|
|
#DEPRECATED: |
261
|
1
|
|
|
1
|
0
|
1
|
sub anvl_recsplit { my( $record, $r_elems, $strict )=@_; |
262
|
|
|
|
|
|
|
|
263
|
1
|
50
|
|
|
|
3
|
! defined($record) and |
264
|
|
|
|
|
|
|
return "needs an ANVL record"; |
265
|
1
|
50
|
|
|
|
5
|
ref($r_elems) ne "ARRAY" and |
266
|
|
|
|
|
|
|
return "2nd arg must reference an array"; |
267
|
|
|
|
|
|
|
|
268
|
1
|
|
|
|
|
1
|
my $strict_default = 0; |
269
|
1
|
50
|
|
|
|
3
|
! defined($strict) and |
270
|
|
|
|
|
|
|
$strict = $strict_default; |
271
|
|
|
|
|
|
|
|
272
|
1
|
|
|
|
|
2
|
local $_ = $record; |
273
|
1
|
|
|
|
|
5
|
s/^\s*//; s/\s*$//; # trim both ends |
|
1
|
|
|
|
|
5
|
|
274
|
1
|
50
|
|
|
|
5
|
/\n$/ or s/$/\n/; # normalize end of record to \n |
275
|
|
|
|
|
|
|
|
276
|
1
|
50
|
|
|
|
5
|
/\n\n/ and |
277
|
|
|
|
|
|
|
return "record should have no internal blank line(s)"; |
278
|
|
|
|
|
|
|
# xxx adjust regexp for ANVLR |
279
|
1
|
50
|
|
|
|
5
|
! /^[^\s:][\w ]*:/ and # match against first element |
280
|
|
|
|
|
|
|
return "well-formed record begins with a label and colon"; |
281
|
|
|
|
|
|
|
|
282
|
1
|
50
|
|
|
|
5
|
$anvl_mode ne ANVLR and |
283
|
|
|
|
|
|
|
s/^#.*\n//gm; # remove comments plus final \n |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# If we're not in strict parse mode, correct for common error |
286
|
|
|
|
|
|
|
# where continued value is not indented. We can pretty safely |
287
|
|
|
|
|
|
|
# assume a continued value if a line is flush left and contains |
288
|
|
|
|
|
|
|
# no colon at all. |
289
|
|
|
|
|
|
|
# |
290
|
|
|
|
|
|
|
# This next substitution match needs to be multi-line to avoid |
291
|
|
|
|
|
|
|
# more explicit looping. |
292
|
|
|
|
|
|
|
# |
293
|
|
|
|
|
|
|
# XXX there's probably a more efficient way to do this. |
294
|
1
|
|
|
|
|
2
|
my $msg = ""; |
295
|
1
|
|
|
|
|
5
|
my $indented = s/^([^\s:][^:]*)$/ $1/gm; |
296
|
1
|
50
|
|
|
|
3
|
if ($indented) { |
297
|
0
|
0
|
|
|
|
0
|
$strict and |
298
|
|
|
|
|
|
|
(@$r_elems = undef), |
299
|
|
|
|
|
|
|
return "error: $indented unindented value line(s)"; |
300
|
0
|
|
|
|
|
0
|
$msg = "warning: indenting $indented value line(s)"; |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
# if we get here, assume standard continuation lines, and join them |
303
|
|
|
|
|
|
|
# (GRANVL style) |
304
|
|
|
|
|
|
|
# |
305
|
1
|
|
|
|
|
3
|
s/\n\s+/ /g; |
306
|
|
|
|
|
|
|
# XXX should have a newline-preserving form of parse? |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Split into array element pairs. Toss first "false" split. |
309
|
|
|
|
|
|
|
# xxx buggy limited patterns, how not to match newline |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
# This is the critical splitting step. |
312
|
|
|
|
|
|
|
# splits line beginning ..... xxx |
313
|
|
|
|
|
|
|
# |
314
|
1
|
|
|
|
|
4
|
s/\n$//; # strip final \n |
315
|
1
|
|
|
|
|
7
|
(undef, @$r_elems) = split /\n*^([^\s:][\w ]*):\s*/m; |
316
|
|
|
|
|
|
|
|
317
|
1
|
|
|
|
|
7
|
return $msg; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# xxxxxxxx respond to 'comments' (def. off), 'autoindent' (def. on), |
321
|
|
|
|
|
|
|
# 'anvlr' (def. off), 'granvl' ? |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# This is the closest thing to a reference implementation of an ANVL |
324
|
|
|
|
|
|
|
# record parser. |
325
|
|
|
|
|
|
|
# It returns "" on success, or "error: ..." or "warning: ..." |
326
|
|
|
|
|
|
|
|
327
|
10
|
|
|
10
|
0
|
6286
|
sub anvl_recarray { my( $record, $r_elems, $linenum, $o )=@_; |
328
|
|
|
|
|
|
|
|
329
|
10
|
50
|
|
|
|
31
|
! defined($record) and |
330
|
|
|
|
|
|
|
return "error: no input record"; |
331
|
10
|
100
|
|
|
|
32
|
ref($r_elems) ne "ARRAY" and |
332
|
|
|
|
|
|
|
return "error: 2nd arg must reference an array"; |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# Note: this input $linenum is pure digits, while $lineno on |
335
|
|
|
|
|
|
|
# output is a combination of digits and type (':' or '#') |
336
|
|
|
|
|
|
|
# |
337
|
9
|
100
|
|
|
|
23
|
defined($linenum) or $linenum = 1; |
338
|
9
|
50
|
|
|
|
30
|
$linenum =~ /\D/ and |
339
|
|
|
|
|
|
|
return "error: 3rd arg ($linenum) must be a positive integer"; |
340
|
|
|
|
|
|
|
# XXX can't this be optimized a bit to keep defaults around? |
341
|
9
|
|
66
|
|
|
34
|
$o ||= anvl_opt_defaults(); |
342
|
9
|
50
|
|
|
|
26
|
ref($o) ne "HASH" and |
343
|
|
|
|
|
|
|
return "error: 4th arg must reference a hash"; |
344
|
|
|
|
|
|
|
|
345
|
9
|
|
|
|
|
12
|
local $_ = $record; # localizing $_ prevents modifying global $_ |
346
|
|
|
|
|
|
|
|
347
|
9
|
|
|
|
|
34
|
s/^\s*//; s/\s*$//; # trim both ends |
|
9
|
|
|
|
|
172
|
|
348
|
9
|
50
|
|
|
|
42
|
/\n$/ or s/$/\n/; # normalize end of record to \n |
349
|
|
|
|
|
|
|
#s/\n?$/\nEOR:/; # whether record ends in \n or not, normalize |
350
|
|
|
|
|
|
|
# # end of record to \nEOR: (note no \n after \nEOR:) |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# Reject some malformed cases. |
353
|
|
|
|
|
|
|
# |
354
|
|
|
|
|
|
|
#/\n\n/ and |
355
|
|
|
|
|
|
|
# return "error: record should have no internal blank line(s)"; |
356
|
|
|
|
|
|
|
# xxx adjust regexp for ANVLR |
357
|
|
|
|
|
|
|
# XXX fix so record can consist of nothing but comments and/or whitespace; |
358
|
|
|
|
|
|
|
# comments _may_ be recognized in regular records, but not in this kind |
359
|
|
|
|
|
|
|
#/^[^\s:][\w ]*:/m or # match against first element |
360
|
|
|
|
|
|
|
# return "error: record ($_) should begin with a label and colon"; |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
# Any other unindented line not containing a colon will either |
363
|
|
|
|
|
|
|
# cause an error or will be automatically indented. |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# xxx what about $anvl_mode ne ANVLR and?? |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# Now we synthesize stuff (line numbers and pseudo-element names for |
368
|
|
|
|
|
|
|
# any comments) in order to create a uniform structure on each line, |
369
|
|
|
|
|
|
|
# so that we can finally call 'split' to bust apart that structure |
370
|
|
|
|
|
|
|
# into a Perl array in which every 3-element group corresponds to |
371
|
|
|
|
|
|
|
# 1. a line number, |
372
|
|
|
|
|
|
|
# 2. a label, and |
373
|
|
|
|
|
|
|
# 3. a value. |
374
|
|
|
|
|
|
|
# |
375
|
|
|
|
|
|
|
# First insert a line number and ":" in front of each line. |
376
|
|
|
|
|
|
|
# |
377
|
9
|
|
|
|
|
13
|
my $num = $linenum; |
378
|
9
|
|
|
|
|
55
|
s/^/ $num++ . ":" /gem; # put a line number on each line |
|
38
|
|
|
|
|
121
|
|
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# Remove blank lines, now that line numbers have been preserved. |
381
|
|
|
|
|
|
|
# |
382
|
9
|
|
|
|
|
48
|
s/^\d+:[^\S\n]*\n//gm; |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Now, if we're not deleting comments, insert a pseudo-element |
385
|
|
|
|
|
|
|
# name '#:' in front of each comment while also changing the ':' |
386
|
|
|
|
|
|
|
# after the line numer to '#'. This means that all lines will |
387
|
|
|
|
|
|
|
# begin with a line number followed by ':' for real elements or |
388
|
|
|
|
|
|
|
# by '#' for comment elements. Eg, '# foo' on line 3 becomes |
389
|
|
|
|
|
|
|
# '3##:# foo', which conforms to the eventual split pattern we |
390
|
|
|
|
|
|
|
# rely on (at end). |
391
|
|
|
|
|
|
|
# |
392
|
|
|
|
|
|
|
# xxx problem with line #K:value, which becomes, eg, 4##:K:value |
393
|
9
|
100
|
66
|
|
|
61
|
$$o{comments} and # if we're keeping comments |
394
|
|
|
|
|
|
|
s/^(\d+):#/$1##:/gm, 1 |
395
|
|
|
|
|
|
|
# ^^^ |
396
|
|
|
|
|
|
|
# 123 |
397
|
|
|
|
|
|
|
# 1=separator, 2=pseudo-name, |
398
|
|
|
|
|
|
|
# 3=original value minus '#' starts after : |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
or # else completely delete comments |
401
|
|
|
|
|
|
|
s/^\d+:#.*\n//gm # up to and including final \n |
402
|
|
|
|
|
|
|
; |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# Return if nothing's left after deleting blank lines and comments. |
405
|
|
|
|
|
|
|
# |
406
|
9
|
50
|
|
|
|
31
|
/^\s*$/s and |
407
|
|
|
|
|
|
|
return "warning: record at line $linenum has no content"; |
408
|
|
|
|
|
|
|
|
409
|
9
|
|
|
|
|
14
|
my $msg = ""; # default return message |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# If we're not in strict parse mode, correct for common error |
412
|
|
|
|
|
|
|
# where continued value is not indented. We can pretty safely |
413
|
|
|
|
|
|
|
# assume a continued value if a line is flush left and contains |
414
|
|
|
|
|
|
|
# no colon at all. |
415
|
|
|
|
|
|
|
# |
416
|
|
|
|
|
|
|
# This next substitution match is multi-line to avoid explicit |
417
|
|
|
|
|
|
|
# looping (yyy is this an efficient way to do it?). It indents |
418
|
|
|
|
|
|
|
# by one space any line starting without a space or colon and |
419
|
|
|
|
|
|
|
# that has no instance of a colon until end of line. |
420
|
|
|
|
|
|
|
# |
421
|
9
|
|
|
|
|
57
|
my $indented = s/^(\d+:)([^\s:][^:]*)$/$1 $2/gm; |
422
|
9
|
100
|
|
|
|
21
|
if ($indented) { |
423
|
3
|
100
|
|
|
|
12
|
unless ($$o{autoindent}) { |
424
|
1
|
|
|
|
|
6
|
@$r_elems = undef; # XXXXX isn't this too much? |
425
|
1
|
|
|
|
|
7
|
return "error: $indented unindented value line(s)"; |
426
|
|
|
|
|
|
|
} |
427
|
2
|
|
|
|
|
7
|
$msg = "warning: indenting $indented value line(s)"; |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Now we join the (normalized) continuation lines (GRANVL style) |
431
|
|
|
|
|
|
|
# so each element-value pair is on one line. The + in [ \t]+ is |
432
|
|
|
|
|
|
|
# very important; we can't use \s+ here because \s matches a \n. |
433
|
|
|
|
|
|
|
# |
434
|
8
|
|
|
|
|
41
|
s/\n\d+:[ \t]+/ /g; |
435
|
|
|
|
|
|
|
#s/\n\d+:\s+/ /g; |
436
|
|
|
|
|
|
|
# XXX should we have a newline-preserving form of parse? |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# Get rid of initial whitespace from all non-comment GRANVL values. |
439
|
8
|
|
|
|
|
66
|
s/^(\d+:[^\s:][^:]*:)[ \t]+/$1/gm; |
440
|
|
|
|
|
|
|
# xxx problem with line #K:value, which becomes, eg, 4##:K:value |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# Split into array of element pairs. Toss first "false" split. |
443
|
|
|
|
|
|
|
# xxx buggy limited patterns, how not to match newline |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# This is the critical splitting step. |
446
|
|
|
|
|
|
|
# splits line beginning ..... xxx |
447
|
|
|
|
|
|
|
# |
448
|
8
|
|
|
|
|
27
|
s/\n$//; # strip final \n |
449
|
8
|
|
|
|
|
177
|
@$r_elems = ('', 'ANVL', # 3rd elem of 1st triple is |
450
|
|
|
|
|
|
|
# provided by first element resulting from the split |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
split /\n*^(\d+[:#])([^\s:][^:]*):/m |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
# xxx problem with line #K:value, which becomes, eg, 4##:K:value |
455
|
|
|
|
|
|
|
); |
456
|
|
|
|
|
|
|
|
457
|
8
|
50
|
|
|
|
33
|
defined($$r_elems[2]) or |
458
|
|
|
|
|
|
|
return "error: split failed ($_) on '$record', " . |
459
|
|
|
|
|
|
|
"record at line $linenum"; |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# If there was a value with no label at the start of the record, |
462
|
|
|
|
|
|
|
# we deem that interesting enough to keep even though it's not |
463
|
|
|
|
|
|
|
# ANVL-compliant; the caller can prevent this by turning off |
464
|
|
|
|
|
|
|
# 'autoindent', the processing for which will either flag this as |
465
|
|
|
|
|
|
|
# an error or will have inserted one space in front of the value. |
466
|
|
|
|
|
|
|
# |
467
|
8
|
100
|
|
|
|
23
|
$$r_elems[2] =~ /^(\d+): (.*)/ and |
468
|
|
|
|
|
|
|
($$r_elems[0], $$r_elems[2]) = ($1, $2); |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
#(undef, @$r_elems) = split /\n*^([^\s:][\w ]*):\s*/m; |
471
|
|
|
|
|
|
|
# yyy an approach once considered but not used |
472
|
|
|
|
|
|
|
# $num = $.; # linenum |
473
|
|
|
|
|
|
|
# s/^/ $num++ . ":" /e while (/\n/g); |
474
|
|
|
|
|
|
|
# /\G ($N\#.*\n)+ (?=$N[^\#]) /gx # comment block |
475
|
|
|
|
|
|
|
# /\G ($N\S.*\n)+ (?=$N[^\S]) /gx # element on one or more lines |
476
|
|
|
|
|
|
|
# /\G (#.*\n)+(?=[^#])/g |
477
|
|
|
|
|
|
|
# /^#.*?\n[^#]/s # (?=lookahead) |
478
|
|
|
|
|
|
|
#return "_=$_\n" . join(", ", @$r_elems); # to check results |
479
|
|
|
|
|
|
|
|
480
|
8
|
|
|
|
|
51
|
return $msg; |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# XXXXXX for consolidating a:b and a:c into a:b;c, MAJOR constraint |
484
|
|
|
|
|
|
|
# is that b and c CANNOT contain '|' or we refuse... |
485
|
|
|
|
|
|
|
|
486
|
3
|
|
|
3
|
0
|
2181
|
sub anvl_arrayhash { my( $r_elems, $r_hash, $first_only )=@_; |
487
|
|
|
|
|
|
|
|
488
|
3
|
100
|
|
|
|
13
|
ref($r_elems) ne "ARRAY" and |
489
|
|
|
|
|
|
|
return "error: 1st arg must reference an array"; |
490
|
2
|
100
|
|
|
|
7
|
ref($r_hash) ne "HASH" and |
491
|
|
|
|
|
|
|
return "error: 2nd arg must reference a hash"; |
492
|
1
|
50
|
|
|
|
3
|
defined($first_only) or $first_only = 0; |
493
|
|
|
|
|
|
|
|
494
|
1
|
|
|
|
|
3
|
my $num_elems = scalar @$r_elems; |
495
|
1
|
50
|
|
|
|
4
|
$num_elems % 3 != 0 and |
496
|
|
|
|
|
|
|
return "error: input array length must be a multiple of 3"; |
497
|
|
|
|
|
|
|
|
498
|
1
|
50
|
|
|
|
3
|
$num_elems < 1 and return ""; # no elements, we're done |
499
|
|
|
|
|
|
|
|
500
|
1
|
|
|
|
|
3
|
my $msg = ''; # xxx needed? |
501
|
1
|
|
|
|
|
2
|
my ($name, $value, $n, $v); |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
# We know there must be at least 3 elements, so it's safe to check |
504
|
|
|
|
|
|
|
# the special first triple (index 2 is the only one we look at now) |
505
|
|
|
|
|
|
|
# for an initial unlabeled record element (non-standard ANVL). |
506
|
|
|
|
|
|
|
# If we find something, we make up the name, '_'. |
507
|
|
|
|
|
|
|
# |
508
|
1
|
50
|
|
|
|
4
|
if ($$r_elems[2]) { # first triple is special |
509
|
1
|
|
|
|
|
2
|
$name = '_'; |
510
|
0
|
|
|
|
|
0
|
! defined $$r_hash{$name} and |
511
|
|
|
|
|
|
|
$$r_hash{$name} = [ 0 ] # initialize array |
512
|
|
|
|
|
|
|
or |
513
|
1
|
50
|
50
|
|
|
12
|
push @{ $$r_hash{$name} }, 0 # add to array |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
1
|
|
|
|
|
5
|
for ($n = 3; $n < $num_elems; $n += 3) { |
517
|
|
|
|
|
|
|
|
518
|
4
|
|
|
|
|
7
|
$name = $$r_elems[$n + 1]; |
519
|
1
|
|
|
|
|
4
|
! defined $$r_hash{$name} and |
520
|
|
|
|
|
|
|
$$r_hash{$name} = [ $n ] # initialize array |
521
|
|
|
|
|
|
|
or |
522
|
4
|
100
|
100
|
|
|
24
|
push @{ $$r_hash{$name} }, $n # add to array |
523
|
|
|
|
|
|
|
; |
524
|
|
|
|
|
|
|
} |
525
|
1
|
|
|
|
|
3
|
return $msg; |
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# ANVL value split |
529
|
|
|
|
|
|
|
# xxx rename to anvl_valarray? |
530
|
|
|
|
|
|
|
# returns empty string on success or string beginning "warning:..." |
531
|
|
|
|
|
|
|
# r_svals is reference to an array that will be filled upon return |
532
|
5
|
|
|
5
|
0
|
5052
|
sub anvl_valsplit { my( $value, $r_svals )=@_; |
533
|
|
|
|
|
|
|
|
534
|
5
|
50
|
|
|
|
13
|
! defined($value) and |
535
|
|
|
|
|
|
|
return "needs an ANVL value"; |
536
|
5
|
100
|
|
|
|
16
|
ref($r_svals) ne "ARRAY" and |
537
|
|
|
|
|
|
|
return "2nd arg must reference an array"; |
538
|
4
|
|
|
|
|
5
|
local $_; |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
#xxx print "r_svals=$r_svals\n"; |
541
|
|
|
|
|
|
|
#xxx print "value=$value\n"; |
542
|
4
|
|
|
|
|
7
|
my $warning = ""; # xxx used? |
543
|
|
|
|
|
|
|
#my $ret_subvalues = \$_[1]; |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# Assume value is all on one line and split it. |
546
|
|
|
|
|
|
|
#my @svals = split /\|/, $value; |
547
|
4
|
|
|
|
|
23
|
@$r_svals = split /\|/, $value; |
548
|
|
|
|
|
|
|
#$_[1] = \@svals; |
549
|
|
|
|
|
|
|
$_ = [ split(/;/, $_) ] # create array of arrays |
550
|
4
|
|
|
|
|
41
|
for (@$r_svals); |
551
|
|
|
|
|
|
|
#xxxprint("svals=", join(", ", @$_), "\n") for (@$r_svals); |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# xxxx need to look for all 3 levels: (change spec) |
554
|
|
|
|
|
|
|
# XXXXXXX value ::= one or more svals (sval1 | sval2 | ...) |
555
|
|
|
|
|
|
|
# XXXXXXX sval ::= one or more rvals (rval1 ; rval2 ; ...) |
556
|
|
|
|
|
|
|
# XXXXXXX rval ::= one or more qvals (qval1 (=) qval2 (=) ...) |
557
|
|
|
|
|
|
|
# where s=sub, r=repeated, q=equivalent |
558
|
|
|
|
|
|
|
# XXXXXXX or ?? rval ::= one or more avals (aval1 (=) aval2 (=) ...) |
559
|
4
|
50
|
|
|
|
19
|
return $warning ? "warning: $warning" : ""; |
560
|
|
|
|
|
|
|
} |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# Create record hash, elem is key, value is value |
563
|
|
|
|
|
|
|
# |
564
|
2
|
|
|
2
|
0
|
2113
|
sub anvl_rechash { my( $record, $r_hash, $strict )=@_; |
565
|
|
|
|
|
|
|
|
566
|
2
|
50
|
|
|
|
8
|
! defined($record) and |
567
|
|
|
|
|
|
|
return "needs an ANVL record"; |
568
|
2
|
100
|
|
|
|
6
|
ref($r_hash) ne "HASH" and |
569
|
|
|
|
|
|
|
return "2nd arg must reference a hash"; |
570
|
|
|
|
|
|
|
|
571
|
1
|
|
|
|
|
3
|
my $msg = ""; |
572
|
1
|
|
|
|
|
1
|
my @elems; |
573
|
1
|
50
|
|
|
|
5
|
($msg = anvl_recsplit($record, \@elems, $strict)) and |
574
|
|
|
|
|
|
|
return "anvl_recsplit: $msg"; |
575
|
|
|
|
|
|
|
|
576
|
1
|
|
|
|
|
3
|
my ($name, $value); |
577
|
1
|
|
|
|
|
2
|
while (1) { |
578
|
2
|
|
|
|
|
4
|
$name = shift @elems; |
579
|
2
|
100
|
|
|
|
6
|
last unless defined $name; # nothing left |
580
|
1
|
|
|
|
|
2
|
$value = shift @elems; |
581
|
1
|
50
|
|
|
|
4
|
if (! defined $$r_hash{$name}) { |
582
|
|
|
|
|
|
|
# Nothing there, so store scalar and continue. |
583
|
1
|
|
|
|
|
3
|
$$r_hash{$name} = $value; # 1st value (non-array) |
584
|
1
|
|
|
|
|
2
|
next; |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
# If we get here there's something's already there. |
587
|
|
|
|
|
|
|
# Don't overwrite if we're in $strict mode. |
588
|
|
|
|
|
|
|
# xxx document this |
589
|
|
|
|
|
|
|
# |
590
|
0
|
0
|
|
|
|
0
|
$strict and next; # don't overwrite |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
# XXXXXxxxx make adding a value policy-driven, eg, |
593
|
|
|
|
|
|
|
# "add" could mean (a) replace, (b) push on end array, |
594
|
|
|
|
|
|
|
# (c) push on start of array, (d) string concatenation, |
595
|
|
|
|
|
|
|
# (d) error. |
596
|
|
|
|
|
|
|
# xxx should anvl_rechash save line numbers? |
597
|
|
|
|
|
|
|
# xxx should anvl_recsplit save line numbers? |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
# Whatever is there could be a scalar or an array reference. |
600
|
|
|
|
|
|
|
# If not a reference, create an anonymous array, put a |
601
|
|
|
|
|
|
|
# scalar into it, and refer to the array. |
602
|
|
|
|
|
|
|
# |
603
|
0
|
|
|
|
|
0
|
my $v = $$r_hash{$name}; # add to current |
604
|
0
|
0
|
|
|
|
0
|
$v = [ $v ] # make an array if currently |
605
|
|
|
|
|
|
|
unless ref $v; # there's only one value |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
# If we get here, we have a reference to an array, |
608
|
|
|
|
|
|
|
# possibly empty. Either way, we can push onto it. |
609
|
|
|
|
|
|
|
# |
610
|
0
|
|
|
|
|
0
|
push @$v, $value; |
611
|
|
|
|
|
|
|
} |
612
|
1
|
|
|
|
|
3
|
return $msg; |
613
|
|
|
|
|
|
|
} |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# [ !"#\$%&'\(\)\*\+,/:;<=>\?@\[\\\]\|\0] |
616
|
|
|
|
|
|
|
our %anvl_decoding = ( |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
'sp' => ' ', # decodes to space (0x20) |
619
|
|
|
|
|
|
|
'ex' => '!', # decodes to ! (0x21) |
620
|
|
|
|
|
|
|
'dq' => '"', # decodes to " (0x22) |
621
|
|
|
|
|
|
|
'ns' => '#', # decodes to # (0x23) |
622
|
|
|
|
|
|
|
'do' => '$', # decodes to $ (0x24) |
623
|
|
|
|
|
|
|
'pe' => '%', # decodes to % (0x25) |
624
|
|
|
|
|
|
|
'am' => '&', # decodes to & (0x26) |
625
|
|
|
|
|
|
|
'sq' => "'", # decodes to ' (0x27) |
626
|
|
|
|
|
|
|
'op' => '(', # decodes to ( (0x28) |
627
|
|
|
|
|
|
|
'cp' => ')', # decodes to ) (0x29) |
628
|
|
|
|
|
|
|
'as' => '*', # decodes to * (0x2a) |
629
|
|
|
|
|
|
|
'pl' => '+', # decodes to + (0x2b) |
630
|
|
|
|
|
|
|
'co' => ',', # decodes to , (0x2c) |
631
|
|
|
|
|
|
|
'sl' => '/', # decodes to / (0x2f) |
632
|
|
|
|
|
|
|
'cn' => ':', # decodes to : (0x3a) |
633
|
|
|
|
|
|
|
'sc' => ';', # decodes to ; (0x3b) |
634
|
|
|
|
|
|
|
'lt' => '<', # decodes to < (0x3c) |
635
|
|
|
|
|
|
|
'eq' => '=', # decodes to = (0x3d) |
636
|
|
|
|
|
|
|
'gt' => '>', # decodes to > (0x3e) |
637
|
|
|
|
|
|
|
'qu' => '?', # decodes to ? (0x3f) |
638
|
|
|
|
|
|
|
'at' => '@', # decodes to @ (0x40) |
639
|
|
|
|
|
|
|
'ox' => '[', # decodes to [ (0x5b) |
640
|
|
|
|
|
|
|
'ls' => '\\', # decodes to \ (0x5c) |
641
|
|
|
|
|
|
|
'cx' => ']', # decodes to ] (0x5d) |
642
|
|
|
|
|
|
|
'vb' => '|', # decodes to | (0x7c) |
643
|
|
|
|
|
|
|
'nu' => "\0", # decodes to null (0x00) |
644
|
|
|
|
|
|
|
); |
645
|
|
|
|
|
|
|
# XXXXXXX need way to encode newlines (using '\n' in interim) |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
our %anvl_encoding; |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
#%cn : |
650
|
|
|
|
|
|
|
#%sc ; |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
# xxxxx handle these separately |
653
|
|
|
|
|
|
|
# # XXXX remove %% from erc/anvlspec? |
654
|
|
|
|
|
|
|
# '%' => '%pe', # decodes to % (0x25) xxxx do this first? |
655
|
|
|
|
|
|
|
# '_' => '', # a non-character used as a syntax shim |
656
|
|
|
|
|
|
|
# '{' => '', # a non-character that begins an expansion block |
657
|
|
|
|
|
|
|
# '}' => '', # a non-character that ends an expansion block |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
# Takes a single arg. |
660
|
|
|
|
|
|
|
sub anvl_decode { |
661
|
|
|
|
|
|
|
|
662
|
5
|
|
50
|
5
|
0
|
18
|
local $_ = shift(@_) || ''; |
663
|
|
|
|
|
|
|
|
664
|
5
|
|
|
|
|
18
|
pos() = 0; # reset \G for $_ just to be safe |
665
|
5
|
|
|
|
|
29
|
while (/(?=\%\{)/g) { # lookahead; \G matches just before |
666
|
7
|
|
|
|
|
11
|
my $p = pos(); # note \G position before it changes |
667
|
7
|
100
|
|
|
|
41
|
s/\G \%\{ (.*?) \%\}//xs # 's' modifier makes . match \n |
668
|
|
|
|
|
|
|
or last; # if no closing brace, skip match |
669
|
6
|
|
|
|
|
14
|
my $exp_block = $1; # save removed expansion block |
670
|
6
|
|
|
|
|
33
|
$exp_block =~ s/\s+//g; # strip it of all whitespace |
671
|
6
|
|
|
|
|
11
|
pos() = $p; # revert \G to where we started and |
672
|
6
|
|
|
|
|
34
|
s/\G/$exp_block/; # re-insert changed expansion block |
673
|
|
|
|
|
|
|
} |
674
|
5
|
|
|
|
|
14
|
s/\%[}{]//g; # remove any remaining unmatched |
675
|
5
|
|
|
|
|
8
|
s/\%_//g; # xxx %_ -> '' |
676
|
5
|
|
|
|
|
11
|
s/\%\%/\%pe/g; # xxx ??? xxxx??? |
677
|
|
|
|
|
|
|
# decode %XY where XY together don't form a valid pair of hex digits |
678
|
5
|
|
|
|
|
17
|
s/\%([g-z][a-z]|[a-z][g-z])/$anvl_decoding{$1}/g; |
679
|
5
|
|
|
|
|
25
|
return $_; |
680
|
|
|
|
|
|
|
} |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
# xxx encoding should be context-sensitive, eg, name, value |
683
|
1
|
|
|
1
|
0
|
5515
|
sub anvl_encode { my( $s )=@_; |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
# XXXX just define this in the module?? |
686
|
1
|
50
|
|
|
|
7
|
unless (%anvl_encoding) { # one-time definition |
687
|
|
|
|
|
|
|
# This just defines an inverse mapping so we can encode. |
688
|
|
|
|
|
|
|
$anvl_encoding{$anvl_decoding{$_}} = $_ |
689
|
1
|
|
|
|
|
149
|
for (keys %anvl_decoding); |
690
|
|
|
|
|
|
|
} |
691
|
|
|
|
|
|
|
$s =~ |
692
|
1
|
|
|
|
|
14
|
s/([ !\"#\$\%&'\(\)\*\+,\/:;<=>\?@\[\\\]\|\0])/\%$anvl_encoding{$1}/g; |
693
|
1
|
|
|
|
|
10
|
return $s; |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# return $name in natural word order, using ANVL inversion points |
697
|
|
|
|
|
|
|
# repeat for each final comma present |
698
|
6
|
|
|
6
|
0
|
21
|
sub anvl_name_naturalize { my( $name )=@_; |
699
|
|
|
|
|
|
|
|
700
|
6
|
|
50
|
|
|
14
|
$name ||= ''; |
701
|
6
|
50
|
|
|
|
38
|
$name =~ /^\s*$/ and return $name; # empty |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
# "McCartney, Paul, Sir,," |
704
|
|
|
|
|
|
|
# a, b, c, d, e,,, -> e d c a, b |
705
|
6
|
|
|
|
|
16
|
my $prefix = ''; |
706
|
6
|
|
|
|
|
42
|
while ($name =~ s/,\s*$//) { |
707
|
9
|
100
|
|
|
|
106
|
$name =~ s/^(.*),\s*([^,]+)(,*$)/$1$3/ and |
708
|
|
|
|
|
|
|
$prefix .= $2 . ' '; |
709
|
|
|
|
|
|
|
} |
710
|
6
|
|
|
|
|
43
|
return $prefix . $name; |
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
|
713
|
0
|
|
|
0
|
0
|
0
|
sub anvl_summarize { my( @nodes )=@_; } |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
# XXXXX doesn't this really belong in an ERC.pm module? |
716
|
|
|
|
|
|
|
# |
717
|
|
|
|
|
|
|
# ordered list of kernel element names |
718
|
|
|
|
|
|
|
our @kernel_labels = qw( |
719
|
|
|
|
|
|
|
who |
720
|
|
|
|
|
|
|
what |
721
|
|
|
|
|
|
|
when |
722
|
|
|
|
|
|
|
where |
723
|
|
|
|
|
|
|
how |
724
|
|
|
|
|
|
|
why |
725
|
|
|
|
|
|
|
huh |
726
|
|
|
|
|
|
|
); |
727
|
|
|
|
|
|
|
# |
728
|
|
|
|
|
|
|
# This routine inspects and possibly modifies in place the kind of element |
729
|
|
|
|
|
|
|
# array resulting from a call to anvl_recarray(), which splits and ANVL |
730
|
|
|
|
|
|
|
# record. It is useful for transforming short form ERC elements into full |
731
|
|
|
|
|
|
|
# form elements, for example, to expand "erc:a|b|c|d" into the equivalent, |
732
|
|
|
|
|
|
|
# "erc:\nwho:a\nwhat:b\nwhen:c\nwhere:d". |
733
|
|
|
|
|
|
|
# It returns the empty string on success, otherwise an error message. |
734
|
|
|
|
|
|
|
# |
735
|
4
|
|
|
4
|
0
|
1636
|
sub erc_anvl_expand_array { my( $r_elems )=@_; |
736
|
|
|
|
|
|
|
|
737
|
4
|
|
|
4
|
|
60
|
use File::ANVL; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
5380
|
|
738
|
4
|
|
|
|
|
4
|
my ($lineno, $name, $value, $msg, @svals, $sval); |
739
|
4
|
|
|
|
|
5
|
my $me = 'erc_anvl_expand_array'; |
740
|
4
|
|
|
|
|
5
|
my $i = 3; # skip first 3 elems (anvl array preamble) |
741
|
4
|
|
|
|
|
7
|
while (1) { |
742
|
21
|
|
|
|
|
23
|
$lineno = $$r_elems[$i++]; |
743
|
21
|
|
100
|
|
|
45
|
$name = $$r_elems[$i++] || ''; |
744
|
21
|
|
100
|
|
|
41
|
$value = $$r_elems[$i++] || ''; |
745
|
21
|
100
|
|
|
|
32
|
last unless defined $lineno; # end of record |
746
|
|
|
|
|
|
|
next # skip unless we have erc-type thing |
747
|
17
|
100
|
100
|
|
|
53
|
if ($name ne 'erc' || $value =~ /^\s*$/); |
748
|
|
|
|
|
|
|
#if ($name !~ /^erc\b/ || $value =~ /^\s*$/); |
749
|
|
|
|
|
|
|
# xxx should do this for full generality |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
# If here, we have an erc-type thing with a non-empty value. |
752
|
|
|
|
|
|
|
# |
753
|
2
|
50
|
|
|
|
8
|
($msg = anvl_valsplit($value, \@svals)) and |
754
|
|
|
|
|
|
|
return "error: $me: anvl_valsplit: $msg"; |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
# XXXX only doing straight "erc" (eg, not erc-about) |
757
|
2
|
|
|
|
|
3
|
my $j = 0; |
758
|
2
|
|
|
|
|
4
|
my @extras = (); |
759
|
|
|
|
|
|
|
# If we exceed known labels, we'll re-use last known label. |
760
|
2
|
|
|
|
|
4
|
my $unknown = $kernel_labels[$#kernel_labels]; |
761
|
2
|
|
|
|
|
3
|
foreach $sval (@svals) { |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
# xxx not (yet) tranferring subvalue structure |
764
|
|
|
|
|
|
|
# to anvl_om or other conversion |
765
|
|
|
|
|
|
|
# Recall that each $sval is itself a reference to |
766
|
|
|
|
|
|
|
# an array of subvalues (often just one element). |
767
|
|
|
|
|
|
|
# |
768
|
8
|
|
33
|
|
|
65
|
push @extras, # trust kernel_labels order |
769
|
|
|
|
|
|
|
$lineno, |
770
|
|
|
|
|
|
|
$kernel_labels[$j++] || $unknown, |
771
|
|
|
|
|
|
|
join('; ', # trim ends of subvalues |
772
|
|
|
|
|
|
|
map(m/^\s*(.*?)\s*$/, @$sval) |
773
|
|
|
|
|
|
|
); |
774
|
|
|
|
|
|
|
} |
775
|
|
|
|
|
|
|
# Finally, replace our $value element with '' and append |
776
|
|
|
|
|
|
|
# the new extra values we've just expanded. |
777
|
2
|
|
|
|
|
15
|
splice @$r_elems, $i-1, 1, |
778
|
|
|
|
|
|
|
'', # replaces $value we just used up |
779
|
|
|
|
|
|
|
@extras; # adds new elements from $value |
780
|
|
|
|
|
|
|
} |
781
|
4
|
|
|
|
|
12
|
return ''; # success |
782
|
|
|
|
|
|
|
} |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX============= |
785
|
|
|
|
|
|
|
# xxx checkm _in_ obj1 obj2 ... --> returns noids |
786
|
|
|
|
|
|
|
# xxx checkm _out_ id1 id2 ... --> returns objects |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX============= |
789
|
|
|
|
|
|
|
# xxx do metadata scan of object before ingest and confirm with user that |
790
|
|
|
|
|
|
|
# the object is correctly identified. This could even be done remotely. |
791
|
|
|
|
|
|
|
# Start with informal staff service for depositing objects, returning a |
792
|
|
|
|
|
|
|
# short url to a stable object, and not clogging up allstaff inboxes with |
793
|
|
|
|
|
|
|
# huge attachments. Also applies to any number of draft docs for review |
794
|
|
|
|
|
|
|
# but in temporary storage (but stable). |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX============= |
797
|
|
|
|
|
|
|
# xxx do id generator service with 'expiring' ids. To mint, you tell us |
798
|
|
|
|
|
|
|
# who you are first. To get a perm. id, you agree to use your minted id |
799
|
|
|
|
|
|
|
# and bind it within N months. We track, and warn you several times |
800
|
|
|
|
|
|
|
# until N months as elapsed and then reclaim/recycle the id. |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
############################################ |
803
|
|
|
|
|
|
|
# Output Multiplexer routines |
804
|
|
|
|
|
|
|
############################################ |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
# #$erc = "erc: Smith, J.|The Whole Truth|2004|http://example.com/foo/bar"; |
807
|
|
|
|
|
|
|
# $errmsg = File::ERC::erc_anvl2erc_turtle ($erc, $rec); |
808
|
|
|
|
|
|
|
# $errmsg and |
809
|
|
|
|
|
|
|
# print("$errmsg\n") |
810
|
|
|
|
|
|
|
# or |
811
|
|
|
|
|
|
|
# print("turtle record:\n$rec\n") |
812
|
|
|
|
|
|
|
# ; |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
# xxx anvl_fmt not consistent with om_anvl! |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
# Input file(s) from ARGV. |
817
|
|
|
|
|
|
|
|
818
|
0
|
|
|
0
|
0
|
|
sub anvl_om { my( $om, $o, $get_anvl ) = (shift, shift, shift); |
819
|
|
|
|
|
|
|
|
820
|
0
|
0
|
|
|
|
|
return "anvl_om: 1st arg not an OM object" |
821
|
|
|
|
|
|
|
if ref($om) !~ /^File::OM::/; |
822
|
0
|
|
|
|
|
|
my $p = $om->{outhandle}; # whether 'print' status or small |
823
|
0
|
|
0
|
|
|
|
$o ||= anvl_opt_defaults(); |
824
|
0
|
|
0
|
|
|
|
$get_anvl ||= File::ANVL::make_get_anvl(); # xxx set input here? |
825
|
|
|
|
|
|
|
# XXX test return value! |
826
|
|
|
|
|
|
|
|
827
|
0
|
|
|
|
|
|
my $s = ''; # output strings are returned to $s |
828
|
0
|
0
|
|
|
|
|
my $st = $p ? 1 : ''; # returns (stati or strings) accumulate |
829
|
0
|
|
|
|
|
|
my ($msg, $allmsgs, $anvlrec, $lineno, $name, $value, $pat, $n, $nmax); |
830
|
0
|
|
|
|
|
|
my (%rechash, $ne, $nemax, $elem_name); # for alt. element ordering |
831
|
0
|
|
|
|
|
|
my $r_elem_order = $$o{elem_order}; |
832
|
|
|
|
|
|
|
|
833
|
0
|
|
|
|
|
|
$s = $om->ostream(); # open stream |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
# This next line is a fast and compact (if cryptic) way to |
836
|
|
|
|
|
|
|
# accumulate $om->method calls. Used after each method call, it |
837
|
|
|
|
|
|
|
# concatenates strings or ANDs up print statuses, depending on the |
838
|
|
|
|
|
|
|
# outhandle setting. It makes several appearances in this routine. |
839
|
|
|
|
|
|
|
# |
840
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); # accumulate method returns |
|
|
|
0
|
|
|
|
|
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# Numbers: record, element in record, and start line |
843
|
|
|
|
|
|
|
# |
844
|
0
|
|
|
|
|
|
my ($startline, $recnum, $elemnum) = (1, 0, 0); |
845
|
0
|
|
|
|
|
|
my ($wslines, $rrlines); |
846
|
0
|
|
|
|
|
|
my $r_elems = $om->{elemsref}; # abbreviation |
847
|
|
|
|
|
|
|
# xxx is that reference kosher? |
848
|
|
|
|
|
|
|
|
849
|
0
|
|
|
|
|
|
while (1) { |
850
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
# Get an ANVL record and count lines therein. ANVL |
852
|
|
|
|
|
|
|
# records can come from anywhere, but typically from |
853
|
|
|
|
|
|
|
# a file (read in "paragraph" mode) or a BDB database. |
854
|
|
|
|
|
|
|
# |
855
|
0
|
|
|
|
|
|
$anvlrec = trimlines(&$get_anvl(), \$wslines, \$rrlines); |
856
|
0
|
|
|
|
|
|
$startline += $wslines; |
857
|
0
|
0
|
|
|
|
|
last unless $anvlrec; |
858
|
|
|
|
|
|
|
|
859
|
0
|
|
|
|
|
|
$recnum++; # increment record counter |
860
|
|
|
|
|
|
|
=for later |
861
|
|
|
|
|
|
|
# XXX anvl_recarray is expensive, do we _need_ to do it if the output is |
862
|
|
|
|
|
|
|
# also in anvl? Maybe call modified [2] here so "find" can work? |
863
|
|
|
|
|
|
|
if (ref($om) eq 'File::OM::ANVL' and ! $r_elem_order) { |
864
|
|
|
|
|
|
|
# xxx do quick expand (short->long erc) here? |
865
|
|
|
|
|
|
|
# xxx _will_ disturb input line numbering |
866
|
|
|
|
|
|
|
$$o{find} and ($anvlrec !~ /$$o{find}/m) and |
867
|
|
|
|
|
|
|
next; # no output has occurred |
868
|
|
|
|
|
|
|
# xxx do quick check for 'show' and next |
869
|
|
|
|
|
|
|
# XXXXXXXX must define lineno for verbose case |
870
|
|
|
|
|
|
|
$s = $om->anvl_rec($anvlrec, $startline, $rrlines); |
871
|
|
|
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
872
|
|
|
|
|
|
|
next; |
873
|
|
|
|
|
|
|
} |
874
|
|
|
|
|
|
|
=cut |
875
|
0
|
|
|
|
|
|
$msg = anvl_recarray($anvlrec, $r_elems, $startline, $o); |
876
|
0
|
0
|
|
|
|
|
$msg =~ /^error/ and return "anvl_recarray: $msg"; |
877
|
0
|
0
|
|
|
|
|
$msg eq "" or |
878
|
|
|
|
|
|
|
#print $msg, "\n"; |
879
|
|
|
|
|
|
|
#$o->{verbose} && print $msg, "\n"; |
880
|
|
|
|
|
|
|
$allmsgs .= $msg . "\n"; # save other message |
881
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
# NB: apply 'find' here before possible expansion, which |
883
|
|
|
|
|
|
|
# means that a pattern like "who:\s*smith" won't work on |
884
|
|
|
|
|
|
|
# on a short form ANVL record. |
885
|
|
|
|
|
|
|
# |
886
|
0
|
0
|
0
|
|
|
|
$$o{find} and ($anvlrec !~ /$$o{find}/m) and |
887
|
|
|
|
|
|
|
next; # no output has occurred |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
# If caller has set $$o{elemsproc} to a code reference, |
890
|
|
|
|
|
|
|
# it is called to process the element array just returned |
891
|
|
|
|
|
|
|
# from anvl_recarray. Typically this is used to convert |
892
|
|
|
|
|
|
|
# (with erc_anvl_expand_array) short form ERCs to long |
893
|
|
|
|
|
|
|
# form ERCs. As an optimization, the code is not called |
894
|
|
|
|
|
|
|
# if $$o{elemsprocpat} (typically, "erc") is set and |
895
|
|
|
|
|
|
|
# doesn't match the raw record string. |
896
|
|
|
|
|
|
|
# |
897
|
0
|
0
|
0
|
|
|
|
if (ref($$o{elemsproc}) eq "CODE" and # if code and either |
|
|
|
0
|
|
|
|
|
898
|
|
|
|
|
|
|
(! ($pat = $$o{elemsprocpat})) # no pattern or |
899
|
|
|
|
|
|
|
|| $anvlrec =~ $pat) { # the pattern matches |
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
# [2] XXX can we call elemsproc directly on the $anvlrec? so we don't need |
902
|
|
|
|
|
|
|
# to call expensive anvl_recarray first? |
903
|
0
|
0
|
|
|
|
|
($msg = &{$$o{elemsproc}}($r_elems)) and |
|
0
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
return "File::ANVL::elemsproc: $msg"; |
905
|
|
|
|
|
|
|
} |
906
|
0
|
0
|
|
|
|
|
ref($om) eq 'File::OM::Turtle' and |
907
|
|
|
|
|
|
|
turtle_set_subject($om, $anvlrec); |
908
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
# The orec method is given first crack at a new record. |
910
|
|
|
|
|
|
|
# It sets and/or clears a number of values for keys (eg, |
911
|
|
|
|
|
|
|
# for turtle, $$o{subject}). $recnum is useful for |
912
|
|
|
|
|
|
|
# outputting json separators (eg, no comma if $recnum eq 1) |
913
|
|
|
|
|
|
|
# or record numbers in comments (eg, if $$o{verbose}). |
914
|
|
|
|
|
|
|
# $startline is useful for parser diagnostics (eg, "error |
915
|
|
|
|
|
|
|
# on line 5"). $r_elems and $r_elem_order are needed for |
916
|
|
|
|
|
|
|
# discovering what elements will populate CSV/PSV records. |
917
|
|
|
|
|
|
|
# |
918
|
|
|
|
|
|
|
# XXX this next isn't needed if output is anvl ?! (assuming final NL is |
919
|
|
|
|
|
|
|
# written when closing the record |
920
|
0
|
|
|
|
|
|
$s = $om->orec($recnum, $startline, $r_elems, $r_elem_order); |
921
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
0
|
|
|
|
|
922
|
|
|
|
|
|
|
|
923
|
0
|
0
|
|
|
|
|
if ($r_elem_order) { |
924
|
0
|
|
|
|
|
|
undef %rechash; # don't want prior indices |
925
|
0
|
0
|
|
|
|
|
($msg = anvl_arrayhash($r_elems, \%rechash)) and |
926
|
|
|
|
|
|
|
return "anvl_arrayhash: $msg"; |
927
|
0
|
|
|
|
|
|
$ne = -1; # index into $$r_elem_order |
928
|
0
|
|
|
|
|
|
$nemax = scalar @$r_elem_order; |
929
|
|
|
|
|
|
|
} else { |
930
|
0
|
0
|
|
|
|
|
$n = # index into $$r_elems |
931
|
|
|
|
|
|
|
# XXX don't reference r_elems if we haven't called anvl_recarray |
932
|
|
|
|
|
|
|
$$r_elems[2] # if a no-label value starts |
933
|
|
|
|
|
|
|
? -3 # rec, make sure to output it, |
934
|
|
|
|
|
|
|
: 0; # else skip it (normal) |
935
|
0
|
|
|
|
|
|
$nmax = scalar @$r_elems; |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
# XXX if output is to anvl, can we not skip the entire loop below? but |
939
|
|
|
|
|
|
|
# not if it's possible to output anvl _and_ to care about element order |
940
|
|
|
|
|
|
|
# XXX but still perform $show check and skip not-shown elems |
941
|
|
|
|
|
|
|
# XXX and still perform value inversion if {invert} options |
942
|
0
|
|
|
|
|
|
$elemnum = 0; # true elements, not comments |
943
|
0
|
|
|
|
|
|
undef $name; |
944
|
0
|
|
|
|
|
|
while (1) { |
945
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
# Select next candidate element. If we need to |
947
|
|
|
|
|
|
|
# output elements in a certain order, consult the |
948
|
|
|
|
|
|
|
# hash; otherwise, just use "found" order. |
949
|
|
|
|
|
|
|
# |
950
|
0
|
0
|
|
|
|
|
if ($r_elem_order) { # use specified order |
951
|
|
|
|
|
|
|
|
952
|
0
|
|
|
|
|
|
$ne++; |
953
|
0
|
0
|
|
|
|
|
$ne >= $nemax and last; |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
# For CSV and PSV, the element name at this |
956
|
|
|
|
|
|
|
# position may be deliberately undefined, or |
957
|
|
|
|
|
|
|
# may correspond to a named element missing |
958
|
|
|
|
|
|
|
# in this record, in which case we skip it. |
959
|
|
|
|
|
|
|
# |
960
|
0
|
|
|
|
|
|
$elem_name = $r_elem_order->[$ne]; |
961
|
0
|
0
|
0
|
|
|
|
! defined($elem_name) || ! defined( |
962
|
|
|
|
|
|
|
#XXX ignore multiple instances for now |
963
|
|
|
|
|
|
|
$n = $rechash{$elem_name}->[0] |
964
|
|
|
|
|
|
|
) and |
965
|
|
|
|
|
|
|
# for CSV/PSV, output an empty element |
966
|
|
|
|
|
|
|
next; |
967
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
} else { # use natural array order |
969
|
0
|
|
|
|
|
|
$n += 3; |
970
|
0
|
0
|
|
|
|
|
$n >= $nmax and last; |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
# If we get here, $n is defined. |
973
|
|
|
|
|
|
|
|
974
|
0
|
|
|
|
|
|
$lineno = $$r_elems[$n]; |
975
|
0
|
0
|
|
|
|
|
$name = $n < 3 # for special first triple |
976
|
|
|
|
|
|
|
? '_' # use synthesized name '_' |
977
|
|
|
|
|
|
|
: $$r_elems[$n + 1]; # else real name |
978
|
0
|
|
0
|
|
|
|
$value = $$r_elems[$n + 2] || ""; |
979
|
|
|
|
|
|
|
|
980
|
0
|
0
|
|
|
|
|
$elemnum++ unless $name eq '#'; |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
# Skip if 'show' given and not requested. |
983
|
0
|
0
|
0
|
|
|
|
$$o{show} and ("$name: $value" !~ /$$o{show}/m) and |
984
|
|
|
|
|
|
|
(undef $name), # cause elem to be skipped |
985
|
|
|
|
|
|
|
next; |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
# Instead of $om->oelem, $om->celem, $om->contelem, |
988
|
|
|
|
|
|
|
# combine open and close into one, but first |
989
|
|
|
|
|
|
|
# naturalize values if called upon. |
990
|
|
|
|
|
|
|
# |
991
|
0
|
0
|
0
|
|
|
|
$$o{invert} and $value =~ /,\s*$/ and |
992
|
|
|
|
|
|
|
$value = anvl_name_naturalize($value); |
993
|
|
|
|
|
|
|
} |
994
|
|
|
|
|
|
|
continue { |
995
|
0
|
|
|
|
|
|
$s = $om->elem($name, $value, $lineno); |
996
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
0
|
|
|
|
|
997
|
0
|
|
|
|
|
|
undef $name; # clean the slate |
998
|
|
|
|
|
|
|
} |
999
|
0
|
|
|
|
|
|
$s = $om->crec($recnum); |
1000
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
0
|
|
|
|
|
1001
|
|
|
|
|
|
|
} |
1002
|
|
|
|
|
|
|
continue { |
1003
|
0
|
|
|
|
|
|
$startline += $rrlines; |
1004
|
|
|
|
|
|
|
} |
1005
|
|
|
|
|
|
|
# XXX currently doing nothing with $allmsgs warnings! |
1006
|
|
|
|
|
|
|
# should probably print if verbose mode on |
1007
|
0
|
|
|
|
|
|
$s = $om->cstream(); |
1008
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
0
|
|
|
|
|
1009
|
|
|
|
|
|
|
|
1010
|
0
|
|
|
|
|
|
return $st; |
1011
|
|
|
|
|
|
|
} |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
# xxx document all om options |
1014
|
|
|
|
|
|
|
# xxx should om also have a recstring slot (for anvlrec)? |
1015
|
|
|
|
|
|
|
# xxx pass in turtle_nosubject (default)? |
1016
|
|
|
|
|
|
|
sub turtle_set_subject { |
1017
|
|
|
|
|
|
|
|
1018
|
0
|
|
|
0
|
0
|
|
my ($om, $anvlrec) = (shift, shift); |
1019
|
0
|
|
|
|
|
|
my $r_elems = $om->{elemsref}; |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
# In order to find the subject element for Turtle/RDF |
1022
|
|
|
|
|
|
|
# assertions, we need an element name pattern. If one is |
1023
|
|
|
|
|
|
|
# defined in $om->{turtle_subjelpat}, use it. If it's undefined, |
1024
|
|
|
|
|
|
|
# per-record code will use 'where' if it thinks the record |
1025
|
|
|
|
|
|
|
# is an ERC, or use 'identifier|subject' as a last resort. |
1026
|
|
|
|
|
|
|
# If no element matching subjelpat is found, $om->{subject} |
1027
|
|
|
|
|
|
|
# will default to $om->{turtle_nosubject}. |
1028
|
|
|
|
|
|
|
# |
1029
|
0
|
|
0
|
|
|
|
my $subjpat = $om->{turtle_subjelpat} || |
1030
|
|
|
|
|
|
|
($anvlrec =~ /^erc\s*:/m |
1031
|
|
|
|
|
|
|
? "^where\$" : # 1st where in an 'erc', or |
1032
|
|
|
|
|
|
|
($anvlrec =~ /^(identifier|subject)\s*:/m |
1033
|
|
|
|
|
|
|
? "^$1\$" : # 1st identifier or subject, |
1034
|
|
|
|
|
|
|
($anvlrec =~ /^(.+)\s*:\s*(\n\s+)*\w/ |
1035
|
|
|
|
|
|
|
? "^$1\$" : # or 1st non-empty element |
1036
|
|
|
|
|
|
|
''))); # or nothing (always matches) |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
# Now find a 'subject' for our Turtle/RDF assertions. |
1039
|
|
|
|
|
|
|
# |
1040
|
0
|
|
|
|
|
|
my $j = 1; # element names in positions 1, 4, 7, ... |
1041
|
0
|
|
0
|
|
|
|
1 while ($j < $#$r_elems and # quickly find it |
|
|
|
0
|
|
|
|
|
1042
|
|
|
|
|
|
|
@$r_elems[$j] !~ $subjpat and ($j += 3)); |
1043
|
0
|
0
|
0
|
|
|
|
$om->{subject} = $j < $#$r_elems && $subjpat ? # if found, |
1044
|
|
|
|
|
|
|
@$r_elems[$j + 1] : # use associated value |
1045
|
|
|
|
|
|
|
$om->{turtle_nosubject}; # else use default |
1046
|
0
|
|
|
|
|
|
return $om->{subject}; |
1047
|
|
|
|
|
|
|
} |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
1; |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
__END__ |