| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package File::ANVL; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# XXXXXxxxx make adding a value policy-driven, eg, |
|
4
|
|
|
|
|
|
|
# "add" could mean (a) replace, (b) push on end array, |
|
5
|
|
|
|
|
|
|
# (c) push on start of array, (d) string concatenation, |
|
6
|
|
|
|
|
|
|
# (d) error. |
|
7
|
|
|
|
|
|
|
|
|
8
|
4
|
|
|
4
|
|
139546
|
use 5.006; |
|
|
4
|
|
|
|
|
13
|
|
|
|
4
|
|
|
|
|
187
|
|
|
9
|
4
|
|
|
4
|
|
24
|
use strict; |
|
|
4
|
|
|
|
|
6
|
|
|
|
4
|
|
|
|
|
144
|
|
|
10
|
4
|
|
|
4
|
|
20
|
use warnings; |
|
|
4
|
|
|
|
|
14
|
|
|
|
4
|
|
|
|
|
152
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
4
|
|
|
4
|
|
18
|
use constant NL => "\n"; |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
345
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# ANVL flavors |
|
15
|
|
|
|
|
|
|
# |
|
16
|
4
|
|
|
4
|
|
18
|
use constant ANVL => 1; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
159
|
|
|
17
|
4
|
|
|
4
|
|
20
|
use constant ANVLR => 2; |
|
|
4
|
|
|
|
|
6
|
|
|
|
4
|
|
|
|
|
160
|
|
|
18
|
4
|
|
|
4
|
|
18
|
use constant ANVLS => 3; |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
14216
|
|
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our $VERSION; |
|
21
|
|
|
|
|
|
|
$VERSION = sprintf "%d.%02d", q$Name: Release-1-05 $ =~ /Release-(\d+)-(\d+)/; |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
require Exporter; |
|
24
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our @EXPORT = qw(); |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
our @EXPORT_OK = qw( |
|
29
|
|
|
|
|
|
|
anvl_recarray anvl_arrayhash |
|
30
|
|
|
|
|
|
|
anvl_name_naturalize |
|
31
|
|
|
|
|
|
|
anvl_rechash anvl_valsplit |
|
32
|
|
|
|
|
|
|
erc_anvl_expand_array kernel_labels |
|
33
|
|
|
|
|
|
|
xgetlines trimlines |
|
34
|
|
|
|
|
|
|
make_get_anvl |
|
35
|
|
|
|
|
|
|
anvl_opt_defaults anvl_decode anvl_om |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
anvl_encode anvl_recsplit |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
ANVL ANVLR ANVLS ANVLSH |
|
40
|
|
|
|
|
|
|
); |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
our %EXPORT_TAGS = (all => [ @EXPORT_OK ]); |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# All these symbols must be listed also in EXPORT_OK (?) |
|
45
|
|
|
|
|
|
|
# |
|
46
|
|
|
|
|
|
|
our @EXPORT_FAIL = qw( |
|
47
|
|
|
|
|
|
|
ANVL ANVLR ANVLS ANVLSH |
|
48
|
|
|
|
|
|
|
); |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
our $anvl_mode = 'ANVL'; # default mode |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# This is a magic routine that the Exporter calls for any unknown symbols. |
|
53
|
|
|
|
|
|
|
# |
|
54
|
3
|
|
|
3
|
0
|
490
|
sub export_fail { my( $class, @symbols )=@_; |
|
55
|
|
|
|
|
|
|
|
|
56
|
3
|
|
|
|
|
14
|
$anvl_mode = $_ for (@symbols); |
|
57
|
3
|
|
|
|
|
2767
|
return (); |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Initialize or re-initialize options to factory defaults. |
|
61
|
|
|
|
|
|
|
# |
|
62
|
|
|
|
|
|
|
sub anvl_opt_defaults { return { |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Input options |
|
65
|
|
|
|
|
|
|
# |
|
66
|
7
|
|
|
7
|
0
|
61
|
autoindent => 1, # yes, fix recoverably bad indention |
|
67
|
|
|
|
|
|
|
comments => 0, # no, don't parse comments |
|
68
|
|
|
|
|
|
|
elemsproc => # to expand short form ERCs (if any) |
|
69
|
|
|
|
|
|
|
\&File::ANVL::erc_anvl_expand_array, |
|
70
|
|
|
|
|
|
|
elemsprocpat => # no call from anvl_om if no match |
|
71
|
|
|
|
|
|
|
qr/^erc:/m, # in rec; no call if set and matches |
|
72
|
|
|
|
|
|
|
}; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# xxx decide on good name for short form and long form ERC |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Returns a closure that calls an input reader with that's set to *ARGV |
|
78
|
|
|
|
|
|
|
# by default. If $reader and $readee are defined, they are stored in the |
|
79
|
|
|
|
|
|
|
# closure and all reads will be performed by calling &$reader($readee). |
|
80
|
|
|
|
|
|
|
# |
|
81
|
|
|
|
|
|
|
# The default reader collects text lines from a file and returns all the |
|
82
|
|
|
|
|
|
|
# lines associated with the next "record", which is considered to start |
|
83
|
|
|
|
|
|
|
# wherever the read pointer happens to be and continues to the first two |
|
84
|
|
|
|
|
|
|
# blank lines encountered that occur after "substance" is detected. |
|
85
|
|
|
|
|
|
|
# Substance is defined to be at least one non-whitespace character |
|
86
|
|
|
|
|
|
|
# occurring on a non-comment line. Comment and blank lines that precede |
|
87
|
|
|
|
|
|
|
# a record with substance are returned, but any such lines that follow |
|
88
|
|
|
|
|
|
|
# that the final record are discarded. |
|
89
|
|
|
|
|
|
|
# |
|
90
|
1
|
|
|
1
|
0
|
389
|
sub make_get_anvl { my( $reader, $readee ) = shift; |
|
91
|
|
|
|
|
|
|
|
|
92
|
1
|
50
|
|
|
|
6
|
unless ($reader) { |
|
93
|
|
|
|
|
|
|
|
|
94
|
1
|
|
|
|
|
3
|
my $rec; # returned record |
|
95
|
|
|
|
|
|
|
my $s; # next increment of input |
|
96
|
0
|
|
|
|
|
0
|
my $substance; # boolean detecting substance |
|
97
|
|
|
|
|
|
|
|
|
98
|
4
|
|
|
4
|
|
1393
|
return sub { my( $filehandle ) = shift; |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Returns a subroutine, call it get_anvl() |
|
101
|
|
|
|
|
|
|
# |
|
102
|
|
|
|
|
|
|
# Usage: $record = get_anvl( [$filehandle] ); |
|
103
|
|
|
|
|
|
|
# |
|
104
|
|
|
|
|
|
|
# It reads ANVL input records as text lines from the |
|
105
|
|
|
|
|
|
|
# file given by $filehandle (*ARGV by default, which can |
|
106
|
|
|
|
|
|
|
# process multiple files via while loop magic). Usually, |
|
107
|
|
|
|
|
|
|
# the closure holds enough state information, set up by |
|
108
|
|
|
|
|
|
|
# make_get_anvl(), that get_anvl() can be called without |
|
109
|
|
|
|
|
|
|
# arguments. get_anvl() returns the record read as a |
|
110
|
|
|
|
|
|
|
# string, or returns undef on end of input or error. |
|
111
|
|
|
|
|
|
|
# |
|
112
|
4
|
|
33
|
|
|
18
|
$filehandle ||= *ARGV; |
|
113
|
4
|
|
|
|
|
22
|
local $/ = NL.NL; # a kind of "paragraph" input mode |
|
114
|
|
|
|
|
|
|
# $/ === $INPUT_RECORD_SEPARATOR |
|
115
|
4
|
|
|
|
|
6
|
$rec = ''; |
|
116
|
4
|
|
66
|
|
|
118
|
1 while ( |
|
|
|
|
100
|
|
|
|
|
|
117
|
|
|
|
|
|
|
defined($s = <$filehandle>) and # read to eof and |
|
118
|
|
|
|
|
|
|
($rec .= $s), # save everything, but stop |
|
119
|
|
|
|
|
|
|
$substance = # when we detect substance, ie, |
|
120
|
|
|
|
|
|
|
$s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m, |
|
121
|
|
|
|
|
|
|
! $substance # non-comment with non-space |
|
122
|
|
|
|
|
|
|
); |
|
123
|
4
|
100
|
|
|
|
24
|
return $substance ? |
|
124
|
|
|
|
|
|
|
$rec : # return either collected record or undef |
|
125
|
|
|
|
|
|
|
undef; # any final blank or comment lines are tossed |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# yyy If more than one file, line numbers normally accumulate |
|
128
|
|
|
|
|
|
|
# across files. Should we preserve line numbers within each |
|
129
|
|
|
|
|
|
|
# files? (If so, use "close ARGV" (Perl idiom) to cause $. |
|
130
|
|
|
|
|
|
|
# (linenum) to be reset between files. |
|
131
|
1
|
|
|
|
|
9
|
}; |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# If we get here, $reader should reference an input method and |
|
135
|
|
|
|
|
|
|
# $readee is assumed to be any value (eg, BDB handle) that may |
|
136
|
|
|
|
|
|
|
# permit &reader to get the next record. Any other arguments |
|
137
|
|
|
|
|
|
|
# passed to the get_anvl() function below will be passed along |
|
138
|
|
|
|
|
|
|
# too, ie, $reader($readee, @_). |
|
139
|
|
|
|
|
|
|
# |
|
140
|
0
|
0
|
|
|
|
0
|
ref($reader) eq "CODE" or return undef; |
|
141
|
|
|
|
|
|
|
|
|
142
|
0
|
|
|
|
|
0
|
my $rec; # returned record |
|
143
|
|
|
|
|
|
|
my $s; # next increment of input |
|
144
|
0
|
|
|
|
|
0
|
my $substance; # boolean detecting substance |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
return sub { |
|
147
|
0
|
|
|
0
|
|
0
|
$rec = ''; |
|
148
|
0
|
|
0
|
|
|
0
|
1 while ( |
|
|
|
|
0
|
|
|
|
|
|
149
|
|
|
|
|
|
|
# XXX should this accumulate in general??? or |
|
150
|
|
|
|
|
|
|
# should we leave it to the definer of $reader? |
|
151
|
|
|
|
|
|
|
defined($s = &reader($readee, @_)) and # read and |
|
152
|
|
|
|
|
|
|
($rec .= $s), # save everything, but stop |
|
153
|
|
|
|
|
|
|
$substance = # when we detect substance, ie, |
|
154
|
|
|
|
|
|
|
$s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m, |
|
155
|
|
|
|
|
|
|
! $substance # non-comment with non-space |
|
156
|
|
|
|
|
|
|
); |
|
157
|
0
|
0
|
|
|
|
0
|
return $substance ? |
|
158
|
|
|
|
|
|
|
$rec : # return either collected record or undef |
|
159
|
|
|
|
|
|
|
undef; # any final blank or comment lines are tossed |
|
160
|
0
|
|
|
|
|
0
|
}; |
|
161
|
|
|
|
|
|
|
} |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# XXX deprecated! see sub make_get_anvl |
|
164
|
0
|
|
|
0
|
0
|
0
|
sub xgetlines { my( $filehandle )=@_; |
|
165
|
|
|
|
|
|
|
|
|
166
|
0
|
|
|
|
|
0
|
my $rec = ''; # returned record |
|
167
|
0
|
|
|
|
|
0
|
my $s; # next increment of input |
|
168
|
0
|
|
|
|
|
0
|
local $/ = NL.NL; # a kind of "paragraph" input mode |
|
169
|
|
|
|
|
|
|
# $/ === $INPUT_RECORD_SEPARATOR |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# If $filehandle is specified, use the Perl <$filehandle> idiom to |
|
172
|
|
|
|
|
|
|
# return next unit of input (normally a line, but here a para). |
|
173
|
|
|
|
|
|
|
# |
|
174
|
0
|
|
0
|
|
|
0
|
$filehandle ||= *ARGV; |
|
175
|
0
|
|
0
|
|
|
0
|
1 while ( defined( $s = <$filehandle> ) and # read up to two \n's |
|
|
|
|
0
|
|
|
|
|
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# If we get here, $s now contains a block to save. |
|
178
|
|
|
|
|
|
|
# |
|
179
|
|
|
|
|
|
|
$rec .= $s, |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# We continue reading only if there's no substance, |
|
182
|
|
|
|
|
|
|
# ie, no line read starts with a non-comment and no |
|
183
|
|
|
|
|
|
|
# non-comment line read contains non-whitespace |
|
184
|
|
|
|
|
|
|
# |
|
185
|
|
|
|
|
|
|
#$s !~ /^[^#\s]/m and # if no line read starts with |
|
186
|
|
|
|
|
|
|
# $s !~ /^[^#].*\S/m # or contains substance |
|
187
|
|
|
|
|
|
|
#(! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m) and |
|
188
|
|
|
|
|
|
|
# $rec .= "substance found in <$s>\n"), |
|
189
|
|
|
|
|
|
|
! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m) |
|
190
|
|
|
|
|
|
|
); |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
#$s !~ /^\s*[^#\s]/m || # match no line of susbstance |
|
193
|
|
|
|
|
|
|
#$rec .= $s |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
#($rec .= $s), # only "paragraphs"; save everything |
|
196
|
|
|
|
|
|
|
#$s !~ /^\s*[^#\s]/m # but stop when substance seen |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
#$s !~ /^\s*[^#\s]/m # but stop when substance seen |
|
199
|
|
|
|
|
|
|
# and while $s matches no line starting with ^# |
|
200
|
|
|
|
|
|
|
# while every line in $s is either all whitespace |
|
201
|
|
|
|
|
|
|
# or all comment (ie, first non-ws char is #) |
|
202
|
|
|
|
|
|
|
#$s =~ /\S/ # but stop when we see substance |
|
203
|
|
|
|
|
|
|
#$s !~ /\S/ # but stop when we see substance |
|
204
|
|
|
|
|
|
|
# substance means \S on a non-comment line |
|
205
|
|
|
|
|
|
|
# $s !~ /^\S|[^#].*\S/m |
|
206
|
|
|
|
|
|
|
# ! ($s =~ /^[^#\s]/m || $s =~ /^[^#].*\S/m) |
|
207
|
|
|
|
|
|
|
# |
|
208
|
|
|
|
|
|
|
#); # /^\s*[^#\s]/m |
|
209
|
0
|
0
|
0
|
|
|
0
|
defined($s) or |
|
210
|
|
|
|
|
|
|
return $rec || undef; # almost eof or real eof |
|
211
|
0
|
|
|
|
|
0
|
return $rec; |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# XXXX what happens when one file ends prematurely and |
|
214
|
|
|
|
|
|
|
# another begins? does last record for first file get |
|
215
|
|
|
|
|
|
|
# returned glued to beginning of first recond of 2nd file? |
|
216
|
|
|
|
|
|
|
# If more than one file, line numbers normally just accumulate. |
|
217
|
|
|
|
|
|
|
# We want to preserve line numbers within files, so we use this |
|
218
|
|
|
|
|
|
|
# next Perl idiom to cause $. (linenum) to be reset between files. |
|
219
|
|
|
|
|
|
|
# |
|
220
|
|
|
|
|
|
|
#close ARGV if eof; # reset line numbers between files |
|
221
|
|
|
|
|
|
|
} |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# args: record, reference to whitespace lines, reference to real record lines |
|
224
|
|
|
|
|
|
|
# xxx replace \n with NL throughout |
|
225
|
|
|
|
|
|
|
# returns undef when $rec trims to nothing (EOF) |
|
226
|
5
|
|
|
5
|
0
|
1076
|
sub trimlines { my( $rec, $r_wslines, $r_rrlines )=@_; |
|
227
|
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# $rec might legitimately be undefined if called as |
|
229
|
|
|
|
|
|
|
# trimlines(getlines(), ...) |
|
230
|
|
|
|
|
|
|
# |
|
231
|
5
|
|
100
|
|
|
15
|
$rec ||= ''; |
|
232
|
|
|
|
|
|
|
|
|
233
|
5
|
|
|
|
|
19
|
$rec =~ s/^(\s*)//; # '*' guarantees $1 will be defined |
|
234
|
5
|
|
|
|
|
14
|
my $blanksection = $1; |
|
235
|
5
|
|
|
|
|
7
|
my @newlines; |
|
236
|
|
|
|
|
|
|
|
|
237
|
5
|
50
|
|
|
|
32
|
ref($r_wslines) eq 'SCALAR' and # if given, define it |
|
238
|
|
|
|
|
|
|
$$r_wslines = scalar(@newlines = $blanksection =~ /\n/g); |
|
239
|
|
|
|
|
|
|
|
|
240
|
5
|
50
|
|
|
|
42
|
ref($r_rrlines) eq 'SCALAR' and # if given, define it |
|
241
|
|
|
|
|
|
|
$$r_rrlines = scalar(@newlines = $rec =~ /\n/g); |
|
242
|
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
#$$r_rrlines = scalar($rec =~ /$/gm); # xxx why doesn't this work? |
|
244
|
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
# At this point $r_wslines and $r_rrlines (if supplied) are safely |
|
246
|
|
|
|
|
|
|
# defined and ready for return. |
|
247
|
|
|
|
|
|
|
# |
|
248
|
5
|
100
|
|
|
|
20
|
$rec or # empty record (but $r_wslines may be defined) |
|
249
|
|
|
|
|
|
|
return undef; # signal eof-style return |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
#$rec =~ /\n\n$/ and # ok record ending -- this is |
|
252
|
|
|
|
|
|
|
# return $rec; # the usual return |
|
253
|
|
|
|
|
|
|
#$rec =~ s/\n*$/\n\n/; # normalize premature eof ending |
|
254
|
3
|
|
|
|
|
9
|
return $rec; |
|
255
|
|
|
|
|
|
|
} |
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
# returns empty string on success or string beginning "warning:..." |
|
258
|
|
|
|
|
|
|
# third arg (0 or 1) optional |
|
259
|
|
|
|
|
|
|
# elems is returned array of name value pairs |
|
260
|
|
|
|
|
|
|
#DEPRECATED: |
|
261
|
1
|
|
|
1
|
0
|
1
|
sub anvl_recsplit { my( $record, $r_elems, $strict )=@_; |
|
262
|
|
|
|
|
|
|
|
|
263
|
1
|
50
|
|
|
|
3
|
! defined($record) and |
|
264
|
|
|
|
|
|
|
return "needs an ANVL record"; |
|
265
|
1
|
50
|
|
|
|
5
|
ref($r_elems) ne "ARRAY" and |
|
266
|
|
|
|
|
|
|
return "2nd arg must reference an array"; |
|
267
|
|
|
|
|
|
|
|
|
268
|
1
|
|
|
|
|
1
|
my $strict_default = 0; |
|
269
|
1
|
50
|
|
|
|
3
|
! defined($strict) and |
|
270
|
|
|
|
|
|
|
$strict = $strict_default; |
|
271
|
|
|
|
|
|
|
|
|
272
|
1
|
|
|
|
|
2
|
local $_ = $record; |
|
273
|
1
|
|
|
|
|
5
|
s/^\s*//; s/\s*$//; # trim both ends |
|
|
1
|
|
|
|
|
5
|
|
|
274
|
1
|
50
|
|
|
|
5
|
/\n$/ or s/$/\n/; # normalize end of record to \n |
|
275
|
|
|
|
|
|
|
|
|
276
|
1
|
50
|
|
|
|
5
|
/\n\n/ and |
|
277
|
|
|
|
|
|
|
return "record should have no internal blank line(s)"; |
|
278
|
|
|
|
|
|
|
# xxx adjust regexp for ANVLR |
|
279
|
1
|
50
|
|
|
|
5
|
! /^[^\s:][\w ]*:/ and # match against first element |
|
280
|
|
|
|
|
|
|
return "well-formed record begins with a label and colon"; |
|
281
|
|
|
|
|
|
|
|
|
282
|
1
|
50
|
|
|
|
5
|
$anvl_mode ne ANVLR and |
|
283
|
|
|
|
|
|
|
s/^#.*\n//gm; # remove comments plus final \n |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# If we're not in strict parse mode, correct for common error |
|
286
|
|
|
|
|
|
|
# where continued value is not indented. We can pretty safely |
|
287
|
|
|
|
|
|
|
# assume a continued value if a line is flush left and contains |
|
288
|
|
|
|
|
|
|
# no colon at all. |
|
289
|
|
|
|
|
|
|
# |
|
290
|
|
|
|
|
|
|
# This next substitution match needs to be multi-line to avoid |
|
291
|
|
|
|
|
|
|
# more explicit looping. |
|
292
|
|
|
|
|
|
|
# |
|
293
|
|
|
|
|
|
|
# XXX there's probably a more efficient way to do this. |
|
294
|
1
|
|
|
|
|
2
|
my $msg = ""; |
|
295
|
1
|
|
|
|
|
5
|
my $indented = s/^([^\s:][^:]*)$/ $1/gm; |
|
296
|
1
|
50
|
|
|
|
3
|
if ($indented) { |
|
297
|
0
|
0
|
|
|
|
0
|
$strict and |
|
298
|
|
|
|
|
|
|
(@$r_elems = undef), |
|
299
|
|
|
|
|
|
|
return "error: $indented unindented value line(s)"; |
|
300
|
0
|
|
|
|
|
0
|
$msg = "warning: indenting $indented value line(s)"; |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
# if we get here, assume standard continuation lines, and join them |
|
303
|
|
|
|
|
|
|
# (GRANVL style) |
|
304
|
|
|
|
|
|
|
# |
|
305
|
1
|
|
|
|
|
3
|
s/\n\s+/ /g; |
|
306
|
|
|
|
|
|
|
# XXX should have a newline-preserving form of parse? |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Split into array element pairs. Toss first "false" split. |
|
309
|
|
|
|
|
|
|
# xxx buggy limited patterns, how not to match newline |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
# This is the critical splitting step. |
|
312
|
|
|
|
|
|
|
# splits line beginning ..... xxx |
|
313
|
|
|
|
|
|
|
# |
|
314
|
1
|
|
|
|
|
4
|
s/\n$//; # strip final \n |
|
315
|
1
|
|
|
|
|
7
|
(undef, @$r_elems) = split /\n*^([^\s:][\w ]*):\s*/m; |
|
316
|
|
|
|
|
|
|
|
|
317
|
1
|
|
|
|
|
7
|
return $msg; |
|
318
|
|
|
|
|
|
|
} |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# xxxxxxxx respond to 'comments' (def. off), 'autoindent' (def. on), |
|
321
|
|
|
|
|
|
|
# 'anvlr' (def. off), 'granvl' ? |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# This is the closest thing to a reference implementation of an ANVL |
|
324
|
|
|
|
|
|
|
# record parser. |
|
325
|
|
|
|
|
|
|
# It returns "" on success, or "error: ..." or "warning: ..." |
|
326
|
|
|
|
|
|
|
|
|
327
|
10
|
|
|
10
|
0
|
6286
|
sub anvl_recarray { my( $record, $r_elems, $linenum, $o )=@_; |
|
328
|
|
|
|
|
|
|
|
|
329
|
10
|
50
|
|
|
|
31
|
! defined($record) and |
|
330
|
|
|
|
|
|
|
return "error: no input record"; |
|
331
|
10
|
100
|
|
|
|
32
|
ref($r_elems) ne "ARRAY" and |
|
332
|
|
|
|
|
|
|
return "error: 2nd arg must reference an array"; |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
# Note: this input $linenum is pure digits, while $lineno on |
|
335
|
|
|
|
|
|
|
# output is a combination of digits and type (':' or '#') |
|
336
|
|
|
|
|
|
|
# |
|
337
|
9
|
100
|
|
|
|
23
|
defined($linenum) or $linenum = 1; |
|
338
|
9
|
50
|
|
|
|
30
|
$linenum =~ /\D/ and |
|
339
|
|
|
|
|
|
|
return "error: 3rd arg ($linenum) must be a positive integer"; |
|
340
|
|
|
|
|
|
|
# XXX can't this be optimized a bit to keep defaults around? |
|
341
|
9
|
|
66
|
|
|
34
|
$o ||= anvl_opt_defaults(); |
|
342
|
9
|
50
|
|
|
|
26
|
ref($o) ne "HASH" and |
|
343
|
|
|
|
|
|
|
return "error: 4th arg must reference a hash"; |
|
344
|
|
|
|
|
|
|
|
|
345
|
9
|
|
|
|
|
12
|
local $_ = $record; # localizing $_ prevents modifying global $_ |
|
346
|
|
|
|
|
|
|
|
|
347
|
9
|
|
|
|
|
34
|
s/^\s*//; s/\s*$//; # trim both ends |
|
|
9
|
|
|
|
|
172
|
|
|
348
|
9
|
50
|
|
|
|
42
|
/\n$/ or s/$/\n/; # normalize end of record to \n |
|
349
|
|
|
|
|
|
|
#s/\n?$/\nEOR:/; # whether record ends in \n or not, normalize |
|
350
|
|
|
|
|
|
|
# # end of record to \nEOR: (note no \n after \nEOR:) |
|
351
|
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# Reject some malformed cases. |
|
353
|
|
|
|
|
|
|
# |
|
354
|
|
|
|
|
|
|
#/\n\n/ and |
|
355
|
|
|
|
|
|
|
# return "error: record should have no internal blank line(s)"; |
|
356
|
|
|
|
|
|
|
# xxx adjust regexp for ANVLR |
|
357
|
|
|
|
|
|
|
# XXX fix so record can consist of nothing but comments and/or whitespace; |
|
358
|
|
|
|
|
|
|
# comments _may_ be recognized in regular records, but not in this kind |
|
359
|
|
|
|
|
|
|
#/^[^\s:][\w ]*:/m or # match against first element |
|
360
|
|
|
|
|
|
|
# return "error: record ($_) should begin with a label and colon"; |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
# Any other unindented line not containing a colon will either |
|
363
|
|
|
|
|
|
|
# cause an error or will be automatically indented. |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# xxx what about $anvl_mode ne ANVLR and?? |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# Now we synthesize stuff (line numbers and pseudo-element names for |
|
368
|
|
|
|
|
|
|
# any comments) in order to create a uniform structure on each line, |
|
369
|
|
|
|
|
|
|
# so that we can finally call 'split' to bust apart that structure |
|
370
|
|
|
|
|
|
|
# into a Perl array in which every 3-element group corresponds to |
|
371
|
|
|
|
|
|
|
# 1. a line number, |
|
372
|
|
|
|
|
|
|
# 2. a label, and |
|
373
|
|
|
|
|
|
|
# 3. a value. |
|
374
|
|
|
|
|
|
|
# |
|
375
|
|
|
|
|
|
|
# First insert a line number and ":" in front of each line. |
|
376
|
|
|
|
|
|
|
# |
|
377
|
9
|
|
|
|
|
13
|
my $num = $linenum; |
|
378
|
9
|
|
|
|
|
55
|
s/^/ $num++ . ":" /gem; # put a line number on each line |
|
|
38
|
|
|
|
|
121
|
|
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# Remove blank lines, now that line numbers have been preserved. |
|
381
|
|
|
|
|
|
|
# |
|
382
|
9
|
|
|
|
|
48
|
s/^\d+:[^\S\n]*\n//gm; |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Now, if we're not deleting comments, insert a pseudo-element |
|
385
|
|
|
|
|
|
|
# name '#:' in front of each comment while also changing the ':' |
|
386
|
|
|
|
|
|
|
# after the line numer to '#'. This means that all lines will |
|
387
|
|
|
|
|
|
|
# begin with a line number followed by ':' for real elements or |
|
388
|
|
|
|
|
|
|
# by '#' for comment elements. Eg, '# foo' on line 3 becomes |
|
389
|
|
|
|
|
|
|
# '3##:# foo', which conforms to the eventual split pattern we |
|
390
|
|
|
|
|
|
|
# rely on (at end). |
|
391
|
|
|
|
|
|
|
# |
|
392
|
|
|
|
|
|
|
# xxx problem with line #K:value, which becomes, eg, 4##:K:value |
|
393
|
9
|
100
|
66
|
|
|
61
|
$$o{comments} and # if we're keeping comments |
|
394
|
|
|
|
|
|
|
s/^(\d+):#/$1##:/gm, 1 |
|
395
|
|
|
|
|
|
|
# ^^^ |
|
396
|
|
|
|
|
|
|
# 123 |
|
397
|
|
|
|
|
|
|
# 1=separator, 2=pseudo-name, |
|
398
|
|
|
|
|
|
|
# 3=original value minus '#' starts after : |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
or # else completely delete comments |
|
401
|
|
|
|
|
|
|
s/^\d+:#.*\n//gm # up to and including final \n |
|
402
|
|
|
|
|
|
|
; |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
# Return if nothing's left after deleting blank lines and comments. |
|
405
|
|
|
|
|
|
|
# |
|
406
|
9
|
50
|
|
|
|
31
|
/^\s*$/s and |
|
407
|
|
|
|
|
|
|
return "warning: record at line $linenum has no content"; |
|
408
|
|
|
|
|
|
|
|
|
409
|
9
|
|
|
|
|
14
|
my $msg = ""; # default return message |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# If we're not in strict parse mode, correct for common error |
|
412
|
|
|
|
|
|
|
# where continued value is not indented. We can pretty safely |
|
413
|
|
|
|
|
|
|
# assume a continued value if a line is flush left and contains |
|
414
|
|
|
|
|
|
|
# no colon at all. |
|
415
|
|
|
|
|
|
|
# |
|
416
|
|
|
|
|
|
|
# This next substitution match is multi-line to avoid explicit |
|
417
|
|
|
|
|
|
|
# looping (yyy is this an efficient way to do it?). It indents |
|
418
|
|
|
|
|
|
|
# by one space any line starting without a space or colon and |
|
419
|
|
|
|
|
|
|
# that has no instance of a colon until end of line. |
|
420
|
|
|
|
|
|
|
# |
|
421
|
9
|
|
|
|
|
57
|
my $indented = s/^(\d+:)([^\s:][^:]*)$/$1 $2/gm; |
|
422
|
9
|
100
|
|
|
|
21
|
if ($indented) { |
|
423
|
3
|
100
|
|
|
|
12
|
unless ($$o{autoindent}) { |
|
424
|
1
|
|
|
|
|
6
|
@$r_elems = undef; # XXXXX isn't this too much? |
|
425
|
1
|
|
|
|
|
7
|
return "error: $indented unindented value line(s)"; |
|
426
|
|
|
|
|
|
|
} |
|
427
|
2
|
|
|
|
|
7
|
$msg = "warning: indenting $indented value line(s)"; |
|
428
|
|
|
|
|
|
|
} |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
# Now we join the (normalized) continuation lines (GRANVL style) |
|
431
|
|
|
|
|
|
|
# so each element-value pair is on one line. The + in [ \t]+ is |
|
432
|
|
|
|
|
|
|
# very important; we can't use \s+ here because \s matches a \n. |
|
433
|
|
|
|
|
|
|
# |
|
434
|
8
|
|
|
|
|
41
|
s/\n\d+:[ \t]+/ /g; |
|
435
|
|
|
|
|
|
|
#s/\n\d+:\s+/ /g; |
|
436
|
|
|
|
|
|
|
# XXX should we have a newline-preserving form of parse? |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# Get rid of initial whitespace from all non-comment GRANVL values. |
|
439
|
8
|
|
|
|
|
66
|
s/^(\d+:[^\s:][^:]*:)[ \t]+/$1/gm; |
|
440
|
|
|
|
|
|
|
# xxx problem with line #K:value, which becomes, eg, 4##:K:value |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
# Split into array of element pairs. Toss first "false" split. |
|
443
|
|
|
|
|
|
|
# xxx buggy limited patterns, how not to match newline |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
# This is the critical splitting step. |
|
446
|
|
|
|
|
|
|
# splits line beginning ..... xxx |
|
447
|
|
|
|
|
|
|
# |
|
448
|
8
|
|
|
|
|
27
|
s/\n$//; # strip final \n |
|
449
|
8
|
|
|
|
|
177
|
@$r_elems = ('', 'ANVL', # 3rd elem of 1st triple is |
|
450
|
|
|
|
|
|
|
# provided by first element resulting from the split |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
split /\n*^(\d+[:#])([^\s:][^:]*):/m |
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
# xxx problem with line #K:value, which becomes, eg, 4##:K:value |
|
455
|
|
|
|
|
|
|
); |
|
456
|
|
|
|
|
|
|
|
|
457
|
8
|
50
|
|
|
|
33
|
defined($$r_elems[2]) or |
|
458
|
|
|
|
|
|
|
return "error: split failed ($_) on '$record', " . |
|
459
|
|
|
|
|
|
|
"record at line $linenum"; |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# If there was a value with no label at the start of the record, |
|
462
|
|
|
|
|
|
|
# we deem that interesting enough to keep even though it's not |
|
463
|
|
|
|
|
|
|
# ANVL-compliant; the caller can prevent this by turning off |
|
464
|
|
|
|
|
|
|
# 'autoindent', the processing for which will either flag this as |
|
465
|
|
|
|
|
|
|
# an error or will have inserted one space in front of the value. |
|
466
|
|
|
|
|
|
|
# |
|
467
|
8
|
100
|
|
|
|
23
|
$$r_elems[2] =~ /^(\d+): (.*)/ and |
|
468
|
|
|
|
|
|
|
($$r_elems[0], $$r_elems[2]) = ($1, $2); |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
#(undef, @$r_elems) = split /\n*^([^\s:][\w ]*):\s*/m; |
|
471
|
|
|
|
|
|
|
# yyy an approach once considered but not used |
|
472
|
|
|
|
|
|
|
# $num = $.; # linenum |
|
473
|
|
|
|
|
|
|
# s/^/ $num++ . ":" /e while (/\n/g); |
|
474
|
|
|
|
|
|
|
# /\G ($N\#.*\n)+ (?=$N[^\#]) /gx # comment block |
|
475
|
|
|
|
|
|
|
# /\G ($N\S.*\n)+ (?=$N[^\S]) /gx # element on one or more lines |
|
476
|
|
|
|
|
|
|
# /\G (#.*\n)+(?=[^#])/g |
|
477
|
|
|
|
|
|
|
# /^#.*?\n[^#]/s # (?=lookahead) |
|
478
|
|
|
|
|
|
|
#return "_=$_\n" . join(", ", @$r_elems); # to check results |
|
479
|
|
|
|
|
|
|
|
|
480
|
8
|
|
|
|
|
51
|
return $msg; |
|
481
|
|
|
|
|
|
|
} |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# XXXXXX for consolidating a:b and a:c into a:b;c, MAJOR constraint |
|
484
|
|
|
|
|
|
|
# is that b and c CANNOT contain '|' or we refuse... |
|
485
|
|
|
|
|
|
|
|
|
486
|
3
|
|
|
3
|
0
|
2181
|
sub anvl_arrayhash { my( $r_elems, $r_hash, $first_only )=@_; |
|
487
|
|
|
|
|
|
|
|
|
488
|
3
|
100
|
|
|
|
13
|
ref($r_elems) ne "ARRAY" and |
|
489
|
|
|
|
|
|
|
return "error: 1st arg must reference an array"; |
|
490
|
2
|
100
|
|
|
|
7
|
ref($r_hash) ne "HASH" and |
|
491
|
|
|
|
|
|
|
return "error: 2nd arg must reference a hash"; |
|
492
|
1
|
50
|
|
|
|
3
|
defined($first_only) or $first_only = 0; |
|
493
|
|
|
|
|
|
|
|
|
494
|
1
|
|
|
|
|
3
|
my $num_elems = scalar @$r_elems; |
|
495
|
1
|
50
|
|
|
|
4
|
$num_elems % 3 != 0 and |
|
496
|
|
|
|
|
|
|
return "error: input array length must be a multiple of 3"; |
|
497
|
|
|
|
|
|
|
|
|
498
|
1
|
50
|
|
|
|
3
|
$num_elems < 1 and return ""; # no elements, we're done |
|
499
|
|
|
|
|
|
|
|
|
500
|
1
|
|
|
|
|
3
|
my $msg = ''; # xxx needed? |
|
501
|
1
|
|
|
|
|
2
|
my ($name, $value, $n, $v); |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
# We know there must be at least 3 elements, so it's safe to check |
|
504
|
|
|
|
|
|
|
# the special first triple (index 2 is the only one we look at now) |
|
505
|
|
|
|
|
|
|
# for an initial unlabeled record element (non-standard ANVL). |
|
506
|
|
|
|
|
|
|
# If we find something, we make up the name, '_'. |
|
507
|
|
|
|
|
|
|
# |
|
508
|
1
|
50
|
|
|
|
4
|
if ($$r_elems[2]) { # first triple is special |
|
509
|
1
|
|
|
|
|
2
|
$name = '_'; |
|
510
|
0
|
|
|
|
|
0
|
! defined $$r_hash{$name} and |
|
511
|
|
|
|
|
|
|
$$r_hash{$name} = [ 0 ] # initialize array |
|
512
|
|
|
|
|
|
|
or |
|
513
|
1
|
50
|
50
|
|
|
12
|
push @{ $$r_hash{$name} }, 0 # add to array |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
1
|
|
|
|
|
5
|
for ($n = 3; $n < $num_elems; $n += 3) { |
|
517
|
|
|
|
|
|
|
|
|
518
|
4
|
|
|
|
|
7
|
$name = $$r_elems[$n + 1]; |
|
519
|
1
|
|
|
|
|
4
|
! defined $$r_hash{$name} and |
|
520
|
|
|
|
|
|
|
$$r_hash{$name} = [ $n ] # initialize array |
|
521
|
|
|
|
|
|
|
or |
|
522
|
4
|
100
|
100
|
|
|
24
|
push @{ $$r_hash{$name} }, $n # add to array |
|
523
|
|
|
|
|
|
|
; |
|
524
|
|
|
|
|
|
|
} |
|
525
|
1
|
|
|
|
|
3
|
return $msg; |
|
526
|
|
|
|
|
|
|
} |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
# ANVL value split |
|
529
|
|
|
|
|
|
|
# xxx rename to anvl_valarray? |
|
530
|
|
|
|
|
|
|
# returns empty string on success or string beginning "warning:..." |
|
531
|
|
|
|
|
|
|
# r_svals is reference to an array that will be filled upon return |
|
532
|
5
|
|
|
5
|
0
|
5052
|
sub anvl_valsplit { my( $value, $r_svals )=@_; |
|
533
|
|
|
|
|
|
|
|
|
534
|
5
|
50
|
|
|
|
13
|
! defined($value) and |
|
535
|
|
|
|
|
|
|
return "needs an ANVL value"; |
|
536
|
5
|
100
|
|
|
|
16
|
ref($r_svals) ne "ARRAY" and |
|
537
|
|
|
|
|
|
|
return "2nd arg must reference an array"; |
|
538
|
4
|
|
|
|
|
5
|
local $_; |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
#xxx print "r_svals=$r_svals\n"; |
|
541
|
|
|
|
|
|
|
#xxx print "value=$value\n"; |
|
542
|
4
|
|
|
|
|
7
|
my $warning = ""; # xxx used? |
|
543
|
|
|
|
|
|
|
#my $ret_subvalues = \$_[1]; |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# Assume value is all on one line and split it. |
|
546
|
|
|
|
|
|
|
#my @svals = split /\|/, $value; |
|
547
|
4
|
|
|
|
|
23
|
@$r_svals = split /\|/, $value; |
|
548
|
|
|
|
|
|
|
#$_[1] = \@svals; |
|
549
|
|
|
|
|
|
|
$_ = [ split(/;/, $_) ] # create array of arrays |
|
550
|
4
|
|
|
|
|
41
|
for (@$r_svals); |
|
551
|
|
|
|
|
|
|
#xxxprint("svals=", join(", ", @$_), "\n") for (@$r_svals); |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# xxxx need to look for all 3 levels: (change spec) |
|
554
|
|
|
|
|
|
|
# XXXXXXX value ::= one or more svals (sval1 | sval2 | ...) |
|
555
|
|
|
|
|
|
|
# XXXXXXX sval ::= one or more rvals (rval1 ; rval2 ; ...) |
|
556
|
|
|
|
|
|
|
# XXXXXXX rval ::= one or more qvals (qval1 (=) qval2 (=) ...) |
|
557
|
|
|
|
|
|
|
# where s=sub, r=repeated, q=equivalent |
|
558
|
|
|
|
|
|
|
# XXXXXXX or ?? rval ::= one or more avals (aval1 (=) aval2 (=) ...) |
|
559
|
4
|
50
|
|
|
|
19
|
return $warning ? "warning: $warning" : ""; |
|
560
|
|
|
|
|
|
|
} |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# Create record hash, elem is key, value is value |
|
563
|
|
|
|
|
|
|
# |
|
564
|
2
|
|
|
2
|
0
|
2113
|
sub anvl_rechash { my( $record, $r_hash, $strict )=@_; |
|
565
|
|
|
|
|
|
|
|
|
566
|
2
|
50
|
|
|
|
8
|
! defined($record) and |
|
567
|
|
|
|
|
|
|
return "needs an ANVL record"; |
|
568
|
2
|
100
|
|
|
|
6
|
ref($r_hash) ne "HASH" and |
|
569
|
|
|
|
|
|
|
return "2nd arg must reference a hash"; |
|
570
|
|
|
|
|
|
|
|
|
571
|
1
|
|
|
|
|
3
|
my $msg = ""; |
|
572
|
1
|
|
|
|
|
1
|
my @elems; |
|
573
|
1
|
50
|
|
|
|
5
|
($msg = anvl_recsplit($record, \@elems, $strict)) and |
|
574
|
|
|
|
|
|
|
return "anvl_recsplit: $msg"; |
|
575
|
|
|
|
|
|
|
|
|
576
|
1
|
|
|
|
|
3
|
my ($name, $value); |
|
577
|
1
|
|
|
|
|
2
|
while (1) { |
|
578
|
2
|
|
|
|
|
4
|
$name = shift @elems; |
|
579
|
2
|
100
|
|
|
|
6
|
last unless defined $name; # nothing left |
|
580
|
1
|
|
|
|
|
2
|
$value = shift @elems; |
|
581
|
1
|
50
|
|
|
|
4
|
if (! defined $$r_hash{$name}) { |
|
582
|
|
|
|
|
|
|
# Nothing there, so store scalar and continue. |
|
583
|
1
|
|
|
|
|
3
|
$$r_hash{$name} = $value; # 1st value (non-array) |
|
584
|
1
|
|
|
|
|
2
|
next; |
|
585
|
|
|
|
|
|
|
} |
|
586
|
|
|
|
|
|
|
# If we get here there's something's already there. |
|
587
|
|
|
|
|
|
|
# Don't overwrite if we're in $strict mode. |
|
588
|
|
|
|
|
|
|
# xxx document this |
|
589
|
|
|
|
|
|
|
# |
|
590
|
0
|
0
|
|
|
|
0
|
$strict and next; # don't overwrite |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
# XXXXXxxxx make adding a value policy-driven, eg, |
|
593
|
|
|
|
|
|
|
# "add" could mean (a) replace, (b) push on end array, |
|
594
|
|
|
|
|
|
|
# (c) push on start of array, (d) string concatenation, |
|
595
|
|
|
|
|
|
|
# (d) error. |
|
596
|
|
|
|
|
|
|
# xxx should anvl_rechash save line numbers? |
|
597
|
|
|
|
|
|
|
# xxx should anvl_recsplit save line numbers? |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
# Whatever is there could be a scalar or an array reference. |
|
600
|
|
|
|
|
|
|
# If not a reference, create an anonymous array, put a |
|
601
|
|
|
|
|
|
|
# scalar into it, and refer to the array. |
|
602
|
|
|
|
|
|
|
# |
|
603
|
0
|
|
|
|
|
0
|
my $v = $$r_hash{$name}; # add to current |
|
604
|
0
|
0
|
|
|
|
0
|
$v = [ $v ] # make an array if currently |
|
605
|
|
|
|
|
|
|
unless ref $v; # there's only one value |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
# If we get here, we have a reference to an array, |
|
608
|
|
|
|
|
|
|
# possibly empty. Either way, we can push onto it. |
|
609
|
|
|
|
|
|
|
# |
|
610
|
0
|
|
|
|
|
0
|
push @$v, $value; |
|
611
|
|
|
|
|
|
|
} |
|
612
|
1
|
|
|
|
|
3
|
return $msg; |
|
613
|
|
|
|
|
|
|
} |
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# [ !"#\$%&'\(\)\*\+,/:;<=>\?@\[\\\]\|\0] |
|
616
|
|
|
|
|
|
|
our %anvl_decoding = ( |
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
'sp' => ' ', # decodes to space (0x20) |
|
619
|
|
|
|
|
|
|
'ex' => '!', # decodes to ! (0x21) |
|
620
|
|
|
|
|
|
|
'dq' => '"', # decodes to " (0x22) |
|
621
|
|
|
|
|
|
|
'ns' => '#', # decodes to # (0x23) |
|
622
|
|
|
|
|
|
|
'do' => '$', # decodes to $ (0x24) |
|
623
|
|
|
|
|
|
|
'pe' => '%', # decodes to % (0x25) |
|
624
|
|
|
|
|
|
|
'am' => '&', # decodes to & (0x26) |
|
625
|
|
|
|
|
|
|
'sq' => "'", # decodes to ' (0x27) |
|
626
|
|
|
|
|
|
|
'op' => '(', # decodes to ( (0x28) |
|
627
|
|
|
|
|
|
|
'cp' => ')', # decodes to ) (0x29) |
|
628
|
|
|
|
|
|
|
'as' => '*', # decodes to * (0x2a) |
|
629
|
|
|
|
|
|
|
'pl' => '+', # decodes to + (0x2b) |
|
630
|
|
|
|
|
|
|
'co' => ',', # decodes to , (0x2c) |
|
631
|
|
|
|
|
|
|
'sl' => '/', # decodes to / (0x2f) |
|
632
|
|
|
|
|
|
|
'cn' => ':', # decodes to : (0x3a) |
|
633
|
|
|
|
|
|
|
'sc' => ';', # decodes to ; (0x3b) |
|
634
|
|
|
|
|
|
|
'lt' => '<', # decodes to < (0x3c) |
|
635
|
|
|
|
|
|
|
'eq' => '=', # decodes to = (0x3d) |
|
636
|
|
|
|
|
|
|
'gt' => '>', # decodes to > (0x3e) |
|
637
|
|
|
|
|
|
|
'qu' => '?', # decodes to ? (0x3f) |
|
638
|
|
|
|
|
|
|
'at' => '@', # decodes to @ (0x40) |
|
639
|
|
|
|
|
|
|
'ox' => '[', # decodes to [ (0x5b) |
|
640
|
|
|
|
|
|
|
'ls' => '\\', # decodes to \ (0x5c) |
|
641
|
|
|
|
|
|
|
'cx' => ']', # decodes to ] (0x5d) |
|
642
|
|
|
|
|
|
|
'vb' => '|', # decodes to | (0x7c) |
|
643
|
|
|
|
|
|
|
'nu' => "\0", # decodes to null (0x00) |
|
644
|
|
|
|
|
|
|
); |
|
645
|
|
|
|
|
|
|
# XXXXXXX need way to encode newlines (using '\n' in interim) |
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
our %anvl_encoding; |
|
648
|
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
#%cn : |
|
650
|
|
|
|
|
|
|
#%sc ; |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
# xxxxx handle these separately |
|
653
|
|
|
|
|
|
|
# # XXXX remove %% from erc/anvlspec? |
|
654
|
|
|
|
|
|
|
# '%' => '%pe', # decodes to % (0x25) xxxx do this first? |
|
655
|
|
|
|
|
|
|
# '_' => '', # a non-character used as a syntax shim |
|
656
|
|
|
|
|
|
|
# '{' => '', # a non-character that begins an expansion block |
|
657
|
|
|
|
|
|
|
# '}' => '', # a non-character that ends an expansion block |
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
# Takes a single arg. |
|
660
|
|
|
|
|
|
|
sub anvl_decode { |
|
661
|
|
|
|
|
|
|
|
|
662
|
5
|
|
50
|
5
|
0
|
18
|
local $_ = shift(@_) || ''; |
|
663
|
|
|
|
|
|
|
|
|
664
|
5
|
|
|
|
|
18
|
pos() = 0; # reset \G for $_ just to be safe |
|
665
|
5
|
|
|
|
|
29
|
while (/(?=\%\{)/g) { # lookahead; \G matches just before |
|
666
|
7
|
|
|
|
|
11
|
my $p = pos(); # note \G position before it changes |
|
667
|
7
|
100
|
|
|
|
41
|
s/\G \%\{ (.*?) \%\}//xs # 's' modifier makes . match \n |
|
668
|
|
|
|
|
|
|
or last; # if no closing brace, skip match |
|
669
|
6
|
|
|
|
|
14
|
my $exp_block = $1; # save removed expansion block |
|
670
|
6
|
|
|
|
|
33
|
$exp_block =~ s/\s+//g; # strip it of all whitespace |
|
671
|
6
|
|
|
|
|
11
|
pos() = $p; # revert \G to where we started and |
|
672
|
6
|
|
|
|
|
34
|
s/\G/$exp_block/; # re-insert changed expansion block |
|
673
|
|
|
|
|
|
|
} |
|
674
|
5
|
|
|
|
|
14
|
s/\%[}{]//g; # remove any remaining unmatched |
|
675
|
5
|
|
|
|
|
8
|
s/\%_//g; # xxx %_ -> '' |
|
676
|
5
|
|
|
|
|
11
|
s/\%\%/\%pe/g; # xxx ??? xxxx??? |
|
677
|
|
|
|
|
|
|
# decode %XY where XY together don't form a valid pair of hex digits |
|
678
|
5
|
|
|
|
|
17
|
s/\%([g-z][a-z]|[a-z][g-z])/$anvl_decoding{$1}/g; |
|
679
|
5
|
|
|
|
|
25
|
return $_; |
|
680
|
|
|
|
|
|
|
} |
|
681
|
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
# xxx encoding should be context-sensitive, eg, name, value |
|
683
|
1
|
|
|
1
|
0
|
5515
|
sub anvl_encode { my( $s )=@_; |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
# XXXX just define this in the module?? |
|
686
|
1
|
50
|
|
|
|
7
|
unless (%anvl_encoding) { # one-time definition |
|
687
|
|
|
|
|
|
|
# This just defines an inverse mapping so we can encode. |
|
688
|
|
|
|
|
|
|
$anvl_encoding{$anvl_decoding{$_}} = $_ |
|
689
|
1
|
|
|
|
|
149
|
for (keys %anvl_decoding); |
|
690
|
|
|
|
|
|
|
} |
|
691
|
|
|
|
|
|
|
$s =~ |
|
692
|
1
|
|
|
|
|
14
|
s/([ !\"#\$\%&'\(\)\*\+,\/:;<=>\?@\[\\\]\|\0])/\%$anvl_encoding{$1}/g; |
|
693
|
1
|
|
|
|
|
10
|
return $s; |
|
694
|
|
|
|
|
|
|
} |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# return $name in natural word order, using ANVL inversion points |
|
697
|
|
|
|
|
|
|
# repeat for each final comma present |
|
698
|
6
|
|
|
6
|
0
|
21
|
sub anvl_name_naturalize { my( $name )=@_; |
|
699
|
|
|
|
|
|
|
|
|
700
|
6
|
|
50
|
|
|
14
|
$name ||= ''; |
|
701
|
6
|
50
|
|
|
|
38
|
$name =~ /^\s*$/ and return $name; # empty |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
# "McCartney, Paul, Sir,," |
|
704
|
|
|
|
|
|
|
# a, b, c, d, e,,, -> e d c a, b |
|
705
|
6
|
|
|
|
|
16
|
my $prefix = ''; |
|
706
|
6
|
|
|
|
|
42
|
while ($name =~ s/,\s*$//) { |
|
707
|
9
|
100
|
|
|
|
106
|
$name =~ s/^(.*),\s*([^,]+)(,*$)/$1$3/ and |
|
708
|
|
|
|
|
|
|
$prefix .= $2 . ' '; |
|
709
|
|
|
|
|
|
|
} |
|
710
|
6
|
|
|
|
|
43
|
return $prefix . $name; |
|
711
|
|
|
|
|
|
|
} |
|
712
|
|
|
|
|
|
|
|
|
713
|
0
|
|
|
0
|
0
|
0
|
sub anvl_summarize { my( @nodes )=@_; } |
|
714
|
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
# XXXXX doesn't this really belong in an ERC.pm module? |
|
716
|
|
|
|
|
|
|
# |
|
717
|
|
|
|
|
|
|
# ordered list of kernel element names |
|
718
|
|
|
|
|
|
|
our @kernel_labels = qw( |
|
719
|
|
|
|
|
|
|
who |
|
720
|
|
|
|
|
|
|
what |
|
721
|
|
|
|
|
|
|
when |
|
722
|
|
|
|
|
|
|
where |
|
723
|
|
|
|
|
|
|
how |
|
724
|
|
|
|
|
|
|
why |
|
725
|
|
|
|
|
|
|
huh |
|
726
|
|
|
|
|
|
|
); |
|
727
|
|
|
|
|
|
|
# |
|
728
|
|
|
|
|
|
|
# This routine inspects and possibly modifies in place the kind of element |
|
729
|
|
|
|
|
|
|
# array resulting from a call to anvl_recarray(), which splits and ANVL |
|
730
|
|
|
|
|
|
|
# record. It is useful for transforming short form ERC elements into full |
|
731
|
|
|
|
|
|
|
# form elements, for example, to expand "erc:a|b|c|d" into the equivalent, |
|
732
|
|
|
|
|
|
|
# "erc:\nwho:a\nwhat:b\nwhen:c\nwhere:d". |
|
733
|
|
|
|
|
|
|
# It returns the empty string on success, otherwise an error message. |
|
734
|
|
|
|
|
|
|
# |
|
735
|
4
|
|
|
4
|
0
|
1636
|
sub erc_anvl_expand_array { my( $r_elems )=@_; |
|
736
|
|
|
|
|
|
|
|
|
737
|
4
|
|
|
4
|
|
60
|
use File::ANVL; |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
5380
|
|
|
738
|
4
|
|
|
|
|
4
|
my ($lineno, $name, $value, $msg, @svals, $sval); |
|
739
|
4
|
|
|
|
|
5
|
my $me = 'erc_anvl_expand_array'; |
|
740
|
4
|
|
|
|
|
5
|
my $i = 3; # skip first 3 elems (anvl array preamble) |
|
741
|
4
|
|
|
|
|
7
|
while (1) { |
|
742
|
21
|
|
|
|
|
23
|
$lineno = $$r_elems[$i++]; |
|
743
|
21
|
|
100
|
|
|
45
|
$name = $$r_elems[$i++] || ''; |
|
744
|
21
|
|
100
|
|
|
41
|
$value = $$r_elems[$i++] || ''; |
|
745
|
21
|
100
|
|
|
|
32
|
last unless defined $lineno; # end of record |
|
746
|
|
|
|
|
|
|
next # skip unless we have erc-type thing |
|
747
|
17
|
100
|
100
|
|
|
53
|
if ($name ne 'erc' || $value =~ /^\s*$/); |
|
748
|
|
|
|
|
|
|
#if ($name !~ /^erc\b/ || $value =~ /^\s*$/); |
|
749
|
|
|
|
|
|
|
# xxx should do this for full generality |
|
750
|
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
# If here, we have an erc-type thing with a non-empty value. |
|
752
|
|
|
|
|
|
|
# |
|
753
|
2
|
50
|
|
|
|
8
|
($msg = anvl_valsplit($value, \@svals)) and |
|
754
|
|
|
|
|
|
|
return "error: $me: anvl_valsplit: $msg"; |
|
755
|
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
# XXXX only doing straight "erc" (eg, not erc-about) |
|
757
|
2
|
|
|
|
|
3
|
my $j = 0; |
|
758
|
2
|
|
|
|
|
4
|
my @extras = (); |
|
759
|
|
|
|
|
|
|
# If we exceed known labels, we'll re-use last known label. |
|
760
|
2
|
|
|
|
|
4
|
my $unknown = $kernel_labels[$#kernel_labels]; |
|
761
|
2
|
|
|
|
|
3
|
foreach $sval (@svals) { |
|
762
|
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
# xxx not (yet) tranferring subvalue structure |
|
764
|
|
|
|
|
|
|
# to anvl_om or other conversion |
|
765
|
|
|
|
|
|
|
# Recall that each $sval is itself a reference to |
|
766
|
|
|
|
|
|
|
# an array of subvalues (often just one element). |
|
767
|
|
|
|
|
|
|
# |
|
768
|
8
|
|
33
|
|
|
65
|
push @extras, # trust kernel_labels order |
|
769
|
|
|
|
|
|
|
$lineno, |
|
770
|
|
|
|
|
|
|
$kernel_labels[$j++] || $unknown, |
|
771
|
|
|
|
|
|
|
join('; ', # trim ends of subvalues |
|
772
|
|
|
|
|
|
|
map(m/^\s*(.*?)\s*$/, @$sval) |
|
773
|
|
|
|
|
|
|
); |
|
774
|
|
|
|
|
|
|
} |
|
775
|
|
|
|
|
|
|
# Finally, replace our $value element with '' and append |
|
776
|
|
|
|
|
|
|
# the new extra values we've just expanded. |
|
777
|
2
|
|
|
|
|
15
|
splice @$r_elems, $i-1, 1, |
|
778
|
|
|
|
|
|
|
'', # replaces $value we just used up |
|
779
|
|
|
|
|
|
|
@extras; # adds new elements from $value |
|
780
|
|
|
|
|
|
|
} |
|
781
|
4
|
|
|
|
|
12
|
return ''; # success |
|
782
|
|
|
|
|
|
|
} |
|
783
|
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX============= |
|
785
|
|
|
|
|
|
|
# xxx checkm _in_ obj1 obj2 ... --> returns noids |
|
786
|
|
|
|
|
|
|
# xxx checkm _out_ id1 id2 ... --> returns objects |
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX============= |
|
789
|
|
|
|
|
|
|
# xxx do metadata scan of object before ingest and confirm with user that |
|
790
|
|
|
|
|
|
|
# the object is correctly identified. This could even be done remotely. |
|
791
|
|
|
|
|
|
|
# Start with informal staff service for depositing objects, returning a |
|
792
|
|
|
|
|
|
|
# short url to a stable object, and not clogging up allstaff inboxes with |
|
793
|
|
|
|
|
|
|
# huge attachments. Also applies to any number of draft docs for review |
|
794
|
|
|
|
|
|
|
# but in temporary storage (but stable). |
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX============= |
|
797
|
|
|
|
|
|
|
# xxx do id generator service with 'expiring' ids. To mint, you tell us |
|
798
|
|
|
|
|
|
|
# who you are first. To get a perm. id, you agree to use your minted id |
|
799
|
|
|
|
|
|
|
# and bind it within N months. We track, and warn you several times |
|
800
|
|
|
|
|
|
|
# until N months as elapsed and then reclaim/recycle the id. |
|
801
|
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
############################################ |
|
803
|
|
|
|
|
|
|
# Output Multiplexer routines |
|
804
|
|
|
|
|
|
|
############################################ |
|
805
|
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
# #$erc = "erc: Smith, J.|The Whole Truth|2004|http://example.com/foo/bar"; |
|
807
|
|
|
|
|
|
|
# $errmsg = File::ERC::erc_anvl2erc_turtle ($erc, $rec); |
|
808
|
|
|
|
|
|
|
# $errmsg and |
|
809
|
|
|
|
|
|
|
# print("$errmsg\n") |
|
810
|
|
|
|
|
|
|
# or |
|
811
|
|
|
|
|
|
|
# print("turtle record:\n$rec\n") |
|
812
|
|
|
|
|
|
|
# ; |
|
813
|
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
# xxx anvl_fmt not consistent with om_anvl! |
|
815
|
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
# Input file(s) from ARGV. |
|
817
|
|
|
|
|
|
|
|
|
818
|
0
|
|
|
0
|
0
|
|
sub anvl_om { my( $om, $o, $get_anvl ) = (shift, shift, shift); |
|
819
|
|
|
|
|
|
|
|
|
820
|
0
|
0
|
|
|
|
|
return "anvl_om: 1st arg not an OM object" |
|
821
|
|
|
|
|
|
|
if ref($om) !~ /^File::OM::/; |
|
822
|
0
|
|
|
|
|
|
my $p = $om->{outhandle}; # whether 'print' status or small |
|
823
|
0
|
|
0
|
|
|
|
$o ||= anvl_opt_defaults(); |
|
824
|
0
|
|
0
|
|
|
|
$get_anvl ||= File::ANVL::make_get_anvl(); # xxx set input here? |
|
825
|
|
|
|
|
|
|
# XXX test return value! |
|
826
|
|
|
|
|
|
|
|
|
827
|
0
|
|
|
|
|
|
my $s = ''; # output strings are returned to $s |
|
828
|
0
|
0
|
|
|
|
|
my $st = $p ? 1 : ''; # returns (stati or strings) accumulate |
|
829
|
0
|
|
|
|
|
|
my ($msg, $allmsgs, $anvlrec, $lineno, $name, $value, $pat, $n, $nmax); |
|
830
|
0
|
|
|
|
|
|
my (%rechash, $ne, $nemax, $elem_name); # for alt. element ordering |
|
831
|
0
|
|
|
|
|
|
my $r_elem_order = $$o{elem_order}; |
|
832
|
|
|
|
|
|
|
|
|
833
|
0
|
|
|
|
|
|
$s = $om->ostream(); # open stream |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
# This next line is a fast and compact (if cryptic) way to |
|
836
|
|
|
|
|
|
|
# accumulate $om->method calls. Used after each method call, it |
|
837
|
|
|
|
|
|
|
# concatenates strings or ANDs up print statuses, depending on the |
|
838
|
|
|
|
|
|
|
# outhandle setting. It makes several appearances in this routine. |
|
839
|
|
|
|
|
|
|
# |
|
840
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); # accumulate method returns |
|
|
|
|
0
|
|
|
|
|
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# Numbers: record, element in record, and start line |
|
843
|
|
|
|
|
|
|
# |
|
844
|
0
|
|
|
|
|
|
my ($startline, $recnum, $elemnum) = (1, 0, 0); |
|
845
|
0
|
|
|
|
|
|
my ($wslines, $rrlines); |
|
846
|
0
|
|
|
|
|
|
my $r_elems = $om->{elemsref}; # abbreviation |
|
847
|
|
|
|
|
|
|
# xxx is that reference kosher? |
|
848
|
|
|
|
|
|
|
|
|
849
|
0
|
|
|
|
|
|
while (1) { |
|
850
|
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
# Get an ANVL record and count lines therein. ANVL |
|
852
|
|
|
|
|
|
|
# records can come from anywhere, but typically from |
|
853
|
|
|
|
|
|
|
# a file (read in "paragraph" mode) or a BDB database. |
|
854
|
|
|
|
|
|
|
# |
|
855
|
0
|
|
|
|
|
|
$anvlrec = trimlines(&$get_anvl(), \$wslines, \$rrlines); |
|
856
|
0
|
|
|
|
|
|
$startline += $wslines; |
|
857
|
0
|
0
|
|
|
|
|
last unless $anvlrec; |
|
858
|
|
|
|
|
|
|
|
|
859
|
0
|
|
|
|
|
|
$recnum++; # increment record counter |
|
860
|
|
|
|
|
|
|
=for later |
|
861
|
|
|
|
|
|
|
# XXX anvl_recarray is expensive, do we _need_ to do it if the output is |
|
862
|
|
|
|
|
|
|
# also in anvl? Maybe call modified [2] here so "find" can work? |
|
863
|
|
|
|
|
|
|
if (ref($om) eq 'File::OM::ANVL' and ! $r_elem_order) { |
|
864
|
|
|
|
|
|
|
# xxx do quick expand (short->long erc) here? |
|
865
|
|
|
|
|
|
|
# xxx _will_ disturb input line numbering |
|
866
|
|
|
|
|
|
|
$$o{find} and ($anvlrec !~ /$$o{find}/m) and |
|
867
|
|
|
|
|
|
|
next; # no output has occurred |
|
868
|
|
|
|
|
|
|
# xxx do quick check for 'show' and next |
|
869
|
|
|
|
|
|
|
# XXXXXXXX must define lineno for verbose case |
|
870
|
|
|
|
|
|
|
$s = $om->anvl_rec($anvlrec, $startline, $rrlines); |
|
871
|
|
|
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
872
|
|
|
|
|
|
|
next; |
|
873
|
|
|
|
|
|
|
} |
|
874
|
|
|
|
|
|
|
=cut |
|
875
|
0
|
|
|
|
|
|
$msg = anvl_recarray($anvlrec, $r_elems, $startline, $o); |
|
876
|
0
|
0
|
|
|
|
|
$msg =~ /^error/ and return "anvl_recarray: $msg"; |
|
877
|
0
|
0
|
|
|
|
|
$msg eq "" or |
|
878
|
|
|
|
|
|
|
#print $msg, "\n"; |
|
879
|
|
|
|
|
|
|
#$o->{verbose} && print $msg, "\n"; |
|
880
|
|
|
|
|
|
|
$allmsgs .= $msg . "\n"; # save other message |
|
881
|
|
|
|
|
|
|
|
|
882
|
|
|
|
|
|
|
# NB: apply 'find' here before possible expansion, which |
|
883
|
|
|
|
|
|
|
# means that a pattern like "who:\s*smith" won't work on |
|
884
|
|
|
|
|
|
|
# on a short form ANVL record. |
|
885
|
|
|
|
|
|
|
# |
|
886
|
0
|
0
|
0
|
|
|
|
$$o{find} and ($anvlrec !~ /$$o{find}/m) and |
|
887
|
|
|
|
|
|
|
next; # no output has occurred |
|
888
|
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
# If caller has set $$o{elemsproc} to a code reference, |
|
890
|
|
|
|
|
|
|
# it is called to process the element array just returned |
|
891
|
|
|
|
|
|
|
# from anvl_recarray. Typically this is used to convert |
|
892
|
|
|
|
|
|
|
# (with erc_anvl_expand_array) short form ERCs to long |
|
893
|
|
|
|
|
|
|
# form ERCs. As an optimization, the code is not called |
|
894
|
|
|
|
|
|
|
# if $$o{elemsprocpat} (typically, "erc") is set and |
|
895
|
|
|
|
|
|
|
# doesn't match the raw record string. |
|
896
|
|
|
|
|
|
|
# |
|
897
|
0
|
0
|
0
|
|
|
|
if (ref($$o{elemsproc}) eq "CODE" and # if code and either |
|
|
|
|
0
|
|
|
|
|
|
898
|
|
|
|
|
|
|
(! ($pat = $$o{elemsprocpat})) # no pattern or |
|
899
|
|
|
|
|
|
|
|| $anvlrec =~ $pat) { # the pattern matches |
|
900
|
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
# [2] XXX can we call elemsproc directly on the $anvlrec? so we don't need |
|
902
|
|
|
|
|
|
|
# to call expensive anvl_recarray first? |
|
903
|
0
|
0
|
|
|
|
|
($msg = &{$$o{elemsproc}}($r_elems)) and |
|
|
0
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
return "File::ANVL::elemsproc: $msg"; |
|
905
|
|
|
|
|
|
|
} |
|
906
|
0
|
0
|
|
|
|
|
ref($om) eq 'File::OM::Turtle' and |
|
907
|
|
|
|
|
|
|
turtle_set_subject($om, $anvlrec); |
|
908
|
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
# The orec method is given first crack at a new record. |
|
910
|
|
|
|
|
|
|
# It sets and/or clears a number of values for keys (eg, |
|
911
|
|
|
|
|
|
|
# for turtle, $$o{subject}). $recnum is useful for |
|
912
|
|
|
|
|
|
|
# outputting json separators (eg, no comma if $recnum eq 1) |
|
913
|
|
|
|
|
|
|
# or record numbers in comments (eg, if $$o{verbose}). |
|
914
|
|
|
|
|
|
|
# $startline is useful for parser diagnostics (eg, "error |
|
915
|
|
|
|
|
|
|
# on line 5"). $r_elems and $r_elem_order are needed for |
|
916
|
|
|
|
|
|
|
# discovering what elements will populate CSV/PSV records. |
|
917
|
|
|
|
|
|
|
# |
|
918
|
|
|
|
|
|
|
# XXX this next isn't needed if output is anvl ?! (assuming final NL is |
|
919
|
|
|
|
|
|
|
# written when closing the record |
|
920
|
0
|
|
|
|
|
|
$s = $om->orec($recnum, $startline, $r_elems, $r_elem_order); |
|
921
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
|
0
|
|
|
|
|
|
922
|
|
|
|
|
|
|
|
|
923
|
0
|
0
|
|
|
|
|
if ($r_elem_order) { |
|
924
|
0
|
|
|
|
|
|
undef %rechash; # don't want prior indices |
|
925
|
0
|
0
|
|
|
|
|
($msg = anvl_arrayhash($r_elems, \%rechash)) and |
|
926
|
|
|
|
|
|
|
return "anvl_arrayhash: $msg"; |
|
927
|
0
|
|
|
|
|
|
$ne = -1; # index into $$r_elem_order |
|
928
|
0
|
|
|
|
|
|
$nemax = scalar @$r_elem_order; |
|
929
|
|
|
|
|
|
|
} else { |
|
930
|
0
|
0
|
|
|
|
|
$n = # index into $$r_elems |
|
931
|
|
|
|
|
|
|
# XXX don't reference r_elems if we haven't called anvl_recarray |
|
932
|
|
|
|
|
|
|
$$r_elems[2] # if a no-label value starts |
|
933
|
|
|
|
|
|
|
? -3 # rec, make sure to output it, |
|
934
|
|
|
|
|
|
|
: 0; # else skip it (normal) |
|
935
|
0
|
|
|
|
|
|
$nmax = scalar @$r_elems; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
# XXX if output is to anvl, can we not skip the entire loop below? but |
|
939
|
|
|
|
|
|
|
# not if it's possible to output anvl _and_ to care about element order |
|
940
|
|
|
|
|
|
|
# XXX but still perform $show check and skip not-shown elems |
|
941
|
|
|
|
|
|
|
# XXX and still perform value inversion if {invert} options |
|
942
|
0
|
|
|
|
|
|
$elemnum = 0; # true elements, not comments |
|
943
|
0
|
|
|
|
|
|
undef $name; |
|
944
|
0
|
|
|
|
|
|
while (1) { |
|
945
|
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
# Select next candidate element. If we need to |
|
947
|
|
|
|
|
|
|
# output elements in a certain order, consult the |
|
948
|
|
|
|
|
|
|
# hash; otherwise, just use "found" order. |
|
949
|
|
|
|
|
|
|
# |
|
950
|
0
|
0
|
|
|
|
|
if ($r_elem_order) { # use specified order |
|
951
|
|
|
|
|
|
|
|
|
952
|
0
|
|
|
|
|
|
$ne++; |
|
953
|
0
|
0
|
|
|
|
|
$ne >= $nemax and last; |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
# For CSV and PSV, the element name at this |
|
956
|
|
|
|
|
|
|
# position may be deliberately undefined, or |
|
957
|
|
|
|
|
|
|
# may correspond to a named element missing |
|
958
|
|
|
|
|
|
|
# in this record, in which case we skip it. |
|
959
|
|
|
|
|
|
|
# |
|
960
|
0
|
|
|
|
|
|
$elem_name = $r_elem_order->[$ne]; |
|
961
|
0
|
0
|
0
|
|
|
|
! defined($elem_name) || ! defined( |
|
962
|
|
|
|
|
|
|
#XXX ignore multiple instances for now |
|
963
|
|
|
|
|
|
|
$n = $rechash{$elem_name}->[0] |
|
964
|
|
|
|
|
|
|
) and |
|
965
|
|
|
|
|
|
|
# for CSV/PSV, output an empty element |
|
966
|
|
|
|
|
|
|
next; |
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
} else { # use natural array order |
|
969
|
0
|
|
|
|
|
|
$n += 3; |
|
970
|
0
|
0
|
|
|
|
|
$n >= $nmax and last; |
|
971
|
|
|
|
|
|
|
} |
|
972
|
|
|
|
|
|
|
# If we get here, $n is defined. |
|
973
|
|
|
|
|
|
|
|
|
974
|
0
|
|
|
|
|
|
$lineno = $$r_elems[$n]; |
|
975
|
0
|
0
|
|
|
|
|
$name = $n < 3 # for special first triple |
|
976
|
|
|
|
|
|
|
? '_' # use synthesized name '_' |
|
977
|
|
|
|
|
|
|
: $$r_elems[$n + 1]; # else real name |
|
978
|
0
|
|
0
|
|
|
|
$value = $$r_elems[$n + 2] || ""; |
|
979
|
|
|
|
|
|
|
|
|
980
|
0
|
0
|
|
|
|
|
$elemnum++ unless $name eq '#'; |
|
981
|
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
# Skip if 'show' given and not requested. |
|
983
|
0
|
0
|
0
|
|
|
|
$$o{show} and ("$name: $value" !~ /$$o{show}/m) and |
|
984
|
|
|
|
|
|
|
(undef $name), # cause elem to be skipped |
|
985
|
|
|
|
|
|
|
next; |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
# Instead of $om->oelem, $om->celem, $om->contelem, |
|
988
|
|
|
|
|
|
|
# combine open and close into one, but first |
|
989
|
|
|
|
|
|
|
# naturalize values if called upon. |
|
990
|
|
|
|
|
|
|
# |
|
991
|
0
|
0
|
0
|
|
|
|
$$o{invert} and $value =~ /,\s*$/ and |
|
992
|
|
|
|
|
|
|
$value = anvl_name_naturalize($value); |
|
993
|
|
|
|
|
|
|
} |
|
994
|
|
|
|
|
|
|
continue { |
|
995
|
0
|
|
|
|
|
|
$s = $om->elem($name, $value, $lineno); |
|
996
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
|
0
|
|
|
|
|
|
997
|
0
|
|
|
|
|
|
undef $name; # clean the slate |
|
998
|
|
|
|
|
|
|
} |
|
999
|
0
|
|
|
|
|
|
$s = $om->crec($recnum); |
|
1000
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
|
0
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
} |
|
1002
|
|
|
|
|
|
|
continue { |
|
1003
|
0
|
|
|
|
|
|
$startline += $rrlines; |
|
1004
|
|
|
|
|
|
|
} |
|
1005
|
|
|
|
|
|
|
# XXX currently doing nothing with $allmsgs warnings! |
|
1006
|
|
|
|
|
|
|
# should probably print if verbose mode on |
|
1007
|
0
|
|
|
|
|
|
$s = $om->cstream(); |
|
1008
|
0
|
0
|
0
|
|
|
|
$p and ($st &&= $s), 1 or ($st .= $s); |
|
|
|
|
0
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
|
|
1010
|
0
|
|
|
|
|
|
return $st; |
|
1011
|
|
|
|
|
|
|
} |
|
1012
|
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
# xxx document all om options |
|
1014
|
|
|
|
|
|
|
# xxx should om also have a recstring slot (for anvlrec)? |
|
1015
|
|
|
|
|
|
|
# xxx pass in turtle_nosubject (default)? |
|
1016
|
|
|
|
|
|
|
sub turtle_set_subject { |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
0
|
|
|
0
|
0
|
|
my ($om, $anvlrec) = (shift, shift); |
|
1019
|
0
|
|
|
|
|
|
my $r_elems = $om->{elemsref}; |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
# In order to find the subject element for Turtle/RDF |
|
1022
|
|
|
|
|
|
|
# assertions, we need an element name pattern. If one is |
|
1023
|
|
|
|
|
|
|
# defined in $om->{turtle_subjelpat}, use it. If it's undefined, |
|
1024
|
|
|
|
|
|
|
# per-record code will use 'where' if it thinks the record |
|
1025
|
|
|
|
|
|
|
# is an ERC, or use 'identifier|subject' as a last resort. |
|
1026
|
|
|
|
|
|
|
# If no element matching subjelpat is found, $om->{subject} |
|
1027
|
|
|
|
|
|
|
# will default to $om->{turtle_nosubject}. |
|
1028
|
|
|
|
|
|
|
# |
|
1029
|
0
|
|
0
|
|
|
|
my $subjpat = $om->{turtle_subjelpat} || |
|
1030
|
|
|
|
|
|
|
($anvlrec =~ /^erc\s*:/m |
|
1031
|
|
|
|
|
|
|
? "^where\$" : # 1st where in an 'erc', or |
|
1032
|
|
|
|
|
|
|
($anvlrec =~ /^(identifier|subject)\s*:/m |
|
1033
|
|
|
|
|
|
|
? "^$1\$" : # 1st identifier or subject, |
|
1034
|
|
|
|
|
|
|
($anvlrec =~ /^(.+)\s*:\s*(\n\s+)*\w/ |
|
1035
|
|
|
|
|
|
|
? "^$1\$" : # or 1st non-empty element |
|
1036
|
|
|
|
|
|
|
''))); # or nothing (always matches) |
|
1037
|
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
# Now find a 'subject' for our Turtle/RDF assertions. |
|
1039
|
|
|
|
|
|
|
# |
|
1040
|
0
|
|
|
|
|
|
my $j = 1; # element names in positions 1, 4, 7, ... |
|
1041
|
0
|
|
0
|
|
|
|
1 while ($j < $#$r_elems and # quickly find it |
|
|
|
|
0
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
@$r_elems[$j] !~ $subjpat and ($j += 3)); |
|
1043
|
0
|
0
|
0
|
|
|
|
$om->{subject} = $j < $#$r_elems && $subjpat ? # if found, |
|
1044
|
|
|
|
|
|
|
@$r_elems[$j + 1] : # use associated value |
|
1045
|
|
|
|
|
|
|
$om->{turtle_nosubject}; # else use default |
|
1046
|
0
|
|
|
|
|
|
return $om->{subject}; |
|
1047
|
|
|
|
|
|
|
} |
|
1048
|
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
1; |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
__END__ |