line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Decl::Parser;
|
2
|
|
|
|
|
|
|
|
3
|
12
|
|
|
12
|
|
125
|
use warnings;
|
|
12
|
|
|
|
|
24
|
|
|
12
|
|
|
|
|
382
|
|
4
|
12
|
|
|
12
|
|
68
|
use strict;
|
|
12
|
|
|
|
|
26
|
|
|
12
|
|
|
|
|
562
|
|
5
|
12
|
|
|
12
|
|
66
|
use Decl::Util;
|
|
12
|
|
|
|
|
26
|
|
|
12
|
|
|
|
|
1026
|
|
6
|
12
|
|
|
12
|
|
78
|
use Iterator::Simple qw(:all);
|
|
12
|
|
|
|
|
24
|
|
|
12
|
|
|
|
|
3588
|
|
7
|
12
|
|
|
12
|
|
77
|
use Data::Dumper;
|
|
12
|
|
|
|
|
23
|
|
|
12
|
|
|
|
|
570
|
|
8
|
12
|
|
|
12
|
|
26458
|
use Text::Balanced qw(extract_codeblock);
|
|
12
|
|
|
|
|
285572
|
|
|
12
|
|
|
|
|
56027
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 NAME
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
Decl::Parser - implements a parser to be defined using Decl::Semantics::Parse.
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 VERSION
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Version 0.01
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=cut
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
our $VERSION = '0.01';
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
The L<Decl::Semantics::Parse> module uses the structure of a "parse" tag to build a parser. The parser it builds, though,
|
26
|
|
|
|
|
|
|
is implemented using this class. And in fact this class also exposes a procedural API for building parsers, if you need to bypass the
|
27
|
|
|
|
|
|
|
parser builder. It's the work of but a moment to realize that you need to bypass the parser builder when building a parser to parse parser
|
28
|
|
|
|
|
|
|
specifications. (If you could parse I<that>, I'm impressed.)
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
The idea is to build this parser with as few external dependencies as possible. Then it might be useful outside the framework as well.
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
These parsers are based on those in Mark Jason Dominus' fantastic book I<Higher-Order Perl>. They consist of a tokenizer that is a chain
|
33
|
|
|
|
|
|
|
of lesser tokenizers, registered actions that can be carried out on intermediate parses, and rules that build structure from a sequence
|
34
|
|
|
|
|
|
|
of tokens.
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head2 new()
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
Instantiates a blank parser.
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=cut
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
sub new {
|
43
|
37
|
|
|
37
|
1
|
88
|
my $class = shift;
|
44
|
37
|
|
|
|
|
465
|
return bless {
|
45
|
|
|
|
|
|
|
tokenizers => [],
|
46
|
|
|
|
|
|
|
lexer => undef,
|
47
|
|
|
|
|
|
|
actions => {},
|
48
|
|
|
|
|
|
|
rules => {},
|
49
|
|
|
|
|
|
|
rulelist => [],
|
50
|
|
|
|
|
|
|
cmps => {},
|
51
|
|
|
|
|
|
|
parser => undef,
|
52
|
|
|
|
|
|
|
user => {}, # A place to stash action-specific data gleaned from input or ... wherever.
|
53
|
|
|
|
|
|
|
}, $class;
|
54
|
|
|
|
|
|
|
}
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 BUILDING THE PARSER
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
To build a parser, we add tokenizers, rules, and actions.
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 add_tokenizer()
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Each parser has a tokenizer, which is a list of atomic tokenizers consisting of regular expressions to examine incoming text and spit it back
|
63
|
|
|
|
|
|
|
out in categorized chunks of low-level meaning.
|
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Each atomic tokenizer consists of a label, a regex pattern, and an optional callback to be called to produce the token. Intervening text that
|
66
|
|
|
|
|
|
|
does not match the token's pattern is passed through unchanged, allowing later tokenizers in the chain to break that up.
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
The C<add_tokenizer> function just pushes an atomic tokenizer onto the list. Later, C<lexer> is called to tie those all together into a full
|
69
|
|
|
|
|
|
|
lexer.
|
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Possible extension: C<$pattern> could be a coderef instead of a string, to permit more flexibility in the design of tokenizers.
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=cut
|
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub add_tokenizer {
|
76
|
278
|
|
|
278
|
1
|
1409
|
my ($self, $label, $pattern, $handler) = @_;
|
77
|
|
|
|
|
|
|
|
78
|
278
|
100
|
|
1691
|
|
309
|
push @{$self->{tokenizers}}, [$label, $pattern, $handler ? $handler : sub { [ $_[1], $_[0] ] }];
|
|
278
|
|
|
|
|
1940
|
|
|
1691
|
|
|
|
|
10654
|
|
79
|
|
|
|
|
|
|
}
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=head2 action()
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
Adds a named action to the list of actions that can be integrated into the parser. Also used to retrieve a named action.
|
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
=cut
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
sub action {
|
88
|
4198
|
|
|
4198
|
1
|
7576
|
my ($self, $name, $action) = @_;
|
89
|
|
|
|
|
|
|
|
90
|
4198
|
100
|
|
|
|
9141
|
$self->{actions}->{$name} = $action if defined $action;
|
91
|
4198
|
|
|
|
|
22300
|
$self->{actions}->{$name};
|
92
|
|
|
|
|
|
|
}
|
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=head2 add_rule($name, $rule), get_rule($name), list_rules(), clear_rule($name);
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
The C<add_rule> function adds a rule. The rule is expressed in a sort of restricted Perl to assemble the available parser atoms into something useful.
|
97
|
|
|
|
|
|
|
Rule cross-references
|
98
|
|
|
|
|
|
|
can be indicated by enclosing the name of the rule in angle brackets <>; that will be substituted by a reference to a parser built with that rule. The purpose
|
99
|
|
|
|
|
|
|
of this API is to provide a simple but procedural way to assemble a basic parser - one that we can then use to parse our declarative structures.
|
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
The target Perl code again leans heavily on Dominus, with some extensions and simplifications to make things easier in our context.
|
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
Multiple rules added under the same name will be considered alternatives, and treated as such when the parser is built.
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
The C<clear_rule> function clears the information associated with a rule name. I'm not sure it will ever be used, but it just seems so easy that it would
|
106
|
|
|
|
|
|
|
be silly not to put it in here. It does I<not> delete the rule from the list of rules, so the rule's precedence (if any) will be unchanged.
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=cut
|
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub add_rule {
|
111
|
228
|
|
|
228
|
1
|
436
|
my ($self, $name, $rule) = @_;
|
112
|
228
|
|
|
|
|
537
|
$self->{rules}->{$name} = $rule;
|
113
|
228
|
|
|
|
|
484
|
$self->{cmps}->{$name} = $self->make_component('', '\¬hing');
|
114
|
228
|
50
|
|
|
|
476
|
push @{$self->{rulelist}}, $name unless grep { $_ eq $name} @{$self->{rulelist}};
|
|
228
|
|
|
|
|
1000
|
|
|
972
|
|
|
|
|
1869
|
|
|
228
|
|
|
|
|
529
|
|
115
|
|
|
|
|
|
|
}
|
116
|
2148
|
|
|
2148
|
1
|
3147
|
sub list_rules { @{$_[0]->{rulelist}} }
|
|
2148
|
|
|
|
|
13057
|
|
117
|
228
|
|
|
228
|
1
|
594
|
sub get_rule { $_[0]->{rules}->{$_[1]} }
|
118
|
|
|
|
|
|
|
sub clear_rule {
|
119
|
0
|
|
|
0
|
1
|
0
|
my ($self, $name) = @_;
|
120
|
0
|
|
|
|
|
0
|
$self->{rules}->{$name} = [];
|
121
|
0
|
|
|
|
|
0
|
$self->{cmps}->{$name} = $self->make_component('', '\¬hing');
|
122
|
|
|
|
|
|
|
}
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head1 USING THE PARSER
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head2 lexer($input), _t()
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
The C<lexer> function creates a lexer using the list of tokenizers already registered, using the input stream provided. The lexer is an iterator, with a peek function to
|
129
|
|
|
|
|
|
|
check the next token without consuming it. Tokens are arrayrefs or plain strings.
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
This leans heavily on Dominus.
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
Note that the input may itself be a token stream.
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
If called in a list context, returns the full list of tokens instead of an iterator. I hope that's what you wanted.
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
The C<_t> function does most of the heavy lifting, and *really* leans on Dominus. I've extended his lexer framework with two features: first, if a lexer
|
138
|
|
|
|
|
|
|
is simply passed a string as its input, it will still work, by creating a single-use iterator. Second, token labels that end in an asterisk are filtered
|
139
|
|
|
|
|
|
|
out of the final token string.
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Dominus's framework provides for suppression of tokens using the token building function (e.g. sub {''} to suppress whitespace in the outgoing token stream),
|
142
|
|
|
|
|
|
|
but there's a surprising problem with that approach - if the resulting stream is fed into the next atomic tokenizer in a chain, neighboring unparsed text
|
143
|
|
|
|
|
|
|
will be pushed back together! This is a natural result of the fact that blockwise reading of files needs to be supported without breaking tokens that span
|
144
|
|
|
|
|
|
|
block boundaries; the final tokenizer in the chain necessarily treats the output of earlier tokenizers like blocks.
|
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
But what if I want to tokenize into whitespace first, then, say, find all words starting with 't' and treat them as special tokens? OK, so this was a silly
|
147
|
|
|
|
|
|
|
test case, and yet it seems intuitively to be something like what I'd want to do in some situations. The naive approach is this:
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
parse t
|
150
|
|
|
|
|
|
|
tokens
|
151
|
|
|
|
|
|
|
WHITESPACE "\s+" { "" }
|
152
|
|
|
|
|
|
|
TWORDS "^t.*"
|
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
If I give that the string "this is a test string", I don't get five tokens, two of which are TWORDS. I get one TWORD token with the value
|
155
|
|
|
|
|
|
|
"thisisateststring". That is because by swallowing the "tokenicity" of the whitespace, we're actually just ignoring the whitespace.
|
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
Bad!
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
So instead, we put an asterisk on the whitespace specification, so that it will be suppressed I the tokenizing process is complete, that is, at
|
160
|
|
|
|
|
|
|
the end of the tokenizer chain. In the meantime, though, the whitespace tokens are still there to hold their place in the queue.
|
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
parse t
|
163
|
|
|
|
|
|
|
tokens
|
164
|
|
|
|
|
|
|
WHITESPACE* "\s+"
|
165
|
|
|
|
|
|
|
TWORDS "^t.*"
|
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=cut
|
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
sub lexer {
|
170
|
2100
|
|
|
2100
|
1
|
5730
|
my ($self, $input) = @_;
|
171
|
|
|
|
|
|
|
|
172
|
2100
|
100
|
|
|
|
4729
|
return $self->tokens($input) if wantarray;
|
173
|
|
|
|
|
|
|
|
174
|
2097
|
|
|
|
|
2545
|
my @tokenizers = @{$self->{tokenizers}};
|
|
2097
|
|
|
|
|
8115
|
|
175
|
2097
|
|
|
|
|
5156
|
while (@tokenizers) {
|
176
|
16409
|
|
|
|
|
156265
|
my $t = shift @tokenizers;
|
177
|
|
|
|
|
|
|
|
178
|
16409
|
100
|
|
|
|
40703
|
if ($t->[0] eq 'CODEBLOCK') {
|
179
|
997
|
|
100
|
|
|
2810
|
my $pattern = $t->[1] || "{}";
|
180
|
997
|
|
|
|
|
3830
|
my $prefix = "[^\\" . substr($pattern,0,1) . "]*";
|
181
|
997
|
100
|
66
|
1994
|
|
3148
|
$t->[1] = sub { my @r = eval { extract_codeblock ($_[0], $pattern, $prefix) }; defined $r[0] && $r[0] ne '' ? ($r[2], $r[0], $r[1]) : $_[0] } unless ref $pattern;
|
|
1994
|
100
|
|
|
|
2814
|
|
|
1994
|
|
|
|
|
6910
|
|
|
1994
|
|
|
|
|
171264
|
|
182
|
|
|
|
|
|
|
}
|
183
|
16409
|
|
|
|
|
32070
|
$input = _t($input, @$t);
|
184
|
|
|
|
|
|
|
}
|
185
|
|
|
|
|
|
|
ifilter $input, sub {
|
186
|
3939
|
100
|
|
3939
|
|
22433
|
return $_ unless ref $_;
|
187
|
1794
|
100
|
|
|
|
8096
|
return $_ unless $$_[0] =~ /\*$/; # Skip tokens whose labels end in *.
|
188
|
872
|
|
|
|
|
2392
|
return;
|
189
|
|
|
|
|
|
|
}
|
190
|
2097
|
|
|
|
|
31063
|
}
|
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub _t {
|
193
|
16409
|
|
|
16409
|
|
30926
|
my ($input, $label, $pattern, $handler) = @_;
|
194
|
16409
|
|
|
|
|
19693
|
my @tokens;
|
195
|
16409
|
|
|
|
|
20496
|
my $buf = '';
|
196
|
16409
|
100
|
|
|
|
35122
|
unless (ref $input) {
|
197
|
2097
|
|
|
|
|
8834
|
$input = iter ([$input]); # Make $input iterable if it's just a string.
|
198
|
|
|
|
|
|
|
}
|
199
|
16409
|
|
|
|
|
75357
|
my $split = $pattern;
|
200
|
16409
|
100
|
|
31069
|
|
62568
|
$split = sub { split /($pattern)/, $_[0] } unless ref $pattern;
|
|
31069
|
|
|
|
|
346715
|
|
201
|
|
|
|
|
|
|
iterator {
|
202
|
34886
|
|
100
|
34886
|
|
170343
|
while (@tokens == 0 && defined $buf) {
|
203
|
33063
|
|
|
|
|
64820
|
my $i = $input->();
|
204
|
|
|
|
|
|
|
|
205
|
33063
|
100
|
|
|
|
85682
|
if (ref $i) { # $i is itself a token!
|
206
|
6073
|
|
|
|
|
10725
|
my ($sep, $tok) = $split->($buf);
|
207
|
6073
|
100
|
|
|
|
15011
|
$tok = $handler->($tok, $label) if defined $tok;
|
208
|
6073
|
|
100
|
|
|
48374
|
push @tokens, grep defined && $_ ne "", $sep, $tok, $i;
|
209
|
6073
|
|
|
|
|
8111
|
$buf = "";
|
210
|
6073
|
|
|
|
|
11049
|
last;
|
211
|
|
|
|
|
|
|
}
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# $i is just a bunch of new text.
|
214
|
26990
|
100
|
|
|
|
62360
|
$buf .= $i if defined $i;
|
215
|
26990
|
|
|
|
|
50170
|
my @newtoks = $split->($buf);
|
216
|
26990
|
|
100
|
|
|
166155
|
while (@newtoks > 2 || @newtoks && ! defined $i) {
|
|
|
|
66
|
|
|
|
|
217
|
8131
|
|
|
|
|
14916
|
push @tokens, shift(@newtoks);
|
218
|
8131
|
100
|
|
|
|
37441
|
push @tokens, $handler->(shift(@newtoks), $label) if @newtoks;
|
219
|
|
|
|
|
|
|
}
|
220
|
26990
|
|
|
|
|
54690
|
$buf = join '', @newtoks;
|
221
|
26990
|
100
|
|
|
|
55917
|
undef $buf if ! defined $i;
|
222
|
26990
|
|
|
|
|
151980
|
@tokens = grep $_ ne "", @tokens;
|
223
|
|
|
|
|
|
|
}
|
224
|
34886
|
50
|
33
|
|
|
126878
|
return (defined $_[0] and $_[0] eq 'peek') ? $tokens[0] : shift (@tokens);
|
225
|
|
|
|
|
|
|
}
|
226
|
16409
|
|
|
|
|
109682
|
}
|
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head2 tokens($input)
|
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
If you know you've got a limited number of tokens and just want to grab the whole list, use C<tokens>, which just returns a list.
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=cut
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub tokens {
|
236
|
4
|
|
|
4
|
1
|
952
|
my ($self, $input) = @_;
|
237
|
4
|
|
|
|
|
16
|
my $lexer = $self->lexer ($input);
|
238
|
4
|
|
|
|
|
102
|
my @list = ();
|
239
|
4
|
|
|
|
|
12
|
while (defined (my $t = $lexer->())) {
|
240
|
33
|
|
|
|
|
466
|
push @list, $t;
|
241
|
|
|
|
|
|
|
}
|
242
|
4
|
|
|
|
|
231
|
return @list;
|
243
|
|
|
|
|
|
|
}
|
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head2 tokenstream($input)
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Finally, if you need a lazily evaluated stream for your token output (and hey, who doesn't?) call tokenstream. (Note: you'll want a stream
|
248
|
|
|
|
|
|
|
if you're passing your lexer to a recursive-descent parser as below, because you need to be able to unwind the stream if one of your rules doesn't
|
249
|
|
|
|
|
|
|
match.)
|
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=cut
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
sub tokenstream {
|
254
|
2092
|
|
|
2092
|
1
|
18506
|
my ($self, $input) = @_;
|
255
|
2092
|
|
|
|
|
5427
|
my $lexer = $self->lexer ($input);
|
256
|
2092
|
|
|
|
|
62270
|
lazyiter ($lexer);
|
257
|
|
|
|
|
|
|
}
|
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=head2 PARSER COMPONENTS: parser, nothing, anything, end_of_input, token, token_silent, literal, word, p_and, p_or, series, one_or_more, list_of, optional, debug, debug_next_token
|
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
These are not methods; they're functions. They are the little subparsers that we hack together to make a full parser. The output of each of these
|
262
|
|
|
|
|
|
|
parsers is an arrayref containing a flat list of tokens it has matched in the token stream it's given as input. Each token is itself an arrayref of
|
263
|
|
|
|
|
|
|
two parts (a cons pair), with the first being the type, and second the token value. Bare words surviving the lexer are converted into individual
|
264
|
|
|
|
|
|
|
tokens of type '' (empty string), allowing tokens to be treated uniformly.
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=cut
|
267
|
|
|
|
|
|
|
|
268
|
1577
|
|
|
1577
|
1
|
12968
|
sub parser (&) { $_[0] }
|
269
|
|
|
|
|
|
|
sub nothing {
|
270
|
9233
|
|
|
9233
|
1
|
12553
|
my $input = shift;
|
271
|
9233
|
|
|
|
|
28389
|
return (undef, $input);
|
272
|
|
|
|
|
|
|
}
|
273
|
|
|
|
|
|
|
sub debug {
|
274
|
0
|
|
|
0
|
1
|
0
|
my $message = shift;
|
275
|
0
|
0
|
|
|
|
0
|
return \¬hing unless $message;
|
276
|
|
|
|
|
|
|
my $parser = parser {
|
277
|
0
|
|
|
0
|
|
0
|
my $input = shift;
|
278
|
0
|
|
|
|
|
0
|
print STDERR $message;
|
279
|
0
|
|
|
|
|
0
|
return (undef, $input);
|
280
|
|
|
|
|
|
|
}
|
281
|
0
|
|
|
|
|
0
|
}
|
282
|
|
|
|
|
|
|
sub debug_next_token {
|
283
|
0
|
|
|
0
|
1
|
0
|
my $input = shift;
|
284
|
0
|
|
|
|
|
0
|
print STDERR "at this point the input stream is:\n" . Dumper($input);
|
285
|
0
|
0
|
|
|
|
0
|
if (not defined $input) {
|
286
|
0
|
|
|
|
|
0
|
print STDERR "no more tokens\n";
|
287
|
|
|
|
|
|
|
} else {
|
288
|
0
|
|
|
|
|
0
|
my $next = car($input);
|
289
|
0
|
0
|
|
|
|
0
|
if (not defined $next) {
|
290
|
0
|
|
|
|
|
0
|
print STDERR "car(input) is not defined\n";
|
291
|
|
|
|
|
|
|
} else {
|
292
|
0
|
|
0
|
|
|
0
|
my $carn = car($next) || '';
|
293
|
0
|
|
0
|
|
|
0
|
my $cdrn = cdr($next) || '';
|
294
|
0
|
|
|
|
|
0
|
print STDERR "next token: ['$carn', '$cdrn']\n";
|
295
|
|
|
|
|
|
|
}
|
296
|
|
|
|
|
|
|
}
|
297
|
0
|
|
|
|
|
0
|
return (undef, $input);
|
298
|
|
|
|
|
|
|
}
|
299
|
|
|
|
|
|
|
sub end_of_input {
|
300
|
991
|
|
|
991
|
1
|
1713
|
my $input = shift;
|
301
|
991
|
100
|
|
|
|
4048
|
defined($input) ? () : (undef, undef);
|
302
|
|
|
|
|
|
|
}
|
303
|
|
|
|
|
|
|
sub token {
|
304
|
102
|
|
|
102
|
1
|
279
|
my $wanted = shift;
|
305
|
102
|
100
|
|
|
|
280
|
$wanted = [$wanted] unless ref $wanted;
|
306
|
|
|
|
|
|
|
my $parser = parser {
|
307
|
4820
|
|
|
4820
|
|
6759
|
my $input = shift;
|
308
|
4820
|
100
|
|
|
|
117792
|
return unless defined $input;
|
309
|
284
|
|
|
|
|
829
|
my $next = car($input);
|
310
|
284
|
50
|
|
|
|
801
|
return unless defined $next;
|
311
|
284
|
100
|
|
|
|
4771
|
return unless ref $next;
|
312
|
275
|
|
|
|
|
782
|
for my $i (0 .. $#$wanted) {
|
313
|
275
|
50
|
|
|
|
897
|
next unless defined $wanted->[$i];
|
314
|
275
|
100
|
|
|
|
3792
|
return unless $wanted->[$i] eq $next->[$i];
|
315
|
|
|
|
|
|
|
}
|
316
|
215
|
50
|
|
|
|
644
|
$next = ['', $next] unless ref $next;
|
317
|
215
|
|
|
|
|
598
|
return ($next, cdr($input));
|
318
|
102
|
|
|
|
|
611
|
};
|
319
|
|
|
|
|
|
|
|
320
|
102
|
|
|
|
|
1253
|
return $parser;
|
321
|
|
|
|
|
|
|
}
|
322
|
|
|
|
|
|
|
sub token_silent {
|
323
|
230
|
|
|
230
|
1
|
362
|
my $wanted = shift;
|
324
|
230
|
100
|
|
|
|
592
|
$wanted = [$wanted] unless ref $wanted;
|
325
|
|
|
|
|
|
|
my $parser = parser {
|
326
|
4349
|
|
|
4349
|
|
5975
|
my $input = shift;
|
327
|
4349
|
100
|
|
|
|
100086
|
return unless defined $input;
|
328
|
1224
|
|
|
|
|
3344
|
my $next = car($input);
|
329
|
1224
|
50
|
|
|
|
3165
|
return unless defined $next;
|
330
|
1224
|
100
|
|
|
|
3397
|
return unless ref $next;
|
331
|
1216
|
|
|
|
|
3195
|
for my $i (0 .. $#$wanted) {
|
332
|
1216
|
50
|
|
|
|
3052
|
next unless defined $wanted->[$i];
|
333
|
1216
|
100
|
|
|
|
17513
|
return unless $wanted->[$i] eq $next->[$i];
|
334
|
|
|
|
|
|
|
}
|
335
|
693
|
50
|
|
|
|
1861
|
$next = ['', $next] unless ref $next;
|
336
|
693
|
|
|
|
|
1910
|
return (undef, cdr($input));
|
337
|
230
|
|
|
|
|
1038
|
};
|
338
|
|
|
|
|
|
|
|
339
|
230
|
|
|
|
|
876
|
return $parser;
|
340
|
|
|
|
|
|
|
}
|
341
|
|
|
|
|
|
|
sub literal {
|
342
|
7
|
|
|
7
|
1
|
31
|
my $wanted = shift;
|
343
|
|
|
|
|
|
|
my $parser = parser {
|
344
|
33
|
|
|
33
|
|
861
|
my $input = shift;
|
345
|
33
|
100
|
|
|
|
84
|
return unless defined $input;
|
346
|
29
|
|
|
|
|
63
|
my $next = car($input);
|
347
|
29
|
50
|
|
|
|
65
|
return unless defined $next;
|
348
|
29
|
|
|
|
|
28
|
my $value;
|
349
|
29
|
100
|
|
|
|
51
|
if (ref $next) {
|
350
|
24
|
|
|
|
|
46
|
$value = $next->[1];
|
351
|
|
|
|
|
|
|
} else {
|
352
|
5
|
|
|
|
|
10
|
$value = $next;
|
353
|
|
|
|
|
|
|
}
|
354
|
29
|
100
|
|
|
|
90
|
return unless $value eq $wanted;
|
355
|
26
|
100
|
|
|
|
56
|
$next = ['', $next] unless ref $next;
|
356
|
26
|
|
|
|
|
56
|
return ($next, cdr($input));
|
357
|
7
|
|
|
|
|
29
|
};
|
358
|
|
|
|
|
|
|
|
359
|
7
|
|
|
|
|
20
|
return $parser;
|
360
|
|
|
|
|
|
|
}
|
361
|
|
|
|
|
|
|
sub word { # Need this for undecorated, non-token text.
|
362
|
5021
|
|
|
5021
|
1
|
5980
|
my $input = shift;
|
363
|
5021
|
100
|
|
|
|
78711
|
return unless defined $input;
|
364
|
2569
|
|
|
|
|
7038
|
my $next = car($input);
|
365
|
2569
|
50
|
|
|
|
5970
|
return unless defined $next;
|
366
|
2569
|
100
|
|
|
|
12821
|
return if ref $next;
|
367
|
2163
|
|
|
|
|
8868
|
return (['', $next], cdr($input));
|
368
|
|
|
|
|
|
|
}
|
369
|
|
|
|
|
|
|
sub anything {
|
370
|
0
|
|
|
0
|
1
|
0
|
my $input = shift;
|
371
|
0
|
0
|
|
|
|
0
|
return unless defined $input;
|
372
|
0
|
|
|
|
|
0
|
my $next = car($input);
|
373
|
0
|
0
|
|
|
|
0
|
return unless defined $next;
|
374
|
0
|
0
|
|
|
|
0
|
return ($next, cdr($input)) if ref $next;
|
375
|
0
|
|
|
|
|
0
|
return (['', $next], cdr($input));
|
376
|
|
|
|
|
|
|
}
|
377
|
|
|
|
|
|
|
sub p_and {
|
378
|
438
|
|
|
438
|
1
|
6114
|
my @p = @_;
|
379
|
438
|
50
|
|
|
|
956
|
return \¬hing if @p == 0;
|
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
my $parser = parser {
|
382
|
18643
|
|
|
18643
|
|
28510
|
my $input = shift;
|
383
|
18643
|
|
|
|
|
19513
|
my $v;
|
384
|
|
|
|
|
|
|
my @values;
|
385
|
18643
|
|
|
|
|
29434
|
for (@p) {
|
386
|
29863
|
100
|
|
|
|
260265
|
($v, $input) = $_->($input) or return;
|
387
|
15415
|
100
|
|
|
|
52072
|
if (ref car($v)) {
|
388
|
1253
|
|
|
|
|
2554
|
foreach (@$v) {
|
389
|
3680
|
50
|
|
|
|
10913
|
push @values, $_ if defined $v;
|
390
|
|
|
|
|
|
|
}
|
391
|
|
|
|
|
|
|
} else {
|
392
|
14162
|
100
|
|
|
|
45470
|
push @values, $v if defined $v;
|
393
|
|
|
|
|
|
|
}
|
394
|
|
|
|
|
|
|
}
|
395
|
4195
|
|
|
|
|
77569
|
return (\@values, $input);
|
396
|
|
|
|
|
|
|
}
|
397
|
438
|
|
|
|
|
2174
|
}
|
398
|
|
|
|
|
|
|
sub p_or {
|
399
|
268
|
|
|
268
|
1
|
1503
|
my @p = @_;
|
400
|
268
|
50
|
|
0
|
|
587
|
return parser { return () } if @p == 0;
|
|
0
|
|
|
|
|
0
|
|
401
|
268
|
50
|
|
|
|
519
|
return $p[0] if @p == 1;
|
402
|
|
|
|
|
|
|
my $parser = parser {
|
403
|
13467
|
|
|
13467
|
|
22345
|
my $input = shift;
|
404
|
13467
|
|
|
|
|
13757
|
my ($v, $newinput);
|
405
|
13467
|
|
|
|
|
20827
|
for (@p) {
|
406
|
24590
|
100
|
|
|
|
89517
|
if (($v, $newinput) = $_->($input)) {
|
407
|
12692
|
|
|
|
|
72228
|
return ($v, $newinput);
|
408
|
|
|
|
|
|
|
}
|
409
|
|
|
|
|
|
|
}
|
410
|
775
|
|
|
|
|
4089
|
return;
|
411
|
|
|
|
|
|
|
}
|
412
|
268
|
|
|
|
|
1287
|
}
|
413
|
|
|
|
|
|
|
sub series { # TODO: long series (like, oh, series of lines in a parsed body of over 150 lines or so) generate deep recursion warnings.
|
414
|
|
|
|
|
|
|
# So this is elegant - but not a good solution. Instead, we should collect matches until one doesn't match, i.e.
|
415
|
|
|
|
|
|
|
# make "series" a primary parser instead of relying on and/or.
|
416
|
98
|
|
|
98
|
1
|
1246
|
my $p = shift;
|
417
|
98
|
|
|
|
|
104
|
my $p_star;
|
418
|
98
|
|
|
1376
|
|
387
|
$p_star = p_or(p_and($p, parser {$p_star->(@_) }), \¬hing);
|
|
1376
|
|
|
|
|
3783
|
|
419
|
|
|
|
|
|
|
}
|
420
|
|
|
|
|
|
|
sub one_or_more {
|
421
|
36
|
|
|
36
|
1
|
59
|
my $p = shift;
|
422
|
36
|
|
|
|
|
87
|
p_and ($p, series($p));
|
423
|
|
|
|
|
|
|
}
|
424
|
|
|
|
|
|
|
sub list_of {
|
425
|
39
|
|
|
39
|
1
|
73
|
my ($element, $separator) = @_;
|
426
|
39
|
100
|
66
|
|
|
287
|
if (defined $separator and not ref $separator) {
|
427
|
38
|
100
|
|
|
|
171
|
if ($separator =~ /\*$/) {
|
428
|
37
|
|
|
|
|
131
|
$separator =~ s/\*$//;
|
429
|
37
|
|
|
|
|
97
|
$separator = token_silent($separator);
|
430
|
|
|
|
|
|
|
} else {
|
431
|
1
|
|
|
|
|
5
|
$separator = token ($separator);
|
432
|
|
|
|
|
|
|
}
|
433
|
|
|
|
|
|
|
}
|
434
|
39
|
50
|
|
|
|
138
|
$separator = token($separator) if ref $separator eq 'ARRAY';
|
435
|
39
|
100
|
|
|
|
113
|
$separator = token_silent('COMMA') unless defined $separator;
|
436
|
39
|
|
|
|
|
83
|
return p_and($element, series(p_and ($separator, $element)));
|
437
|
|
|
|
|
|
|
}
|
438
|
97
|
|
|
97
|
1
|
189
|
sub optional { p_or (p_and (@_), \¬hing) }
|
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=head2 build(), make_component($name, $spec), get_parser($name), parse($input), execute($defined input)
|
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
The C<build> function takes the rules that have been added to the parser, and builds the actual parser using C<make_component>, which is also available for
|
443
|
|
|
|
|
|
|
external use. The C<build> function runs in the context of the parser itself and uses C<make_component> to build its parser. Each parser built with C<build>
|
444
|
|
|
|
|
|
|
or C<make_component> is named. Its output, if it matches, is a two-part arrayref, with the first element being its name and the second the arrayref list of
|
445
|
|
|
|
|
|
|
tokens or subvalues that it matched.
|
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
An anonymous parser (name '' or undef) just returns the list of tokens, without the level of structure. The same applies to any name ending in an asterisk.
|
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
This should probably be covered in more detail in the tutorial, but the principle used here is that of the recursive-descent parser. A recursive-descent
|
450
|
|
|
|
|
|
|
parser can effectively be constructed as a series of little parsers that are glued together by combination functions. Each of these parsers consumes a series
|
451
|
|
|
|
|
|
|
of tokens, and returns a value; the default value is an arrayref (a pair, if you're Pythonic) consisting of the name or tag of the parser, followed
|
452
|
|
|
|
|
|
|
by the list of tokens consumed. The sum total of all those arrayrefs is an abstract syntax tree for the expression being parsed.
|
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
When a parser is invoked in a macro context, that syntax tree is converted into a structure of Decl::Node objects (a nodal structure), with or
|
455
|
|
|
|
|
|
|
without subclass decoration, depending on where the macro is expanded. But when we call a parser from Perl directly, we get the arrayrefs.
|
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
By defining actions and invoking them during the parse, we can also modify that structure as it's being built, or even build something else entirely, like
|
458
|
|
|
|
|
|
|
a numeric result of a calculation or perhaps some callable code. This is still pretty hand-wavy, as I still haven't got my head around actual applications.
|
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
At any rate, the rule specifications passed to C<add_rule> are pretty straightforward:
|
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
C matches a token by that name.
|
463
|
|
|
|
|
|
|
C matches a specific token.
|
464
|
|
|
|
|
|
|
C matches either a token by text, or a bare word. It converts the bare word to ['', 'word'].
|
465
|
|
|
|
|
|
|
C matches a bare word using a regex. If the regex has parentheses in it, the output value may be one or more tokens with the contents.
|
466
|
|
|
|
|
|
|
C<< <rule> >> matches a named parser rule, and expands to C<$eta_parser> in order to permit self-reference. (See Dominus Chapter 8.)
|
467
|
|
|
|
|
|
|
C<\¬hing> is the null parser, used to build complex rules.
|
468
|
|
|
|
|
|
|
C<\&anything> is the universal token, used to match things like comments.
|
469
|
|
|
|
|
|
|
C<\&end_of_input> is the end of input.
|
470
|
|
|
|
|
|
|
C<\&word> is any bare word (non-token text). It also converts the bare word to ['', 'word'].
|
471
|
|
|
|
|
|
|
C<p_or> is Dominus's "alternate" function, because I don't like to type that much.
|
472
|
|
|
|
|
|
|
C<p_and> is Dominus's "concatenate" function, for the same reason.
|
473
|
|
|
|
|
|
|
C<series> is just a function that matches a series of whatever it's called on.
|
474
|
|
|
|
|
|
|
C<list_of> is a function. It matches a delimited series of its first argument, delimited by tokens of its second argument. If omitted, the delimiter is COMMA.
|
475
|
|
|
|
|
|
|
C<optional> is a function that matches either its contents or nothing.
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
Note that the only code-munging done here is reference to other rules. It's difficult for me to avoid code generation because it's so fun, but since parser
|
478
|
|
|
|
|
|
|
specifications are supposed to be pretty general code, it's really not safe.
|
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
The order of addition of rules determines the order they'll be processed in. When the parser is built, it will check for consistency and dangling rule
|
481
|
|
|
|
|
|
|
references (i.e. rules you mention but don't define), perform the eta expansions needed for self-reference, and build all the subparsers.
|
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
=cut
|
484
|
|
|
|
|
|
|
sub make_component {
|
485
|
234
|
|
|
234
|
1
|
8456
|
my ($self, $name, $code) = @_;
|
486
|
234
|
|
|
|
|
273
|
my $parser;
|
487
|
|
|
|
|
|
|
|
488
|
234
|
|
|
|
|
666
|
while ($code =~ /<(\w+)>/) {
|
489
|
0
|
|
|
|
|
0
|
my $pref = $1;
|
490
|
0
|
0
|
|
|
|
0
|
$self->{cmps}->{$pref} = $self->make_component('', '\¬hing') unless $self->{cmps}->{$pref};
|
491
|
0
|
|
|
|
|
0
|
$code =~ s/<$pref>/parser { \$pref->(\@_) }/g;
|
492
|
|
|
|
|
|
|
}
|
493
|
234
|
|
|
|
|
24057
|
$parser = eval ($code);
|
494
|
234
|
50
|
|
|
|
1015
|
warn "make_component: $@\n>>> $code" if $@;
|
495
|
234
|
100
|
|
|
|
1146
|
return $parser unless $name;
|
496
|
|
|
|
|
|
|
parser {
|
497
|
2
|
|
|
2
|
|
11
|
my $input = shift;
|
498
|
2
|
|
|
|
|
3
|
my $v;
|
499
|
2
|
100
|
|
|
|
6
|
($v, $input) = $parser->($input) or return;
|
500
|
1
|
|
|
|
|
34
|
[$name, $v];
|
501
|
|
|
|
|
|
|
}
|
502
|
2
|
|
|
|
|
14
|
}
|
503
|
2080
|
|
|
2080
|
1
|
15349
|
sub get_parser { $_[0]->{cmps}->{$_[1]} }
|
504
|
|
|
|
|
|
|
sub build {
|
505
|
36
|
|
|
36
|
1
|
69
|
my ($self) = @_;
|
506
|
36
|
|
|
|
|
80
|
$self->{cmps} = {}; # Start from scratch on every build, of course.
|
507
|
|
|
|
|
|
|
|
508
|
36
|
|
|
|
|
109
|
my $code = "sub {\n";
|
509
|
36
|
|
|
|
|
114
|
foreach my $name ($self->list_rules()) {
|
510
|
228
|
|
|
|
|
485
|
$code .= "my (\$p__$name, \$p__${name}_anon);\n";
|
511
|
|
|
|
|
|
|
}
|
512
|
36
|
|
|
|
|
112
|
foreach my $name ($self->list_rules()) {
|
513
|
|
|
|
|
|
|
#$self->{cmps}->{$name} = $self->make_component($name, $self->get_rule($name));
|
514
|
228
|
|
|
|
|
511
|
my $rule = $self->get_rule($name);
|
515
|
228
|
|
|
|
|
775
|
while ($rule =~ /<(\w+)>/) {
|
516
|
204
|
|
|
|
|
355
|
my $pref = $1;
|
517
|
204
|
|
|
|
|
3089
|
$rule =~ s/<$pref>/parser { \$p__$pref->(\@_) }/g;
|
518
|
|
|
|
|
|
|
}
|
519
|
|
|
|
|
|
|
|
520
|
228
|
|
|
|
|
487
|
$code .= "\n\$p__${name}_anon = $rule;\n";
|
521
|
228
|
|
|
|
|
343
|
$code .= "\$p__$name = parser {\n";
|
522
|
228
|
|
|
|
|
274
|
$code .= " my \$input = shift;\n";
|
523
|
228
|
|
|
|
|
263
|
$code .= " my \$v;\n";
|
524
|
|
|
|
|
|
|
#$code .= " print STDERR \"Calling parser $name\\n\";\n";
|
525
|
228
|
|
|
|
|
339
|
$code .= " (\$v, \$input) = \$p__${name}_anon->(\$input) or return;\n";
|
526
|
|
|
|
|
|
|
#$code .= " print STDERR \"Parser $name succeeded\\n\";\n";
|
527
|
228
|
|
|
|
|
438
|
$code .= " (['$name', \$v], \$input);\n";
|
528
|
228
|
|
|
|
|
240
|
$code .= "};\n";
|
529
|
228
|
|
|
|
|
600
|
$code .= "\$self->{cmps}->{'$name'} = \$p__$name;\n";
|
530
|
|
|
|
|
|
|
}
|
531
|
36
|
|
|
|
|
80
|
$code .= "}\n";
|
532
|
|
|
|
|
|
|
#print STDERR $code;
|
533
|
36
|
|
|
|
|
29636
|
my $builder = eval $code;
|
534
|
36
|
50
|
|
|
|
148
|
warn "building: $@" if $@;
|
535
|
36
|
|
|
|
|
827
|
$self->{parser} = $builder->();
|
536
|
|
|
|
|
|
|
}
|
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
sub parse {
|
539
|
2076
|
|
|
2076
|
1
|
15776
|
my ($self, $input) = @_;
|
540
|
2076
|
50
|
|
|
|
8337
|
$input = $self->tokenstream($input) unless ref $input eq 'ARRAY';
|
541
|
2076
|
|
|
|
|
7108
|
my @rules = $self->list_rules();
|
542
|
2076
|
|
|
|
|
6830
|
my $first = $self->get_parser($rules[0]);
|
543
|
2076
|
|
|
|
|
78269
|
my ($output, $remainder) = $first->($input);
|
544
|
2076
|
|
|
|
|
9770
|
return $output;
|
545
|
|
|
|
|
|
|
}
|
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
sub execute {
|
548
|
2069
|
|
|
2069
|
1
|
3303
|
my ($self) = @_;
|
549
|
2069
|
|
100
|
320
|
|
5500
|
my $input_builder = $self->action('input') || sub { $_[1] };
|
|
320
|
|
|
|
|
1383
|
|
550
|
2069
|
|
|
|
|
7220
|
my $parse_result = $self->parse($input_builder->(@_));
|
551
|
2069
|
|
50
|
0
|
|
6544
|
my $output_builder = $self->action('output') || sub { $_[0] };
|
|
0
|
|
|
|
|
|
|
552
|
2069
|
|
|
|
|
8065
|
$output_builder->($parse_result, @_);
|
553
|
|
|
|
|
|
|
}
|
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=head1 AUTHOR
|
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
Michael Roberts, C<< >>
|
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
=head1 BUGS
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-decl at rt.cpan.org>, or through
|
562
|
|
|
|
|
|
|
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Decl>. I will be notified, and then you'll
|
563
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes.
|
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT
|
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
Copyright 2010 Michael Roberts.
|
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
570
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published
|
571
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License.
|
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information.
|
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=cut
|
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
1; # End of Decl::Parser
|