| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
##-*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
## File: DDC::::yyqlexer.pm |
|
4
|
|
|
|
|
|
|
## Author: Bryan Jurish |
|
5
|
|
|
|
|
|
|
## Description: |
|
6
|
|
|
|
|
|
|
## + lexer for ddc queries (formerly DDC::Query::yylexer) |
|
7
|
|
|
|
|
|
|
## + last updated for ddc v2.1.15 |
|
8
|
|
|
|
|
|
|
##====================================================================== |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
package DDC::PP::yyqlexer; |
|
11
|
20
|
|
|
20
|
|
603
|
use 5.010; ##-- we need at least v5.10.0 for /p regex modifier |
|
|
20
|
|
|
|
|
76
|
|
|
12
|
20
|
|
|
20
|
|
117
|
use Encode qw(encode_utf8 decode_utf8); |
|
|
20
|
|
|
|
|
40
|
|
|
|
20
|
|
|
|
|
1086
|
|
|
13
|
20
|
|
|
20
|
|
121
|
use Carp; |
|
|
20
|
|
|
|
|
39
|
|
|
|
20
|
|
|
|
|
1222
|
|
|
14
|
20
|
|
|
20
|
|
136
|
use IO::File; |
|
|
20
|
|
|
|
|
43
|
|
|
|
20
|
|
|
|
|
2954
|
|
|
15
|
20
|
|
|
20
|
|
153
|
use IO::Handle; |
|
|
20
|
|
|
|
|
41
|
|
|
|
20
|
|
|
|
|
833
|
|
|
16
|
20
|
|
|
20
|
|
128
|
use strict; |
|
|
20
|
|
|
|
|
51
|
|
|
|
20
|
|
|
|
|
7378
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##====================================================================== |
|
19
|
|
|
|
|
|
|
## Globals etc. |
|
20
|
|
|
|
|
|
|
our @ISA = qw(); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
23
|
|
|
|
|
|
|
## Globals: regexes for Parse::Lex lexer token regexes |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
## %DEF |
|
26
|
|
|
|
|
|
|
## + common shared regex definitions |
|
27
|
|
|
|
|
|
|
our (%DEF); |
|
28
|
|
|
|
|
|
|
BEGIN { |
|
29
|
|
|
|
|
|
|
##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\') |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
##-- whitespace |
|
32
|
20
|
|
|
20
|
|
96
|
$DEF{ws} = '[ \t\n\r\f\x0b]'; |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
##-- integer boundary characters |
|
35
|
20
|
|
|
|
|
48
|
$DEF{int_boundary} = '[ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]'; |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
##-- bareword symbols |
|
38
|
20
|
|
|
|
|
50
|
$DEF{symbol_cfirst} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]'; |
|
39
|
20
|
|
|
|
|
50
|
$DEF{symbol_crest} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]'; |
|
40
|
20
|
|
|
|
|
54
|
$DEF{symbol_cescape} = '(?:\\\\.)'; |
|
41
|
20
|
|
|
|
|
141
|
$DEF{symbol_text} = "(?:$DEF{symbol_cescape}|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}|$DEF{symbol_crest})*"; |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here) |
|
44
|
20
|
|
|
|
|
46
|
$DEF{corpus_cfirst} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"$@]'; |
|
45
|
20
|
|
|
|
|
51
|
$DEF{corpus_crest} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"]'; |
|
46
|
20
|
|
|
|
|
78
|
$DEF{corpus_text} = "(?:$DEF{symbol_cescape}|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}|$DEF{corpus_crest})*"; |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
##-- bareword index names (underscore and digits ok, but no '.', '-', or '+') |
|
49
|
20
|
|
|
|
|
67
|
$DEF{index_char} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]'; |
|
50
|
20
|
|
|
|
|
65
|
$DEF{index_name} = "(?:$DEF{index_char}|$DEF{symbol_cescape})+"; |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
##-- single-quoted symbols |
|
53
|
20
|
|
|
|
|
49
|
$DEF{sq_text} = "(?:[^\']|$DEF{symbol_cescape})*"; |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]}) |
|
56
|
20
|
|
|
|
|
47
|
$DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}"; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
##-- regexes |
|
59
|
20
|
|
|
|
|
46
|
$DEF{regex_text} = "(?:(?:\\\\.)|[^\\\\/])*"; |
|
60
|
20
|
|
|
|
|
34
|
$DEF{regex_modifier} = '[dgimsx]'; |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
##-- comments |
|
63
|
20
|
|
|
|
|
77
|
$DEF{comment_text} = "(?:\\\\.|[^\\]])*"; |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
##-- compile patterns |
|
66
|
20
|
|
|
|
|
218
|
foreach (keys %DEF) { |
|
67
|
|
|
|
|
|
|
#print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n"; |
|
68
|
320
|
|
|
|
|
12096
|
$DEF{$_} = qr/$DEF{$_}/; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
##====================================================================== |
|
73
|
|
|
|
|
|
|
## $lex = $CLASS_OR_OBJ->new(%args) |
|
74
|
|
|
|
|
|
|
## + abstract constructor |
|
75
|
|
|
|
|
|
|
## + %$lex, %args: |
|
76
|
|
|
|
|
|
|
## { |
|
77
|
|
|
|
|
|
|
## src => $name, ##-- source name |
|
78
|
|
|
|
|
|
|
## fh => $srcfh, ##-- source filehandle |
|
79
|
|
|
|
|
|
|
## bufr => \$buf, ##-- source buffer (string reference) |
|
80
|
|
|
|
|
|
|
## bufp => $pos, ##-- current pos() in source buffer |
|
81
|
|
|
|
|
|
|
## buf => $buf, ##-- local buffer (for filehandle input) |
|
82
|
|
|
|
|
|
|
## state => $q, ##-- symbolic state name (default: 'INITIAL') |
|
83
|
|
|
|
|
|
|
## stack => \@stack, ##-- state stack |
|
84
|
|
|
|
|
|
|
## |
|
85
|
|
|
|
|
|
|
## ##-- utf-8 or byte mode? |
|
86
|
|
|
|
|
|
|
## utf8 => $bool, ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy)) |
|
87
|
|
|
|
|
|
|
## |
|
88
|
|
|
|
|
|
|
## ##-- runtime data |
|
89
|
|
|
|
|
|
|
## yytext => $text, ##-- current text |
|
90
|
|
|
|
|
|
|
## yytype => $type, ##-- current token type |
|
91
|
|
|
|
|
|
|
## yylineno => $line, ##-- current source line (file input only) |
|
92
|
|
|
|
|
|
|
## } |
|
93
|
|
|
|
|
|
|
sub new { |
|
94
|
15
|
|
|
15
|
0
|
46
|
my $that = shift; |
|
95
|
15
|
|
33
|
|
|
249
|
my $lex = bless({ |
|
96
|
|
|
|
|
|
|
src =>undef, |
|
97
|
|
|
|
|
|
|
fh =>undef, |
|
98
|
|
|
|
|
|
|
bufr =>undef, |
|
99
|
|
|
|
|
|
|
bufp =>undef, |
|
100
|
|
|
|
|
|
|
buf =>undef, |
|
101
|
|
|
|
|
|
|
utf8 =>1, |
|
102
|
|
|
|
|
|
|
state => 'INITIAL', |
|
103
|
|
|
|
|
|
|
stack => [], |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
yytext=>undef, |
|
106
|
|
|
|
|
|
|
yytype=>undef, |
|
107
|
|
|
|
|
|
|
yylineno=>undef, |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
##-- lexer comment retention hacks |
|
110
|
|
|
|
|
|
|
comments=>[], |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
##-- user args |
|
113
|
|
|
|
|
|
|
@_ |
|
114
|
|
|
|
|
|
|
}, |
|
115
|
|
|
|
|
|
|
ref($that)||$that |
|
116
|
|
|
|
|
|
|
); |
|
117
|
15
|
|
|
|
|
141
|
return $lex; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## $lex = $lex->clear() |
|
121
|
|
|
|
|
|
|
## + clear lexer buffer, source, etc |
|
122
|
|
|
|
|
|
|
sub clear { |
|
123
|
398
|
|
|
398
|
0
|
598
|
my $lex = shift; |
|
124
|
398
|
|
|
|
|
1298
|
delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)}; |
|
125
|
398
|
|
|
|
|
680
|
$lex->{state} = 'INITIAL'; |
|
126
|
398
|
|
|
|
|
568
|
@{$lex->{stack}} = qw(); |
|
|
398
|
|
|
|
|
772
|
|
|
127
|
398
|
|
|
|
|
604
|
@{$lex->{comments}} = qw(); |
|
|
398
|
|
|
|
|
633
|
|
|
128
|
398
|
|
|
|
|
574
|
delete $lex->{_cmtbuf}; |
|
129
|
398
|
|
|
|
|
667
|
return $lex; |
|
130
|
|
|
|
|
|
|
} |
|
131
|
20
|
|
|
20
|
|
95152
|
BEGIN { *reset = *close = \&clear; } |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
##====================================================================== |
|
134
|
|
|
|
|
|
|
## I/O |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
## $lex = $lex->from($which,$src, %opts) |
|
137
|
|
|
|
|
|
|
## + $which is one of qw(fh file string) |
|
138
|
|
|
|
|
|
|
## + $src is the actual source (default: 'string') |
|
139
|
|
|
|
|
|
|
sub from { |
|
140
|
199
|
|
|
199
|
0
|
457
|
my ($lex,$which,$src,%opts) = @_; |
|
141
|
199
|
50
|
|
|
|
474
|
return $lex->fromFh($src,%opts) if ($which eq 'fh'); |
|
142
|
199
|
50
|
|
|
|
393
|
return $lex->fromFile($src,%opts) if ($which eq 'file'); |
|
143
|
199
|
|
|
|
|
563
|
return $lex->fromString($src,%opts); |
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
## $lex = $lex->fromFile($filename_or_handle,%opts) |
|
147
|
|
|
|
|
|
|
sub fromFile { |
|
148
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$file,%opts) = @_; |
|
149
|
0
|
0
|
|
|
|
0
|
return $lex->fromFh($file,%opts) if (ref($file)); |
|
150
|
0
|
0
|
|
|
|
0
|
my $fh = IO::File->new("<$file") |
|
151
|
|
|
|
|
|
|
or confess("cannot open '$file' for read: $!"); |
|
152
|
0
|
0
|
|
|
|
0
|
binmode($fh,':encoding(utf8)') if ($lex->{utf8}); |
|
153
|
0
|
|
|
|
|
0
|
return $lex->fromFh($fh,src=>"file \`$file'",%opts); |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
## $lex = $lex->fromFh($fh,%opts) |
|
159
|
|
|
|
|
|
|
## + uses native $fh encoding |
|
160
|
|
|
|
|
|
|
sub fromFh { |
|
161
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$fh,%opts) = @_; |
|
162
|
0
|
0
|
|
|
|
0
|
if ($FH_SLURP) { |
|
163
|
|
|
|
|
|
|
##-- always use string mode |
|
164
|
0
|
|
|
|
|
0
|
local $/=undef; |
|
165
|
0
|
|
|
|
|
0
|
my $buf = $fh->getline; |
|
166
|
0
|
|
|
|
|
0
|
$fh->close(); |
|
167
|
0
|
|
|
|
|
0
|
return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts); |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
##-- line-wise buffering |
|
170
|
0
|
|
|
|
|
0
|
$lex->clear(); |
|
171
|
0
|
|
|
|
|
0
|
@$lex{keys %opts} = values(%opts); |
|
172
|
0
|
|
|
|
|
0
|
$lex->{fh} = $fh; |
|
173
|
0
|
|
|
|
|
0
|
$lex->{buf} = undef; |
|
174
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$lex->{buf}; |
|
175
|
0
|
|
|
|
|
0
|
$lex->{bufp} = 0; |
|
176
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src})); |
|
177
|
0
|
|
|
|
|
0
|
return $lex; |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
## $lex = $lex->fromString($str,%opts) |
|
181
|
|
|
|
|
|
|
## $lex = $lex->fromString(\$str,%opts) |
|
182
|
|
|
|
|
|
|
sub fromString { |
|
183
|
199
|
|
|
199
|
0
|
426
|
my ($lex,$str,%opts) = @_; |
|
184
|
199
|
|
|
|
|
477
|
$lex->clear(); |
|
185
|
199
|
50
|
|
|
|
522
|
if (ref($str)) { |
|
186
|
199
|
|
|
|
|
357
|
$lex->{bufr} = $str; |
|
187
|
199
|
50
|
|
|
|
903
|
$lex->{src} = "buffer \`$str'" if (!defined($lex->{src})); |
|
188
|
|
|
|
|
|
|
} else { |
|
189
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$str; |
|
190
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "string \`$str'" if (!defined($lex->{src})); |
|
191
|
|
|
|
|
|
|
} |
|
192
|
199
|
|
|
|
|
418
|
$lex->{bufp} = 0; |
|
193
|
199
|
|
|
|
|
321
|
$lex->{yylineno} = 0; |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
##-- utf8 checks |
|
196
|
199
|
50
|
33
|
|
|
532
|
if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) { |
|
|
199
|
0
|
0
|
|
|
793
|
|
|
197
|
|
|
|
|
|
|
##-- lexer:utf8, string:bytes --> assume string is utf8-encoded |
|
198
|
199
|
50
|
|
|
|
332
|
${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}})); |
|
|
199
|
|
|
|
|
4864
|
|
|
|
199
|
|
|
|
|
707
|
|
|
|
199
|
|
|
|
|
533
|
|
|
199
|
|
|
|
|
|
|
} |
|
200
|
0
|
|
|
|
|
0
|
elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) { |
|
201
|
|
|
|
|
|
|
##-- lexer:bytes, string:utf8 --> encode as utf8 octets |
|
202
|
0
|
|
|
|
|
0
|
${$lex->{bufr}} = encode_utf8(${$lex->{bufr}}); |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
203
|
|
|
|
|
|
|
} |
|
204
|
|
|
|
|
|
|
|
|
205
|
199
|
|
|
|
|
763
|
return $lex; |
|
206
|
|
|
|
|
|
|
} |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
##====================================================================== |
|
209
|
|
|
|
|
|
|
## Utilities |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
## $bool = $lex->eof() |
|
212
|
|
|
|
|
|
|
## + true iff at end-of-file |
|
213
|
|
|
|
|
|
|
sub eof { |
|
214
|
1075
|
50
|
|
1075
|
0
|
2649
|
return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore(); |
|
215
|
|
|
|
|
|
|
} |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
## $bool = $lex->eob() |
|
218
|
|
|
|
|
|
|
## + true at end-of-buffer |
|
219
|
|
|
|
|
|
|
sub eob { |
|
220
|
1075
|
|
66
|
1075
|
0
|
2978
|
return (!$_[0]{bufr} || !${$_[0]{bufr}} || ($_[0]{bufp}||0) >= length(${$_[0]{bufr}})); |
|
221
|
|
|
|
|
|
|
} |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
## $bufr_or_undef = $lex->getmore() |
|
224
|
|
|
|
|
|
|
## + returns true iff there is still data in the buffer |
|
225
|
|
|
|
|
|
|
sub getmore { |
|
226
|
1075
|
100
|
|
1075
|
0
|
1994
|
return $_[0]{bufr} if (!$_[0]->eob()); |
|
227
|
197
|
50
|
|
|
|
665
|
if (defined($_[0]{fh})) { |
|
228
|
0
|
|
|
|
|
0
|
$_[0]{bufp} = 0; |
|
229
|
0
|
|
|
|
|
0
|
$_[0]{buf} = $_[0]{fh}->getline; |
|
230
|
0
|
|
|
|
|
0
|
$_[0]{bufr} = \$_[0]{buf}; |
|
231
|
0
|
|
|
|
|
0
|
$_[0]{yylineno} = $_[0]{fh}->input_line_number; |
|
232
|
0
|
0
|
|
|
|
0
|
return defined($_[0]{buf}) ? $_[0]{bufr} : undef; |
|
233
|
|
|
|
|
|
|
} |
|
234
|
197
|
|
|
|
|
623
|
return undef; |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
##====================================================================== |
|
238
|
|
|
|
|
|
|
## Runtime lexer accessors |
|
239
|
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
## $yytext = $lex->yytext |
|
241
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytext} is faster |
|
242
|
2
|
50
|
|
2
|
0
|
42
|
sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; } |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
## $yytype = $lex->yytype |
|
245
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytype} is faster |
|
246
|
2
|
50
|
|
2
|
0
|
26
|
sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; } |
|
247
|
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
## $line = $lex->yylineno() |
|
249
|
|
|
|
|
|
|
## + returns current line number |
|
250
|
|
|
|
|
|
|
sub yylineno { |
|
251
|
2
|
|
|
2
|
0
|
11
|
return $_[0]{yylineno}; |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
## $pos = $lex->yycolumn() |
|
255
|
|
|
|
|
|
|
## + return column at which current token starts (if any) |
|
256
|
|
|
|
|
|
|
sub yycolumn { |
|
257
|
0
|
0
|
0
|
0
|
0
|
0
|
return ($_[0]{bufp}||0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0); |
|
258
|
|
|
|
|
|
|
} |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
## $pos = $lex->yypos() |
|
261
|
|
|
|
|
|
|
## + return byte position in current line (or input string) |
|
262
|
|
|
|
|
|
|
sub yypos { |
|
263
|
0
|
|
0
|
0
|
0
|
0
|
return ($_[0]{bufp}||0); |
|
264
|
|
|
|
|
|
|
} |
|
265
|
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
## $string = $lex->yyerror(@msg) |
|
267
|
|
|
|
|
|
|
## + create a helpful error message |
|
268
|
|
|
|
|
|
|
sub yyerror { |
|
269
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
|
270
|
0
|
|
|
|
|
0
|
confess(ref($lex).": error in ".$lex->yywhere().join('',@_)); |
|
271
|
|
|
|
|
|
|
} |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
## $string = $lex->yywhere() |
|
274
|
|
|
|
|
|
|
## + location string used by yyerror() |
|
275
|
|
|
|
|
|
|
sub yywhere { |
|
276
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
|
277
|
|
|
|
|
|
|
return ("$lex->{src} at " |
|
278
|
|
|
|
|
|
|
.(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '') |
|
279
|
|
|
|
|
|
|
."column ".$lex->yycolumn |
|
280
|
0
|
0
|
|
|
|
0
|
.", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__') |
|
|
|
0
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
); |
|
282
|
|
|
|
|
|
|
} |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
## $q = $lex->yypushq($new_state) |
|
285
|
|
|
|
|
|
|
sub yypushq { |
|
286
|
19
|
|
|
19
|
0
|
44
|
my ($lex,$qnew) = @_; |
|
287
|
19
|
|
|
|
|
34
|
push(@{$lex->{stack}},$lex->{state}); |
|
|
19
|
|
|
|
|
53
|
|
|
288
|
19
|
|
|
|
|
46
|
return $lex->{state} = $qnew; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
## $q = $lex->yypopq() |
|
292
|
|
|
|
|
|
|
sub yypopq { |
|
293
|
21
|
|
|
21
|
0
|
42
|
my $lex = shift; |
|
294
|
21
|
|
100
|
|
|
34
|
return $lex->{state} = pop(@{$lex->{stack}}) || 'INITIAL'; |
|
295
|
|
|
|
|
|
|
} |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
##====================================================================== |
|
298
|
|
|
|
|
|
|
## Runtime lexer routines |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
## ($typ,$text) = $lex->yylex() |
|
301
|
|
|
|
|
|
|
## + get next token from input stream |
|
302
|
|
|
|
|
|
|
sub yylex { |
|
303
|
1057
|
|
|
1057
|
0
|
1621
|
my $lex = shift; |
|
304
|
1057
|
|
|
|
|
1686
|
my ($bufr,$type,$text,$match,@part); |
|
305
|
|
|
|
|
|
|
#use re 'eval'; ##-- dangerous! |
|
306
|
|
|
|
|
|
|
LEXBUF: |
|
307
|
1057
|
|
|
|
|
2167
|
while (!$lex->eof()) { |
|
308
|
878
|
|
|
|
|
1611
|
$bufr = $lex->{bufr}; |
|
309
|
878
|
|
|
|
|
2173
|
pos($$bufr) = $lex->{bufp}; |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
LEXSKIP: |
|
312
|
|
|
|
|
|
|
##------------------------------------ |
|
313
|
|
|
|
|
|
|
## LEXSKIP: main lexer loop |
|
314
|
878
|
|
|
|
|
1535
|
while (1) { |
|
315
|
1241
|
|
|
|
|
2089
|
$type = $text = $match = undef; |
|
316
|
1241
|
|
|
|
|
1922
|
@part = qw(); |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
##------------------------ |
|
319
|
1241
|
100
|
100
|
|
|
3410
|
if ($lex->{state} eq 'INITIAL' || $lex->{state} eq 'Q_MATCHID') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
##-- end-of-file (should be first pattern) |
|
322
|
1137
|
100
|
100
|
|
|
54219
|
if ($$bufr =~ m/\G\z/) { $type = '__EOF__'; } |
|
|
18
|
100
|
100
|
|
|
38
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
##-- comments |
|
325
|
14
|
|
|
|
|
29
|
elsif ($$bufr =~ m/\G\#:[^\n]*/sp) { $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); } |
|
|
14
|
|
|
|
|
20
|
|
|
|
14
|
|
|
|
|
65
|
|
|
326
|
19
|
|
|
|
|
40
|
elsif ($$bufr =~ m/\G\#\[/p) { $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); } |
|
|
19
|
|
|
|
|
54
|
|
|
|
19
|
|
|
|
|
60
|
|
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
##-- operators |
|
329
|
21
|
|
|
|
|
41
|
elsif ($$bufr =~ m/\G\&\&/p) { $type = 'OP_BOOL_AND'; } |
|
330
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G\|\|/p) { $type = 'OP_BOOL_OR'; } |
|
331
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gnear/pi) { $type = 'NEAR'; } |
|
332
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\!=|\&\!=|\&=\s*\!|\!with|with(?:out|\s*\!))/pi) { $type = 'WITHOUT'; } |
|
333
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G(?:\|=|withor|orwith|wor)/pi) { $type = 'WITHOR'; } |
|
334
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\&=|with)/pi) { $type = 'WITH'; } |
|
335
|
28
|
|
|
|
|
64
|
elsif ($$bufr =~ m/\Gcount/pi) { $type = 'COUNT'; } |
|
336
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gkeys/pi) { $type = 'KEYS'; } |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
##-- count-by keywords |
|
339
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G(?:file|doc)_?id/pi) { $type = 'KW_FILEID'; } |
|
340
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:file|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; } |
|
341
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' } |
|
|
2
|
|
|
|
|
5
|
|
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
##-- query operators |
|
344
|
17
|
|
|
|
|
34
|
elsif ($$bufr =~ m/\G\#(?:comment|cmt)/pi) { $type = 'KW_COMMENT'; } |
|
345
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?|n))/pi) { $type = 'CNTXT'; } |
|
346
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:with)?in/pi) { $type = 'WITHIN'; } |
|
347
|
3
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?|nojoin)(?:_hits)?/pi) { $type = 'SEPARATE_HITS'; } |
|
348
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?|join)(?:_hits)?/pi) { $type = 'NOSEPARATE_HITS'; } |
|
349
|
4
|
|
|
|
|
12
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?date/pi) { $type = 'IS_DATE'; } |
|
350
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#has(?:_field)?/pi) { $type = 'HAS_FIELD'; } |
|
351
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#file(?:_?)names/pi) { $type = 'FILENAMES_ONLY'; } |
|
352
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#debug_rank/pi) { $type = 'DEBUG_RANK'; } |
|
353
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_rank/pi) { $type = 'GREATER_BY_RANK'; } |
|
354
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_rank/pi) { $type = 'LESS_BY_RANK'; } |
|
355
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_date/pi) { $type = 'GREATER_BY_DATE'; } |
|
356
|
10
|
|
|
|
|
26
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_date/pi) { $type = 'LESS_BY_DATE'; } |
|
357
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_size/pi) { $type = 'GREATER_BY_SIZE'; } |
|
358
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_size/pi) { $type = 'LESS_BY_SIZE'; } |
|
359
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?size/pi) { $type = 'IS_SIZE'; } |
|
360
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?left/pi) { $type = 'LESS_BY_LEFT'; } |
|
361
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)left/pi) { $type = 'GREATER_BY_LEFT'; } |
|
362
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?right/pi) { $type = 'LESS_BY_RIGHT'; } |
|
363
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)right/pi) { $type = 'GREATER_BY_RIGHT'; } |
|
364
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?mid(?:dle)?/pi) { $type = 'LESS_BY_MIDDLE'; } |
|
365
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)mid(?:dle)?/pi) { $type = 'GREATER_BY_MIDDLE'; } |
|
366
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_key)/pi) { $type = 'LESS_BY_KEY'; } |
|
367
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_key)/pi) { $type = 'GREATER_BY_KEY'; } |
|
368
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'LESS_BY_COUNT'; } |
|
369
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; } |
|
370
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?/pi) { $type = 'LESS_BY'; } |
|
371
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?/pi) { $type = 'GREATER_BY'; } |
|
372
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#rand(?:om)?/pi) { $type = 'RANDOM'; } |
|
373
|
28
|
|
|
|
|
62
|
elsif ($$bufr =~ m/\G\#by/pi) { $type = 'BY'; } |
|
374
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\#samp(?:le)?/pi) { $type = 'SAMPLE'; } |
|
375
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#clim(?:it)?/pi) { $type = 'CLIMIT'; } |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
##-- regexes |
|
378
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
379
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po) { $type='NEG_REGEX'; $text=$1; } |
|
|
0
|
|
|
|
|
0
|
|
|
380
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
381
|
11
|
|
|
|
|
28
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po) { $type='REGEX'; $text=$1; } |
|
|
11
|
|
|
|
|
31
|
|
|
382
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po) { $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; } |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
5
|
|
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
##-- punctutation & special characters |
|
385
|
2
|
|
|
|
|
85
|
elsif ($$bufr =~ m/\G#=/p) { $type = 'HASH_EQUAL'; } ##-- hash+equal: exact distance |
|
386
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#
|
|
387
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#>/p) { $type = 'HASH_GREATER'; } ##-- hash+greater: min distance |
|
388
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$\./p) { $type = 'DOLLAR_DOT'; } ##-- positional anchor pseudo-index |
|
389
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\:\{/p) { $type = 'COLON_LBRACE'; } ##-- theusaurus-query operator |
|
390
|
4
|
|
|
|
|
12
|
elsif ($$bufr =~ m/\G\@\{/p) { $type = 'AT_LBRACE'; } ##-- literal-set operator |
|
391
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\{/p) { $type = 'STAR_LBRACE'; } ##-- prefix-set opener |
|
392
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\}\*/p) { $type = 'RBRACE_STAR'; } ##-- suffix-set closer |
|
393
|
74
|
|
|
|
|
224
|
elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p) { $type = ${^MATCH}; } ##-- single-char punctuation operators |
|
394
|
164
|
|
|
|
|
464
|
elsif ($$bufr =~ m/\G[\[\]{}()<>]/p) { $type = ${^MATCH}; } ##-- parentheses |
|
395
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\"/p) { $type = ${^MATCH}; } ##-- double-quotes |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
##-- subcorpus path-lists |
|
398
|
26
|
|
|
|
|
73
|
elsif ($$bufr =~ m/\G\:/p) { $type = ${^MATCH}; $lex->{state}='Q_CORPORA'; } |
|
|
26
|
|
|
|
|
55
|
|
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
##-- truncated symbols |
|
401
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated quoted string (infix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
402
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated quoted string (prefix symbol) |
|
|
2
|
|
|
|
|
6
|
|
|
403
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated quoted string (suffix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
404
|
|
|
|
|
|
|
|
|
405
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated bareword (infix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
406
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated bareword (prefix symbol) |
|
|
2
|
|
|
|
|
7
|
|
|
407
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated bareword (suffix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
##-- index names (special handling to allow count(foo) #by[$w-1] |
|
410
|
24
|
|
|
|
|
62
|
elsif ($$bufr =~ m/\G\$($DEF{index_name})/po) { $type='INDEX'; $text=$1; } |
|
|
24
|
|
|
|
|
64
|
|
|
411
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po) { $type='INDEX'; $text=$1; } |
|
|
0
|
|
|
|
|
0
|
|
|
412
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ /m\G\$/p) { $type='$'; } |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match) |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
##-- single-term wildcard |
|
417
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*/p) { $type = '*'; } |
|
418
|
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
##-- term expander pipelines |
|
420
|
4
|
|
|
|
|
13
|
elsif ($$bufr =~ m/\G\|/p) { $lex->{state}='Q_XPIPE'; $type='__SKIP__'; } |
|
|
4
|
|
|
|
|
7
|
|
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
##-- symbols: quoted |
|
423
|
|
|
|
|
|
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { |
|
424
|
39
|
|
|
|
|
133
|
$text = $1; |
|
425
|
39
|
100
|
|
|
|
203
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
|
10
|
100
|
|
|
|
21
|
|
|
426
|
6
|
|
|
|
|
20
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
|
427
|
23
|
|
|
|
|
46
|
else { $type='SYMBOL'; } |
|
428
|
|
|
|
|
|
|
} |
|
429
|
|
|
|
|
|
|
##-- symbols: barewords |
|
430
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) { |
|
431
|
269
|
|
|
|
|
834
|
$text = ${^MATCH}; |
|
432
|
269
|
|
|
|
|
441
|
$match = ${^MATCH}; |
|
433
|
269
|
100
|
|
|
|
1307
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
|
36
|
100
|
|
|
|
86
|
|
|
434
|
6
|
|
|
|
|
16
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
|
435
|
227
|
|
|
|
|
427
|
else { $type='SYMBOL'; } |
|
436
|
|
|
|
|
|
|
} |
|
437
|
|
|
|
|
|
|
##-- barewords: integers and dates (Q_MATCHID) |
|
438
|
14
|
|
|
|
|
48
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po) { $type='INTEGER'; } |
|
439
|
26
|
|
|
|
|
143
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p) { $type='INTEGER'; } |
|
440
|
|
|
|
|
|
|
|
|
441
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po) { $type='DATE'; } |
|
442
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po) { $type='DATE'; } |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
##-- misc |
|
445
|
284
|
|
|
|
|
785
|
elsif ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } |
|
446
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G./p) { $type = 'SYMBOL'; } |
|
447
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G./p) { $type = '__ERROR__'; } |
|
448
|
|
|
|
|
|
|
|
|
449
|
1137
|
100
|
|
|
|
3013
|
$match = ${^MATCH} if (!defined($match)); |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
##------------------------ |
|
452
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_CORPORA') { |
|
453
|
58
|
100
|
|
|
|
946
|
if ($$bufr =~ m/\G$DEF{corpus_text}/p) { $type='SYMBOL'; } |
|
|
22
|
100
|
|
|
|
53
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
454
|
18
|
|
|
|
|
42
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $type='SYMBOL'; $text=$1; } |
|
|
18
|
|
|
|
|
41
|
|
|
455
|
16
|
|
|
|
|
48
|
elsif ($$bufr =~ m/\G\,/p) { $type=${^MATCH}; } |
|
456
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\s+/p) { $type='__SKIP__'; } |
|
457
|
2
|
|
|
|
|
8
|
else { $type='__SKIP__'; $lex->yypopq(); } |
|
|
2
|
|
|
|
|
8
|
|
|
458
|
|
|
|
|
|
|
|
|
459
|
58
|
|
|
|
|
124
|
$match = ${^MATCH}; |
|
460
|
|
|
|
|
|
|
} |
|
461
|
|
|
|
|
|
|
##------------------------ |
|
462
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_COMMENT') { |
|
463
|
38
|
100
|
|
|
|
450
|
if ($$bufr =~ m/\G\]/p) { |
|
|
|
50
|
|
|
|
|
|
|
464
|
19
|
|
|
|
|
41
|
$type='__SKIP__'; |
|
465
|
19
|
|
|
|
|
45
|
$lex->{_cmtbuf} .= ${^MATCH}; |
|
466
|
19
|
|
|
|
|
41
|
push(@{$lex->{comments}}, $lex->{_cmtbuf}); |
|
|
19
|
|
|
|
|
50
|
|
|
467
|
19
|
|
|
|
|
43
|
delete $lex->{_cmtbuf}; |
|
468
|
19
|
|
|
|
|
50
|
$lex->yypopq(); |
|
469
|
|
|
|
|
|
|
} |
|
470
|
19
|
|
|
|
|
46
|
elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; } |
|
|
19
|
|
|
|
|
54
|
|
|
471
|
0
|
|
|
|
|
0
|
else { $type='__ERROR__'; } |
|
472
|
|
|
|
|
|
|
|
|
473
|
38
|
|
|
|
|
91
|
$match = ${^MATCH}; |
|
474
|
|
|
|
|
|
|
} |
|
475
|
|
|
|
|
|
|
##------------------------ |
|
476
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_DATE') { |
|
477
|
2
|
50
|
|
|
|
21
|
if ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; } |
|
|
0
|
50
|
|
|
|
0
|
|
|
478
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/^\//) { $type = '/'; $lex->{state}='INITIAL'; } |
|
|
0
|
|
|
|
|
0
|
|
|
479
|
2
|
|
|
|
|
5
|
else { $lex->{state}='INITIAL'; $type='__SKIP__'; } |
|
|
2
|
|
|
|
|
6
|
|
|
480
|
|
|
|
|
|
|
|
|
481
|
2
|
|
|
|
|
5
|
$match = ${^MATCH}; |
|
482
|
|
|
|
|
|
|
} |
|
483
|
|
|
|
|
|
|
##------------------------ |
|
484
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGREP') { |
|
485
|
2
|
50
|
|
|
|
149
|
if ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
|
0
|
50
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
486
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G($DEF{regex_text})\//po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL'; } |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
6
|
|
|
487
|
|
|
|
|
|
|
|
|
488
|
2
|
|
|
|
|
8
|
$match = ${^MATCH}; |
|
489
|
|
|
|
|
|
|
} |
|
490
|
|
|
|
|
|
|
##------------------------ |
|
491
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGOPT') { |
|
492
|
0
|
0
|
|
|
|
0
|
if ($$bufr =~ m/\G$DEF{regex_modifier}+/po) { $type='REGOPT'; } |
|
|
0
|
|
|
|
|
0
|
|
|
493
|
0
|
|
|
|
|
0
|
else { $type='__SKIP__'; } |
|
494
|
0
|
|
|
|
|
0
|
$lex->{state} = 'INITIAL'; |
|
495
|
|
|
|
|
|
|
|
|
496
|
0
|
|
|
|
|
0
|
$match = ${^MATCH}; |
|
497
|
|
|
|
|
|
|
} |
|
498
|
|
|
|
|
|
|
##------------------------ |
|
499
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_XPIPE') { |
|
500
|
4
|
50
|
|
|
|
218
|
if ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } ##-- whitespace: skip |
|
|
0
|
50
|
|
|
|
0
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
501
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\-/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
|
0
|
|
|
|
|
0
|
|
|
502
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
503
|
4
|
|
|
|
|
12
|
elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
|
4
|
|
|
|
|
9
|
|
|
504
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G\z/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
505
|
0
|
|
|
|
|
0
|
else { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
506
|
|
|
|
|
|
|
|
|
507
|
4
|
|
|
|
|
11
|
$match = ${^MATCH}; |
|
508
|
|
|
|
|
|
|
} |
|
509
|
|
|
|
|
|
|
##------------------------ |
|
510
|
|
|
|
|
|
|
## END perl-ification of flex sources |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
##-- guts |
|
513
|
1241
|
100
|
|
|
|
2582
|
$text = $match if (!defined($text)); |
|
514
|
1241
|
100
|
|
|
|
3196
|
$lex->{bufp} += length($match) if (defined($match)); |
|
515
|
|
|
|
|
|
|
|
|
516
|
1241
|
|
|
|
|
2852
|
pos($$bufr) = $lex->{bufp}; |
|
517
|
1241
|
50
|
|
|
|
2888
|
return if (!defined($type)); |
|
518
|
|
|
|
|
|
|
|
|
519
|
1241
|
100
|
|
|
|
2573
|
next LEXSKIP if ($type eq '__SKIP__'); |
|
520
|
878
|
100
|
|
|
|
1667
|
next LEXBUF if ($type eq '__EOF__'); |
|
521
|
|
|
|
|
|
|
#elsif ($type eq '__ERROR__') { |
|
522
|
|
|
|
|
|
|
# return $lex->yyerror(); |
|
523
|
|
|
|
|
|
|
#} |
|
524
|
860
|
|
|
|
|
4932
|
return @$lex{qw(yytype yytext)} = ($type,$text); |
|
525
|
|
|
|
|
|
|
} |
|
526
|
|
|
|
|
|
|
} |
|
527
|
197
|
|
|
|
|
790
|
return @$lex{qw(yytype yytext)} = ('__EOF__',undef); |
|
528
|
|
|
|
|
|
|
} |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
##====================================================================== |
|
531
|
|
|
|
|
|
|
## Testing: dummy lexing |
|
532
|
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
## undef = $lex->dummylex(@from_whatever) |
|
534
|
|
|
|
|
|
|
sub dummylex { |
|
535
|
0
|
|
|
0
|
0
|
|
my $lex = shift; |
|
536
|
0
|
|
|
|
|
|
$lex->reset(); |
|
537
|
0
|
|
|
|
|
|
$lex->from(@_); |
|
538
|
0
|
|
|
|
|
|
my ($type,$text); |
|
539
|
|
|
|
|
|
|
TOKEN: |
|
540
|
0
|
|
|
|
|
|
while(1) { |
|
541
|
0
|
|
|
|
|
|
($type,$text) = $lex->yylex(); |
|
542
|
|
|
|
|
|
|
print("-" x 64, "\n", |
|
543
|
|
|
|
|
|
|
">> Line: ", $lex->yylineno, ", Pos: ", $lex->yypos, "\n", |
|
544
|
0
|
0
|
|
|
|
|
">> State: ", (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n", |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
">> Type: ", (defined($type) ? $type : '(undef)'), "\n", |
|
546
|
|
|
|
|
|
|
">> Text: ", (defined($text) ? $text : '(undef)'), "\n", |
|
547
|
|
|
|
|
|
|
); |
|
548
|
0
|
0
|
|
|
|
|
if (!defined($type)) { |
|
549
|
0
|
|
|
|
|
|
warn(":: undef type!"); |
|
550
|
0
|
|
|
|
|
|
return; |
|
551
|
|
|
|
|
|
|
} |
|
552
|
0
|
0
|
|
|
|
|
if ($type eq '__ERROR__') { |
|
553
|
0
|
|
|
|
|
|
print(":: ERROR DETECTED\n"); |
|
554
|
0
|
|
|
|
|
|
$lex->yyerror(); |
|
555
|
|
|
|
|
|
|
} |
|
556
|
0
|
0
|
|
|
|
|
last if ($type eq '__EOF__'); |
|
557
|
|
|
|
|
|
|
} |
|
558
|
|
|
|
|
|
|
} |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
1; ##-- be happy |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
__END__ |