| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
##-*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
## File: DDC::::yyqlexer.pm |
|
4
|
|
|
|
|
|
|
## Author: Bryan Jurish |
|
5
|
|
|
|
|
|
|
## Description: |
|
6
|
|
|
|
|
|
|
## + lexer for ddc queries (formerly DDC::Query::yylexer) |
|
7
|
|
|
|
|
|
|
## + last updated for ddc v2.1.15 |
|
8
|
|
|
|
|
|
|
##====================================================================== |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
package DDC::PP::yyqlexer; |
|
11
|
20
|
|
|
20
|
|
509
|
use 5.010; ##-- we need at least v5.10.0 for /p regex modifier |
|
|
20
|
|
|
|
|
74
|
|
|
12
|
20
|
|
|
20
|
|
119
|
use Encode qw(encode_utf8 decode_utf8); |
|
|
20
|
|
|
|
|
66
|
|
|
|
20
|
|
|
|
|
1139
|
|
|
13
|
20
|
|
|
20
|
|
122
|
use Carp; |
|
|
20
|
|
|
|
|
61
|
|
|
|
20
|
|
|
|
|
1193
|
|
|
14
|
20
|
|
|
20
|
|
161
|
use IO::File; |
|
|
20
|
|
|
|
|
90
|
|
|
|
20
|
|
|
|
|
2971
|
|
|
15
|
20
|
|
|
20
|
|
178
|
use IO::Handle; |
|
|
20
|
|
|
|
|
39
|
|
|
|
20
|
|
|
|
|
753
|
|
|
16
|
20
|
|
|
20
|
|
114
|
use strict; |
|
|
20
|
|
|
|
|
58
|
|
|
|
20
|
|
|
|
|
7590
|
|
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##====================================================================== |
|
19
|
|
|
|
|
|
|
## Globals etc. |
|
20
|
|
|
|
|
|
|
our @ISA = qw(); |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
|
23
|
|
|
|
|
|
|
## Globals: regexes for Parse::Lex lexer token regexes |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
## %DEF |
|
26
|
|
|
|
|
|
|
## + common shared regex definitions |
|
27
|
|
|
|
|
|
|
our (%DEF); |
|
28
|
|
|
|
|
|
|
BEGIN { |
|
29
|
|
|
|
|
|
|
##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\') |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
##-- whitespace |
|
32
|
20
|
|
|
20
|
|
109
|
$DEF{ws} = '[ \t\n\r\f\x0b]'; |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
##-- integer boundary characters |
|
35
|
20
|
|
|
|
|
43
|
$DEF{int_boundary} = '[ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]'; |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
##-- bareword symbols |
|
38
|
20
|
|
|
|
|
42
|
$DEF{symbol_cfirst} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]'; |
|
39
|
20
|
|
|
|
|
66
|
$DEF{symbol_crest} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]'; |
|
40
|
20
|
|
|
|
|
36
|
$DEF{symbol_cescape} = '(?:\\\\.)'; |
|
41
|
20
|
|
|
|
|
140
|
$DEF{symbol_text} = "(?:$DEF{symbol_cescape}|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}|$DEF{symbol_crest})*"; |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here) |
|
44
|
20
|
|
|
|
|
73
|
$DEF{corpus_cfirst} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"$@]'; |
|
45
|
20
|
|
|
|
|
55
|
$DEF{corpus_crest} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"]'; |
|
46
|
20
|
|
|
|
|
83
|
$DEF{corpus_text} = "(?:$DEF{symbol_cescape}|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}|$DEF{corpus_crest})*"; |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
##-- bareword index names (underscore and digits ok, but no '.', '-', or '+') |
|
49
|
20
|
|
|
|
|
42
|
$DEF{index_char} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]'; |
|
50
|
20
|
|
|
|
|
73
|
$DEF{index_name} = "(?:$DEF{index_char}|$DEF{symbol_cescape})+"; |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
##-- single-quoted symbols |
|
53
|
20
|
|
|
|
|
51
|
$DEF{sq_text} = "(?:[^\']|$DEF{symbol_cescape})*"; |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]}) |
|
56
|
20
|
|
|
|
|
44
|
$DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}"; |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
##-- regexes |
|
59
|
20
|
|
|
|
|
47
|
$DEF{regex_text} = "(?:(?:\\\\.)|[^\\\\/])*"; |
|
60
|
20
|
|
|
|
|
49
|
$DEF{regex_modifier} = '[dgimsx]'; |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
##-- comments |
|
63
|
20
|
|
|
|
|
79
|
$DEF{comment_text} = "(?:\\\\.|[^\\]])*"; |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
##-- compile patterns |
|
66
|
20
|
|
|
|
|
116
|
foreach (keys %DEF) { |
|
67
|
|
|
|
|
|
|
#print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n"; |
|
68
|
320
|
|
|
|
|
11879
|
$DEF{$_} = qr/$DEF{$_}/; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
} |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
##====================================================================== |
|
73
|
|
|
|
|
|
|
## $lex = $CLASS_OR_OBJ->new(%args) |
|
74
|
|
|
|
|
|
|
## + abstract constructor |
|
75
|
|
|
|
|
|
|
## + %$lex, %args: |
|
76
|
|
|
|
|
|
|
## { |
|
77
|
|
|
|
|
|
|
## src => $name, ##-- source name |
|
78
|
|
|
|
|
|
|
## fh => $srcfh, ##-- source filehandle |
|
79
|
|
|
|
|
|
|
## bufr => \$buf, ##-- source buffer (string reference) |
|
80
|
|
|
|
|
|
|
## bufp => $pos, ##-- current pos() in source buffer |
|
81
|
|
|
|
|
|
|
## buf => $buf, ##-- local buffer (for filehandle input) |
|
82
|
|
|
|
|
|
|
## state => $q, ##-- symbolic state name (default: 'INITIAL') |
|
83
|
|
|
|
|
|
|
## stack => \@stack, ##-- state stack |
|
84
|
|
|
|
|
|
|
## |
|
85
|
|
|
|
|
|
|
## ##-- utf-8 or byte mode? |
|
86
|
|
|
|
|
|
|
## utf8 => $bool, ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy)) |
|
87
|
|
|
|
|
|
|
## |
|
88
|
|
|
|
|
|
|
## ##-- runtime data |
|
89
|
|
|
|
|
|
|
## yytext => $text, ##-- current text |
|
90
|
|
|
|
|
|
|
## yytype => $type, ##-- current token type |
|
91
|
|
|
|
|
|
|
## yylineno => $line, ##-- current source line (file input only) |
|
92
|
|
|
|
|
|
|
## } |
|
93
|
|
|
|
|
|
|
sub new { |
|
94
|
15
|
|
|
15
|
0
|
44
|
my $that = shift; |
|
95
|
15
|
|
33
|
|
|
244
|
my $lex = bless({ |
|
96
|
|
|
|
|
|
|
src =>undef, |
|
97
|
|
|
|
|
|
|
fh =>undef, |
|
98
|
|
|
|
|
|
|
bufr =>undef, |
|
99
|
|
|
|
|
|
|
bufp =>undef, |
|
100
|
|
|
|
|
|
|
buf =>undef, |
|
101
|
|
|
|
|
|
|
utf8 =>1, |
|
102
|
|
|
|
|
|
|
state => 'INITIAL', |
|
103
|
|
|
|
|
|
|
stack => [], |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
yytext=>undef, |
|
106
|
|
|
|
|
|
|
yytype=>undef, |
|
107
|
|
|
|
|
|
|
yylineno=>undef, |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
##-- lexer comment retention hacks |
|
110
|
|
|
|
|
|
|
comments=>[], |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
##-- user args |
|
113
|
|
|
|
|
|
|
@_ |
|
114
|
|
|
|
|
|
|
}, |
|
115
|
|
|
|
|
|
|
ref($that)||$that |
|
116
|
|
|
|
|
|
|
); |
|
117
|
15
|
|
|
|
|
129
|
return $lex; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## $lex = $lex->clear() |
|
121
|
|
|
|
|
|
|
## + clear lexer buffer, source, etc |
|
122
|
|
|
|
|
|
|
sub clear { |
|
123
|
422
|
|
|
422
|
0
|
615
|
my $lex = shift; |
|
124
|
422
|
|
|
|
|
1393
|
delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)}; |
|
125
|
422
|
|
|
|
|
762
|
$lex->{state} = 'INITIAL'; |
|
126
|
422
|
|
|
|
|
603
|
@{$lex->{stack}} = qw(); |
|
|
422
|
|
|
|
|
827
|
|
|
127
|
422
|
|
|
|
|
643
|
@{$lex->{comments}} = qw(); |
|
|
422
|
|
|
|
|
661
|
|
|
128
|
422
|
|
|
|
|
660
|
$lex->{yyDtrsPos} = 0; |
|
129
|
422
|
|
|
|
|
601
|
$lex->{yyDtrsLen} = 0; |
|
130
|
422
|
|
|
|
|
710
|
delete $lex->{_cmtbuf}; |
|
131
|
422
|
|
|
|
|
694
|
return $lex; |
|
132
|
|
|
|
|
|
|
} |
|
133
|
20
|
|
|
20
|
|
96156
|
BEGIN { *reset = *close = \&clear; } |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
##====================================================================== |
|
136
|
|
|
|
|
|
|
## I/O |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
## $lex = $lex->from($which,$src, %opts) |
|
139
|
|
|
|
|
|
|
## + $which is one of qw(fh file string) |
|
140
|
|
|
|
|
|
|
## + $src is the actual source (default: 'string') |
|
141
|
|
|
|
|
|
|
sub from { |
|
142
|
211
|
|
|
211
|
0
|
496
|
my ($lex,$which,$src,%opts) = @_; |
|
143
|
211
|
50
|
|
|
|
514
|
return $lex->fromFh($src,%opts) if ($which eq 'fh'); |
|
144
|
211
|
50
|
|
|
|
431
|
return $lex->fromFile($src,%opts) if ($which eq 'file'); |
|
145
|
211
|
|
|
|
|
585
|
return $lex->fromString($src,%opts); |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
## $lex = $lex->fromFile($filename_or_handle,%opts) |
|
149
|
|
|
|
|
|
|
sub fromFile { |
|
150
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$file,%opts) = @_; |
|
151
|
0
|
0
|
|
|
|
0
|
return $lex->fromFh($file,%opts) if (ref($file)); |
|
152
|
0
|
0
|
|
|
|
0
|
my $fh = IO::File->new("<$file") |
|
153
|
|
|
|
|
|
|
or confess("cannot open '$file' for read: $!"); |
|
154
|
0
|
0
|
|
|
|
0
|
binmode($fh,':encoding(utf8)') if ($lex->{utf8}); |
|
155
|
0
|
|
|
|
|
0
|
return $lex->fromFh($fh,src=>"file \`$file'",%opts); |
|
156
|
|
|
|
|
|
|
} |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
## $lex = $lex->fromFh($fh,%opts) |
|
161
|
|
|
|
|
|
|
## + uses native $fh encoding |
|
162
|
|
|
|
|
|
|
sub fromFh { |
|
163
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$fh,%opts) = @_; |
|
164
|
0
|
0
|
|
|
|
0
|
if ($FH_SLURP) { |
|
165
|
|
|
|
|
|
|
##-- always use string mode |
|
166
|
0
|
|
|
|
|
0
|
local $/=undef; |
|
167
|
0
|
|
|
|
|
0
|
my $buf = $fh->getline; |
|
168
|
0
|
|
|
|
|
0
|
$fh->close(); |
|
169
|
0
|
|
|
|
|
0
|
return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts); |
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
##-- line-wise buffering |
|
172
|
0
|
|
|
|
|
0
|
$lex->clear(); |
|
173
|
0
|
|
|
|
|
0
|
@$lex{keys %opts} = values(%opts); |
|
174
|
0
|
|
|
|
|
0
|
$lex->{fh} = $fh; |
|
175
|
0
|
|
|
|
|
0
|
$lex->{buf} = undef; |
|
176
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$lex->{buf}; |
|
177
|
0
|
|
|
|
|
0
|
$lex->{bufp} = 0; |
|
178
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src})); |
|
179
|
0
|
|
|
|
|
0
|
return $lex; |
|
180
|
|
|
|
|
|
|
} |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
## $lex = $lex->fromString($str,%opts) |
|
183
|
|
|
|
|
|
|
## $lex = $lex->fromString(\$str,%opts) |
|
184
|
|
|
|
|
|
|
sub fromString { |
|
185
|
211
|
|
|
211
|
0
|
419
|
my ($lex,$str,%opts) = @_; |
|
186
|
211
|
|
|
|
|
487
|
$lex->clear(); |
|
187
|
211
|
50
|
|
|
|
523
|
if (ref($str)) { |
|
188
|
211
|
|
|
|
|
397
|
$lex->{bufr} = $str; |
|
189
|
211
|
50
|
|
|
|
965
|
$lex->{src} = "buffer \`$str'" if (!defined($lex->{src})); |
|
190
|
|
|
|
|
|
|
} else { |
|
191
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$str; |
|
192
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "string \`$str'" if (!defined($lex->{src})); |
|
193
|
|
|
|
|
|
|
} |
|
194
|
211
|
|
|
|
|
396
|
$lex->{bufp} = 0; |
|
195
|
211
|
|
|
|
|
379
|
$lex->{yylineno} = 0; |
|
196
|
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
##-- utf8 checks |
|
198
|
211
|
50
|
33
|
|
|
567
|
if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) { |
|
|
211
|
0
|
0
|
|
|
866
|
|
|
199
|
|
|
|
|
|
|
##-- lexer:utf8, string:bytes --> assume string is utf8-encoded |
|
200
|
211
|
50
|
|
|
|
336
|
${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}})); |
|
|
211
|
|
|
|
|
5157
|
|
|
|
211
|
|
|
|
|
708
|
|
|
|
211
|
|
|
|
|
581
|
|
|
201
|
|
|
|
|
|
|
} |
|
202
|
0
|
|
|
|
|
0
|
elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) { |
|
203
|
|
|
|
|
|
|
##-- lexer:bytes, string:utf8 --> encode as utf8 octets |
|
204
|
0
|
|
|
|
|
0
|
${$lex->{bufr}} = encode_utf8(${$lex->{bufr}}); |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
205
|
|
|
|
|
|
|
} |
|
206
|
|
|
|
|
|
|
|
|
207
|
211
|
|
|
|
|
804
|
return $lex; |
|
208
|
|
|
|
|
|
|
} |
|
209
|
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
##====================================================================== |
|
211
|
|
|
|
|
|
|
## Utilities |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
## $bool = $lex->eof() |
|
214
|
|
|
|
|
|
|
## + true iff at end-of-file |
|
215
|
|
|
|
|
|
|
sub eof { |
|
216
|
1151
|
50
|
|
1151
|
0
|
2877
|
return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore(); |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
## $bool = $lex->eob() |
|
220
|
|
|
|
|
|
|
## + true at end-of-buffer |
|
221
|
|
|
|
|
|
|
sub eob { |
|
222
|
1151
|
|
66
|
1151
|
0
|
2746
|
return (!$_[0]{bufr} || !${$_[0]{bufr}} || ($_[0]{bufp}||0) >= length(${$_[0]{bufr}})); |
|
223
|
|
|
|
|
|
|
} |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
## $bufr_or_undef = $lex->getmore() |
|
226
|
|
|
|
|
|
|
## + returns true iff there is still data in the buffer |
|
227
|
|
|
|
|
|
|
sub getmore { |
|
228
|
1151
|
100
|
|
1151
|
0
|
1976
|
return $_[0]{bufr} if (!$_[0]->eob()); |
|
229
|
209
|
50
|
|
|
|
655
|
if (defined($_[0]{fh})) { |
|
230
|
0
|
|
|
|
|
0
|
$_[0]{bufp} = 0; |
|
231
|
0
|
|
|
|
|
0
|
$_[0]{buf} = $_[0]{fh}->getline; |
|
232
|
0
|
|
|
|
|
0
|
$_[0]{bufr} = \$_[0]{buf}; |
|
233
|
0
|
|
|
|
|
0
|
$_[0]{yylineno} = $_[0]{fh}->input_line_number; |
|
234
|
0
|
0
|
|
|
|
0
|
return defined($_[0]{buf}) ? $_[0]{bufr} : undef; |
|
235
|
|
|
|
|
|
|
} |
|
236
|
209
|
|
|
|
|
576
|
return undef; |
|
237
|
|
|
|
|
|
|
} |
|
238
|
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
##====================================================================== |
|
240
|
|
|
|
|
|
|
## Runtime lexer accessors |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
## $yytext = $lex->yytext |
|
243
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytext} is faster |
|
244
|
2
|
50
|
|
2
|
0
|
42
|
sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; } |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
## $yytype = $lex->yytype |
|
247
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytype} is faster |
|
248
|
2
|
50
|
|
2
|
0
|
22
|
sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; } |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
## $line = $lex->yylineno() |
|
251
|
|
|
|
|
|
|
## + returns current line number |
|
252
|
|
|
|
|
|
|
sub yylineno { |
|
253
|
2
|
|
|
2
|
0
|
23
|
return $_[0]{yylineno}; |
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
## $pos = $lex->yycolumn() |
|
257
|
|
|
|
|
|
|
## + return column at which current token starts (if any) |
|
258
|
|
|
|
|
|
|
sub yycolumn { |
|
259
|
0
|
0
|
0
|
0
|
0
|
0
|
return ($_[0]{bufp}||0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0); |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
## $pos = $lex->yypos() |
|
263
|
|
|
|
|
|
|
## + return byte position in current line (or input string) |
|
264
|
|
|
|
|
|
|
sub yypos { |
|
265
|
102
|
|
50
|
102
|
0
|
356
|
return ($_[0]{bufp}||0); |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
## $string = $lex->yyerror(@msg) |
|
269
|
|
|
|
|
|
|
## + create a helpful error message |
|
270
|
|
|
|
|
|
|
sub yyerror { |
|
271
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
|
272
|
0
|
|
|
|
|
0
|
confess(ref($lex).": error in ".$lex->yywhere().join('',@_)); |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
## $string = $lex->yywhere() |
|
276
|
|
|
|
|
|
|
## + location string used by yyerror() |
|
277
|
|
|
|
|
|
|
sub yywhere { |
|
278
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
|
279
|
|
|
|
|
|
|
return ("$lex->{src} at " |
|
280
|
|
|
|
|
|
|
.(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '') |
|
281
|
|
|
|
|
|
|
."column ".$lex->yycolumn |
|
282
|
0
|
0
|
|
|
|
0
|
.", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__') |
|
|
|
0
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
); |
|
284
|
|
|
|
|
|
|
} |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
## $q = $lex->yypushq($new_state) |
|
287
|
|
|
|
|
|
|
sub yypushq { |
|
288
|
19
|
|
|
19
|
0
|
49
|
my ($lex,$qnew) = @_; |
|
289
|
19
|
|
|
|
|
56
|
push(@{$lex->{stack}},$lex->{state}); |
|
|
19
|
|
|
|
|
59
|
|
|
290
|
19
|
|
|
|
|
45
|
return $lex->{state} = $qnew; |
|
291
|
|
|
|
|
|
|
} |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
## $q = $lex->yypopq() |
|
294
|
|
|
|
|
|
|
sub yypopq { |
|
295
|
29
|
|
|
29
|
0
|
55
|
my $lex = shift; |
|
296
|
29
|
|
100
|
|
|
49
|
return $lex->{state} = pop(@{$lex->{stack}}) || 'INITIAL'; |
|
297
|
|
|
|
|
|
|
} |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
##====================================================================== |
|
300
|
|
|
|
|
|
|
## Runtime lexer routines |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
## ($typ,$text) = $lex->yylex() |
|
303
|
|
|
|
|
|
|
## + get next token from input stream |
|
304
|
|
|
|
|
|
|
sub yylex { |
|
305
|
1125
|
|
|
1125
|
0
|
1736
|
my $lex = shift; |
|
306
|
1125
|
|
|
|
|
1804
|
my ($bufr,$type,$text,$match,@part); |
|
307
|
|
|
|
|
|
|
#use re 'eval'; ##-- dangerous! |
|
308
|
|
|
|
|
|
|
LEXBUF: |
|
309
|
1125
|
|
|
|
|
2404
|
while (!$lex->eof()) { |
|
310
|
942
|
|
|
|
|
1679
|
$bufr = $lex->{bufr}; |
|
311
|
942
|
|
|
|
|
2334
|
pos($$bufr) = $lex->{bufp}; |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
LEXSKIP: |
|
314
|
|
|
|
|
|
|
##------------------------------------ |
|
315
|
|
|
|
|
|
|
## LEXSKIP: main lexer loop |
|
316
|
942
|
|
|
|
|
1618
|
while (1) { |
|
317
|
1345
|
|
|
|
|
2312
|
$type = $text = $match = undef; |
|
318
|
1345
|
|
|
|
|
2114
|
@part = qw(); |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
##------------------------ |
|
321
|
1345
|
100
|
100
|
|
|
3844
|
if ($lex->{state} eq 'INITIAL' || $lex->{state} eq 'Q_MATCHID') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
##-- end-of-file (should be first pattern) |
|
324
|
1213
|
100
|
100
|
|
|
58883
|
if ($$bufr =~ m/\G\z/) { $type = '__EOF__'; } |
|
|
26
|
100
|
100
|
|
|
42
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
##-- comments |
|
327
|
22
|
|
|
|
|
77
|
elsif ($$bufr =~ m/\G\#:[^\n]*/sp) { $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); } |
|
|
22
|
|
|
|
|
38
|
|
|
|
22
|
|
|
|
|
97
|
|
|
328
|
19
|
|
|
|
|
39
|
elsif ($$bufr =~ m/\G\#\[/p) { $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); } |
|
|
19
|
|
|
|
|
59
|
|
|
|
19
|
|
|
|
|
64
|
|
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
##-- operators |
|
331
|
21
|
|
|
|
|
39
|
elsif ($$bufr =~ m/\G\&\&/p) { $type = 'OP_BOOL_AND'; } |
|
332
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\|\|/p) { $type = 'OP_BOOL_OR'; } |
|
333
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gnear/pi) { $type = 'NEAR'; } |
|
334
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\!=|\&\!=|\&=\s*\!|\!with|with(?:out|\s*\!))/pi) { $type = 'WITHOUT'; } |
|
335
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\|=|withor|orwith|wor)/pi) { $type = 'WITHOR'; } |
|
336
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\G(?:\&=|with)/pi) { $type = 'WITH'; } |
|
337
|
28
|
|
|
|
|
66
|
elsif ($$bufr =~ m/\Gcount/pi) { $type = 'COUNT'; } |
|
338
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gkeys/pi) { $type = 'KEYS'; } |
|
339
|
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
##-- count-by keywords |
|
341
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G(?:file|doc)_?id/pi) { $type = 'KW_FILEID'; } |
|
342
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:file|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; } |
|
343
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' } |
|
|
2
|
|
|
|
|
6
|
|
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
##-- query operators |
|
346
|
17
|
|
|
|
|
40
|
elsif ($$bufr =~ m/\G\#(?:comment|cmt)/pi) { $type = 'KW_COMMENT'; } |
|
347
|
2
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?|n))/pi) { $type = 'CNTXT'; } |
|
348
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G\#(?:with)?in/pi) { $type = 'WITHIN'; } |
|
349
|
3
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?|nojoin)(?:_hits)?/pi) { $type = 'SEPARATE_HITS'; } |
|
350
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?|join)(?:_hits)?/pi) { $type = 'NOSEPARATE_HITS'; } |
|
351
|
4
|
|
|
|
|
9
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?date/pi) { $type = 'IS_DATE'; } |
|
352
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\G\#has(?:_field)?/pi) { $type = 'HAS_FIELD'; } |
|
353
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#file(?:_?)names/pi) { $type = 'FILENAMES_ONLY'; } |
|
354
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#debug_rank/pi) { $type = 'DEBUG_RANK'; } |
|
355
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_rank/pi) { $type = 'GREATER_BY_RANK'; } |
|
356
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_rank/pi) { $type = 'LESS_BY_RANK'; } |
|
357
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_date/pi) { $type = 'GREATER_BY_DATE'; } |
|
358
|
10
|
|
|
|
|
26
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_date/pi) { $type = 'LESS_BY_DATE'; } |
|
359
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_size/pi) { $type = 'GREATER_BY_SIZE'; } |
|
360
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_size/pi) { $type = 'LESS_BY_SIZE'; } |
|
361
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?size/pi) { $type = 'IS_SIZE'; } |
|
362
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?left/pi) { $type = 'LESS_BY_LEFT'; } |
|
363
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)left/pi) { $type = 'GREATER_BY_LEFT'; } |
|
364
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?right/pi) { $type = 'LESS_BY_RIGHT'; } |
|
365
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)right/pi) { $type = 'GREATER_BY_RIGHT'; } |
|
366
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?mid(?:dle)?/pi) { $type = 'LESS_BY_MIDDLE'; } |
|
367
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)mid(?:dle)?/pi) { $type = 'GREATER_BY_MIDDLE'; } |
|
368
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_key)/pi) { $type = 'LESS_BY_KEY'; } |
|
369
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_key)/pi) { $type = 'GREATER_BY_KEY'; } |
|
370
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'LESS_BY_COUNT'; } |
|
371
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; } |
|
372
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?/pi) { $type = 'LESS_BY'; } |
|
373
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?/pi) { $type = 'GREATER_BY'; } |
|
374
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\#prune(?:_less|_asc)?(?:_by)?/pi) { $type = 'PRUNE_ASC'; } |
|
375
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#prune(?:_greater|_de?sc)(?:_by)?/pi) { $type = 'PRUNE_DESC'; } |
|
376
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#rand(?:om)?/pi) { $type = 'RANDOM'; } |
|
377
|
28
|
|
|
|
|
69
|
elsif ($$bufr =~ m/\G\#by/pi) { $type = 'BY'; } |
|
378
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G\#samp(?:le)?/pi) { $type = 'SAMPLE'; } |
|
379
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#clim(?:it)?/pi) { $type = 'CLIMIT'; } |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
##-- regexes |
|
382
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
383
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po) { $type='NEG_REGEX'; $text=$1; } |
|
|
0
|
|
|
|
|
0
|
|
|
384
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
385
|
11
|
|
|
|
|
30
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po) { $type='REGEX'; $text=$1; } |
|
|
11
|
|
|
|
|
31
|
|
|
386
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po) { $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; } |
|
|
2
|
|
|
|
|
7
|
|
|
|
2
|
|
|
|
|
5
|
|
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
##-- punctutation & special characters |
|
389
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G#=/p) { $type = 'HASH_EQUAL'; } ##-- hash+equal: exact distance |
|
390
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#
|
|
391
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#>/p) { $type = 'HASH_GREATER'; } ##-- hash+greater: min distance |
|
392
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$\./p) { $type = 'DOLLAR_DOT'; } ##-- positional anchor pseudo-index |
|
393
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\:\{/p) { $type = 'COLON_LBRACE'; } ##-- theusaurus-query operator |
|
394
|
4
|
|
|
|
|
11
|
elsif ($$bufr =~ m/\G\@\{/p) { $type = 'AT_LBRACE'; } ##-- literal-set operator |
|
395
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\{/p) { $type = 'STAR_LBRACE'; } ##-- prefix-set opener |
|
396
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\}\*/p) { $type = 'RBRACE_STAR'; } ##-- suffix-set closer |
|
397
|
78
|
|
|
|
|
234
|
elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p) { $type = ${^MATCH}; } ##-- single-char punctuation operators |
|
398
|
172
|
|
|
|
|
477
|
elsif ($$bufr =~ m/\G[\[\]{}()<>]/p) { $type = ${^MATCH}; } ##-- parentheses |
|
399
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\"/p) { $type = ${^MATCH}; } ##-- double-quotes |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
##-- subcorpus path-lists |
|
402
|
34
|
|
|
|
|
158
|
elsif ($$bufr =~ m/\G\:/p) { $type = ${^MATCH}; $lex->{yyDtrsPos}=$lex->yypos(); $lex->{state}='Q_CORPORA'; } |
|
|
34
|
|
|
|
|
96
|
|
|
|
34
|
|
|
|
|
73
|
|
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
##-- truncated symbols |
|
405
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated quoted string (infix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
406
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated quoted string (prefix symbol) |
|
|
2
|
|
|
|
|
9
|
|
|
407
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated quoted string (suffix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
408
|
|
|
|
|
|
|
|
|
409
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated bareword (infix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
410
|
2
|
|
|
|
|
9
|
elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated bareword (prefix symbol) |
|
|
2
|
|
|
|
|
9
|
|
|
411
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated bareword (suffix symbol) |
|
|
0
|
|
|
|
|
0
|
|
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
##-- index names (special handling to allow count(foo) #by[$w-1] |
|
414
|
24
|
|
|
|
|
71
|
elsif ($$bufr =~ m/\G\$($DEF{index_name})/po) { $type='INDEX'; $text=$1; } |
|
|
24
|
|
|
|
|
66
|
|
|
415
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po) { $type='INDEX'; $text=$1; } |
|
|
0
|
|
|
|
|
0
|
|
|
416
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ /m\G\$/p) { $type='$'; } |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match) |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
##-- single-term wildcard |
|
421
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*/p) { $type = '*'; } |
|
422
|
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
##-- term expander pipelines |
|
424
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\|/p) { $lex->{state}='Q_XPIPE'; $type='__SKIP__'; } |
|
|
4
|
|
|
|
|
5
|
|
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
##-- symbols: quoted |
|
427
|
|
|
|
|
|
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { |
|
428
|
43
|
|
|
|
|
152
|
$text = $1; |
|
429
|
43
|
100
|
|
|
|
222
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
|
10
|
100
|
|
|
|
22
|
|
|
430
|
6
|
|
|
|
|
16
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
|
431
|
27
|
|
|
|
|
66
|
else { $type='SYMBOL'; } |
|
432
|
|
|
|
|
|
|
} |
|
433
|
|
|
|
|
|
|
##-- symbols: barewords |
|
434
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) { |
|
435
|
285
|
|
|
|
|
877
|
$text = ${^MATCH}; |
|
436
|
285
|
|
|
|
|
452
|
$match = ${^MATCH}; |
|
437
|
285
|
100
|
|
|
|
1423
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
|
40
|
100
|
|
|
|
79
|
|
|
438
|
6
|
|
|
|
|
17
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
|
439
|
239
|
|
|
|
|
479
|
else { $type='SYMBOL'; } |
|
440
|
|
|
|
|
|
|
} |
|
441
|
|
|
|
|
|
|
##-- barewords: integers and dates (Q_MATCHID) |
|
442
|
14
|
|
|
|
|
50
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po) { $type='INTEGER'; } |
|
443
|
26
|
|
|
|
|
71
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p) { $type='INTEGER'; } |
|
444
|
|
|
|
|
|
|
|
|
445
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po) { $type='DATE'; } |
|
446
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po) { $type='DATE'; } |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
##-- misc |
|
449
|
300
|
|
|
|
|
763
|
elsif ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } |
|
450
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G./p) { $type = 'SYMBOL'; } |
|
451
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G./p) { $type = '__ERROR__'; } |
|
452
|
|
|
|
|
|
|
|
|
453
|
1213
|
100
|
|
|
|
3462
|
$match = ${^MATCH} if (!defined($match)); |
|
454
|
|
|
|
|
|
|
} |
|
455
|
|
|
|
|
|
|
##------------------------ |
|
456
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_CORPORA') { |
|
457
|
86
|
100
|
|
|
|
1194
|
if ($$bufr =~ m/\G$DEF{corpus_text}/p) { $type='SYMBOL'; } |
|
|
28
|
100
|
|
|
|
74
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
458
|
22
|
|
|
|
|
51
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $type='SYMBOL'; $text=$1; } |
|
|
22
|
|
|
|
|
49
|
|
|
459
|
18
|
|
|
|
|
53
|
elsif ($$bufr =~ m/\G\,/p) { $type=${^MATCH}; } |
|
460
|
8
|
|
|
|
|
22
|
elsif ($$bufr =~ m/\G\s+/p) { $type='__SKIP__'; } |
|
461
|
10
|
|
|
|
|
23
|
else { $type='__SKIP__'; $lex->yypopq(); } |
|
|
10
|
|
|
|
|
32
|
|
|
462
|
|
|
|
|
|
|
|
|
463
|
86
|
|
|
|
|
174
|
$match = ${^MATCH}; |
|
464
|
86
|
100
|
|
|
|
254
|
$lex->{yyDtrsLen} = $lex->yypos()-$lex->{yyDtrsPos} if ($type ne '__SKIP__'); |
|
465
|
|
|
|
|
|
|
} |
|
466
|
|
|
|
|
|
|
##------------------------ |
|
467
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_COMMENT') { |
|
468
|
38
|
100
|
|
|
|
462
|
if ($$bufr =~ m/\G\]/p) { |
|
|
|
50
|
|
|
|
|
|
|
469
|
19
|
|
|
|
|
36
|
$type='__SKIP__'; |
|
470
|
19
|
|
|
|
|
41
|
$lex->{_cmtbuf} .= ${^MATCH}; |
|
471
|
19
|
|
|
|
|
29
|
push(@{$lex->{comments}}, $lex->{_cmtbuf}); |
|
|
19
|
|
|
|
|
60
|
|
|
472
|
19
|
|
|
|
|
56
|
delete $lex->{_cmtbuf}; |
|
473
|
19
|
|
|
|
|
60
|
$lex->yypopq(); |
|
474
|
|
|
|
|
|
|
} |
|
475
|
19
|
|
|
|
|
50
|
elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; } |
|
|
19
|
|
|
|
|
54
|
|
|
476
|
0
|
|
|
|
|
0
|
else { $type='__ERROR__'; } |
|
477
|
|
|
|
|
|
|
|
|
478
|
38
|
|
|
|
|
96
|
$match = ${^MATCH}; |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
##------------------------ |
|
481
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_DATE') { |
|
482
|
2
|
50
|
|
|
|
19
|
if ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; } |
|
|
0
|
50
|
|
|
|
0
|
|
|
483
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/^\//) { $type = '/'; $lex->{state}='INITIAL'; } |
|
|
0
|
|
|
|
|
0
|
|
|
484
|
2
|
|
|
|
|
6
|
else { $lex->{state}='INITIAL'; $type='__SKIP__'; } |
|
|
2
|
|
|
|
|
5
|
|
|
485
|
|
|
|
|
|
|
|
|
486
|
2
|
|
|
|
|
5
|
$match = ${^MATCH}; |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
##------------------------ |
|
489
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGREP') { |
|
490
|
2
|
50
|
|
|
|
158
|
if ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
|
0
|
50
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
491
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G($DEF{regex_text})\//po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL'; } |
|
|
2
|
|
|
|
|
8
|
|
|
|
2
|
|
|
|
|
7
|
|
|
492
|
|
|
|
|
|
|
|
|
493
|
2
|
|
|
|
|
7
|
$match = ${^MATCH}; |
|
494
|
|
|
|
|
|
|
} |
|
495
|
|
|
|
|
|
|
##------------------------ |
|
496
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGOPT') { |
|
497
|
0
|
0
|
|
|
|
0
|
if ($$bufr =~ m/\G$DEF{regex_modifier}+/po) { $type='REGOPT'; } |
|
|
0
|
|
|
|
|
0
|
|
|
498
|
0
|
|
|
|
|
0
|
else { $type='__SKIP__'; } |
|
499
|
0
|
|
|
|
|
0
|
$lex->{state} = 'INITIAL'; |
|
500
|
|
|
|
|
|
|
|
|
501
|
0
|
|
|
|
|
0
|
$match = ${^MATCH}; |
|
502
|
|
|
|
|
|
|
} |
|
503
|
|
|
|
|
|
|
##------------------------ |
|
504
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_XPIPE') { |
|
505
|
4
|
50
|
|
|
|
248
|
if ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } ##-- whitespace: skip |
|
|
0
|
50
|
|
|
|
0
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
506
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\-/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
|
0
|
|
|
|
|
0
|
|
|
507
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
508
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
|
4
|
|
|
|
|
7
|
|
|
509
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G\z/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
510
|
0
|
|
|
|
|
0
|
else { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; } |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
511
|
|
|
|
|
|
|
|
|
512
|
4
|
|
|
|
|
12
|
$match = ${^MATCH}; |
|
513
|
|
|
|
|
|
|
} |
|
514
|
|
|
|
|
|
|
##------------------------ |
|
515
|
|
|
|
|
|
|
## END perl-ification of flex sources |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
##-- guts |
|
518
|
1345
|
100
|
|
|
|
2662
|
$text = $match if (!defined($text)); |
|
519
|
1345
|
100
|
|
|
|
3572
|
$lex->{bufp} += length($match) if (defined($match)); |
|
520
|
|
|
|
|
|
|
|
|
521
|
1345
|
|
|
|
|
3111
|
pos($$bufr) = $lex->{bufp}; |
|
522
|
1345
|
50
|
|
|
|
3352
|
return if (!defined($type)); |
|
523
|
|
|
|
|
|
|
|
|
524
|
1345
|
100
|
|
|
|
2719
|
next LEXSKIP if ($type eq '__SKIP__'); |
|
525
|
942
|
100
|
|
|
|
1704
|
next LEXBUF if ($type eq '__EOF__'); |
|
526
|
|
|
|
|
|
|
#elsif ($type eq '__ERROR__') { |
|
527
|
|
|
|
|
|
|
# return $lex->yyerror(); |
|
528
|
|
|
|
|
|
|
#} |
|
529
|
916
|
|
|
|
|
5159
|
return @$lex{qw(yytype yytext)} = ($type,$text); |
|
530
|
|
|
|
|
|
|
} |
|
531
|
|
|
|
|
|
|
} |
|
532
|
209
|
|
|
|
|
861
|
return @$lex{qw(yytype yytext)} = ('__EOF__',undef); |
|
533
|
|
|
|
|
|
|
} |
|
534
|
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
##====================================================================== |
|
536
|
|
|
|
|
|
|
## Testing: dummy lexing |
|
537
|
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
## undef = $lex->dummylex(@from_whatever) |
|
539
|
|
|
|
|
|
|
sub dummylex { |
|
540
|
0
|
|
|
0
|
0
|
|
my $lex = shift; |
|
541
|
0
|
|
|
|
|
|
$lex->reset(); |
|
542
|
0
|
|
|
|
|
|
$lex->from(@_); |
|
543
|
0
|
|
|
|
|
|
my ($type,$text); |
|
544
|
|
|
|
|
|
|
TOKEN: |
|
545
|
0
|
|
|
|
|
|
while(1) { |
|
546
|
0
|
|
|
|
|
|
($type,$text) = $lex->yylex(); |
|
547
|
|
|
|
|
|
|
print("-" x 64, "\n", |
|
548
|
|
|
|
|
|
|
">> Line: ", $lex->yylineno, ", Pos: ", $lex->yypos, "\n", |
|
549
|
0
|
0
|
|
|
|
|
">> State: ", (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n", |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
">> Type: ", (defined($type) ? $type : '(undef)'), "\n", |
|
551
|
|
|
|
|
|
|
">> Text: ", (defined($text) ? $text : '(undef)'), "\n", |
|
552
|
|
|
|
|
|
|
); |
|
553
|
0
|
0
|
|
|
|
|
if (!defined($type)) { |
|
554
|
0
|
|
|
|
|
|
warn(":: undef type!"); |
|
555
|
0
|
|
|
|
|
|
return; |
|
556
|
|
|
|
|
|
|
} |
|
557
|
0
|
0
|
|
|
|
|
if ($type eq '__ERROR__') { |
|
558
|
0
|
|
|
|
|
|
print(":: ERROR DETECTED\n"); |
|
559
|
0
|
|
|
|
|
|
$lex->yyerror(); |
|
560
|
|
|
|
|
|
|
} |
|
561
|
0
|
0
|
|
|
|
|
last if ($type eq '__EOF__'); |
|
562
|
|
|
|
|
|
|
} |
|
563
|
|
|
|
|
|
|
} |
|
564
|
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
1; ##-- be happy |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
__END__ |