line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
##-*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
## File: DDC::::yyqlexer.pm |
4
|
|
|
|
|
|
|
## Author: Bryan Jurish |
5
|
|
|
|
|
|
|
## Description: |
6
|
|
|
|
|
|
|
## + lexer for ddc queries (formerly DDC::Query::yylexer) |
7
|
|
|
|
|
|
|
## + last updated for ddc v2.1.15 |
8
|
|
|
|
|
|
|
##====================================================================== |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
package DDC::PP::yyqlexer; |
11
|
20
|
|
|
20
|
|
509
|
use 5.010; ##-- we need at least v5.10.0 for /p regex modifier |
|
20
|
|
|
|
|
74
|
|
12
|
20
|
|
|
20
|
|
119
|
use Encode qw(encode_utf8 decode_utf8); |
|
20
|
|
|
|
|
66
|
|
|
20
|
|
|
|
|
1139
|
|
13
|
20
|
|
|
20
|
|
122
|
use Carp; |
|
20
|
|
|
|
|
61
|
|
|
20
|
|
|
|
|
1193
|
|
14
|
20
|
|
|
20
|
|
161
|
use IO::File; |
|
20
|
|
|
|
|
90
|
|
|
20
|
|
|
|
|
2971
|
|
15
|
20
|
|
|
20
|
|
178
|
use IO::Handle; |
|
20
|
|
|
|
|
39
|
|
|
20
|
|
|
|
|
753
|
|
16
|
20
|
|
|
20
|
|
114
|
use strict; |
|
20
|
|
|
|
|
58
|
|
|
20
|
|
|
|
|
7590
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##====================================================================== |
19
|
|
|
|
|
|
|
## Globals etc. |
20
|
|
|
|
|
|
|
our @ISA = qw(); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
23
|
|
|
|
|
|
|
## Globals: regexes for Parse::Lex lexer token regexes |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
## %DEF |
26
|
|
|
|
|
|
|
## + common shared regex definitions |
27
|
|
|
|
|
|
|
our (%DEF); |
28
|
|
|
|
|
|
|
BEGIN { |
29
|
|
|
|
|
|
|
##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\') |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
##-- whitespace |
32
|
20
|
|
|
20
|
|
109
|
$DEF{ws} = '[ \t\n\r\f\x0b]'; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
##-- integer boundary characters |
35
|
20
|
|
|
|
|
43
|
$DEF{int_boundary} = '[ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]'; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
##-- bareword symbols |
38
|
20
|
|
|
|
|
42
|
$DEF{symbol_cfirst} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]'; |
39
|
20
|
|
|
|
|
66
|
$DEF{symbol_crest} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]'; |
40
|
20
|
|
|
|
|
36
|
$DEF{symbol_cescape} = '(?:\\\\.)'; |
41
|
20
|
|
|
|
|
140
|
$DEF{symbol_text} = "(?:$DEF{symbol_cescape}|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}|$DEF{symbol_crest})*"; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here) |
44
|
20
|
|
|
|
|
73
|
$DEF{corpus_cfirst} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"$@]'; |
45
|
20
|
|
|
|
|
55
|
$DEF{corpus_crest} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"]'; |
46
|
20
|
|
|
|
|
83
|
$DEF{corpus_text} = "(?:$DEF{symbol_cescape}|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}|$DEF{corpus_crest})*"; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
##-- bareword index names (underscore and digits ok, but no '.', '-', or '+') |
49
|
20
|
|
|
|
|
42
|
$DEF{index_char} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]'; |
50
|
20
|
|
|
|
|
73
|
$DEF{index_name} = "(?:$DEF{index_char}|$DEF{symbol_cescape})+"; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
##-- single-quoted symbols |
53
|
20
|
|
|
|
|
51
|
$DEF{sq_text} = "(?:[^\']|$DEF{symbol_cescape})*"; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]}) |
56
|
20
|
|
|
|
|
44
|
$DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}"; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
##-- regexes |
59
|
20
|
|
|
|
|
47
|
$DEF{regex_text} = "(?:(?:\\\\.)|[^\\\\/])*"; |
60
|
20
|
|
|
|
|
49
|
$DEF{regex_modifier} = '[dgimsx]'; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
##-- comments |
63
|
20
|
|
|
|
|
79
|
$DEF{comment_text} = "(?:\\\\.|[^\\]])*"; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
##-- compile patterns |
66
|
20
|
|
|
|
|
116
|
foreach (keys %DEF) { |
67
|
|
|
|
|
|
|
#print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n"; |
68
|
320
|
|
|
|
|
11879
|
$DEF{$_} = qr/$DEF{$_}/; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
##====================================================================== |
73
|
|
|
|
|
|
|
## $lex = $CLASS_OR_OBJ->new(%args) |
74
|
|
|
|
|
|
|
## + abstract constructor |
75
|
|
|
|
|
|
|
## + %$lex, %args: |
76
|
|
|
|
|
|
|
## { |
77
|
|
|
|
|
|
|
## src => $name, ##-- source name |
78
|
|
|
|
|
|
|
## fh => $srcfh, ##-- source filehandle |
79
|
|
|
|
|
|
|
## bufr => \$buf, ##-- source buffer (string reference) |
80
|
|
|
|
|
|
|
## bufp => $pos, ##-- current pos() in source buffer |
81
|
|
|
|
|
|
|
## buf => $buf, ##-- local buffer (for filehandle input) |
82
|
|
|
|
|
|
|
## state => $q, ##-- symbolic state name (default: 'INITIAL') |
83
|
|
|
|
|
|
|
## stack => \@stack, ##-- state stack |
84
|
|
|
|
|
|
|
## |
85
|
|
|
|
|
|
|
## ##-- utf-8 or byte mode? |
86
|
|
|
|
|
|
|
## utf8 => $bool, ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy)) |
87
|
|
|
|
|
|
|
## |
88
|
|
|
|
|
|
|
## ##-- runtime data |
89
|
|
|
|
|
|
|
## yytext => $text, ##-- current text |
90
|
|
|
|
|
|
|
## yytype => $type, ##-- current token type |
91
|
|
|
|
|
|
|
## yylineno => $line, ##-- current source line (file input only) |
92
|
|
|
|
|
|
|
## } |
93
|
|
|
|
|
|
|
sub new { |
94
|
15
|
|
|
15
|
0
|
44
|
my $that = shift; |
95
|
15
|
|
33
|
|
|
244
|
my $lex = bless({ |
96
|
|
|
|
|
|
|
src =>undef, |
97
|
|
|
|
|
|
|
fh =>undef, |
98
|
|
|
|
|
|
|
bufr =>undef, |
99
|
|
|
|
|
|
|
bufp =>undef, |
100
|
|
|
|
|
|
|
buf =>undef, |
101
|
|
|
|
|
|
|
utf8 =>1, |
102
|
|
|
|
|
|
|
state => 'INITIAL', |
103
|
|
|
|
|
|
|
stack => [], |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
yytext=>undef, |
106
|
|
|
|
|
|
|
yytype=>undef, |
107
|
|
|
|
|
|
|
yylineno=>undef, |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
##-- lexer comment retention hacks |
110
|
|
|
|
|
|
|
comments=>[], |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
##-- user args |
113
|
|
|
|
|
|
|
@_ |
114
|
|
|
|
|
|
|
}, |
115
|
|
|
|
|
|
|
ref($that)||$that |
116
|
|
|
|
|
|
|
); |
117
|
15
|
|
|
|
|
129
|
return $lex; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## $lex = $lex->clear() |
121
|
|
|
|
|
|
|
## + clear lexer buffer, source, etc |
122
|
|
|
|
|
|
|
sub clear { |
123
|
422
|
|
|
422
|
0
|
615
|
my $lex = shift; |
124
|
422
|
|
|
|
|
1393
|
delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)}; |
125
|
422
|
|
|
|
|
762
|
$lex->{state} = 'INITIAL'; |
126
|
422
|
|
|
|
|
603
|
@{$lex->{stack}} = qw(); |
|
422
|
|
|
|
|
827
|
|
127
|
422
|
|
|
|
|
643
|
@{$lex->{comments}} = qw(); |
|
422
|
|
|
|
|
661
|
|
128
|
422
|
|
|
|
|
660
|
$lex->{yyDtrsPos} = 0; |
129
|
422
|
|
|
|
|
601
|
$lex->{yyDtrsLen} = 0; |
130
|
422
|
|
|
|
|
710
|
delete $lex->{_cmtbuf}; |
131
|
422
|
|
|
|
|
694
|
return $lex; |
132
|
|
|
|
|
|
|
} |
133
|
20
|
|
|
20
|
|
96156
|
BEGIN { *reset = *close = \&clear; } |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
##====================================================================== |
136
|
|
|
|
|
|
|
## I/O |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
## $lex = $lex->from($which,$src, %opts) |
139
|
|
|
|
|
|
|
## + $which is one of qw(fh file string) |
140
|
|
|
|
|
|
|
## + $src is the actual source (default: 'string') |
141
|
|
|
|
|
|
|
sub from { |
142
|
211
|
|
|
211
|
0
|
496
|
my ($lex,$which,$src,%opts) = @_; |
143
|
211
|
50
|
|
|
|
514
|
return $lex->fromFh($src,%opts) if ($which eq 'fh'); |
144
|
211
|
50
|
|
|
|
431
|
return $lex->fromFile($src,%opts) if ($which eq 'file'); |
145
|
211
|
|
|
|
|
585
|
return $lex->fromString($src,%opts); |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
## $lex = $lex->fromFile($filename_or_handle,%opts) |
149
|
|
|
|
|
|
|
sub fromFile { |
150
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$file,%opts) = @_; |
151
|
0
|
0
|
|
|
|
0
|
return $lex->fromFh($file,%opts) if (ref($file)); |
152
|
0
|
0
|
|
|
|
0
|
my $fh = IO::File->new("<$file") |
153
|
|
|
|
|
|
|
or confess("cannot open '$file' for read: $!"); |
154
|
0
|
0
|
|
|
|
0
|
binmode($fh,':encoding(utf8)') if ($lex->{utf8}); |
155
|
0
|
|
|
|
|
0
|
return $lex->fromFh($fh,src=>"file \`$file'",%opts); |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
## $lex = $lex->fromFh($fh,%opts) |
161
|
|
|
|
|
|
|
## + uses native $fh encoding |
162
|
|
|
|
|
|
|
sub fromFh { |
163
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$fh,%opts) = @_; |
164
|
0
|
0
|
|
|
|
0
|
if ($FH_SLURP) { |
165
|
|
|
|
|
|
|
##-- always use string mode |
166
|
0
|
|
|
|
|
0
|
local $/=undef; |
167
|
0
|
|
|
|
|
0
|
my $buf = $fh->getline; |
168
|
0
|
|
|
|
|
0
|
$fh->close(); |
169
|
0
|
|
|
|
|
0
|
return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts); |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
##-- line-wise buffering |
172
|
0
|
|
|
|
|
0
|
$lex->clear(); |
173
|
0
|
|
|
|
|
0
|
@$lex{keys %opts} = values(%opts); |
174
|
0
|
|
|
|
|
0
|
$lex->{fh} = $fh; |
175
|
0
|
|
|
|
|
0
|
$lex->{buf} = undef; |
176
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$lex->{buf}; |
177
|
0
|
|
|
|
|
0
|
$lex->{bufp} = 0; |
178
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src})); |
179
|
0
|
|
|
|
|
0
|
return $lex; |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
## $lex = $lex->fromString($str,%opts) |
183
|
|
|
|
|
|
|
## $lex = $lex->fromString(\$str,%opts) |
184
|
|
|
|
|
|
|
sub fromString { |
185
|
211
|
|
|
211
|
0
|
419
|
my ($lex,$str,%opts) = @_; |
186
|
211
|
|
|
|
|
487
|
$lex->clear(); |
187
|
211
|
50
|
|
|
|
523
|
if (ref($str)) { |
188
|
211
|
|
|
|
|
397
|
$lex->{bufr} = $str; |
189
|
211
|
50
|
|
|
|
965
|
$lex->{src} = "buffer \`$str'" if (!defined($lex->{src})); |
190
|
|
|
|
|
|
|
} else { |
191
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$str; |
192
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "string \`$str'" if (!defined($lex->{src})); |
193
|
|
|
|
|
|
|
} |
194
|
211
|
|
|
|
|
396
|
$lex->{bufp} = 0; |
195
|
211
|
|
|
|
|
379
|
$lex->{yylineno} = 0; |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
##-- utf8 checks |
198
|
211
|
50
|
33
|
|
|
567
|
if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) { |
|
211
|
0
|
0
|
|
|
866
|
|
199
|
|
|
|
|
|
|
##-- lexer:utf8, string:bytes --> assume string is utf8-encoded |
200
|
211
|
50
|
|
|
|
336
|
${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}})); |
|
211
|
|
|
|
|
5157
|
|
|
211
|
|
|
|
|
708
|
|
|
211
|
|
|
|
|
581
|
|
201
|
|
|
|
|
|
|
} |
202
|
0
|
|
|
|
|
0
|
elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) { |
203
|
|
|
|
|
|
|
##-- lexer:bytes, string:utf8 --> encode as utf8 octets |
204
|
0
|
|
|
|
|
0
|
${$lex->{bufr}} = encode_utf8(${$lex->{bufr}}); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
211
|
|
|
|
|
804
|
return $lex; |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
##====================================================================== |
211
|
|
|
|
|
|
|
## Utilities |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
## $bool = $lex->eof() |
214
|
|
|
|
|
|
|
## + true iff at end-of-file |
215
|
|
|
|
|
|
|
sub eof { |
216
|
1151
|
50
|
|
1151
|
0
|
2877
|
return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore(); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
## $bool = $lex->eob() |
220
|
|
|
|
|
|
|
## + true at end-of-buffer |
221
|
|
|
|
|
|
|
sub eob { |
222
|
1151
|
|
66
|
1151
|
0
|
2746
|
return (!$_[0]{bufr} || !${$_[0]{bufr}} || ($_[0]{bufp}||0) >= length(${$_[0]{bufr}})); |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
## $bufr_or_undef = $lex->getmore() |
226
|
|
|
|
|
|
|
## + returns true iff there is still data in the buffer |
227
|
|
|
|
|
|
|
sub getmore { |
228
|
1151
|
100
|
|
1151
|
0
|
1976
|
return $_[0]{bufr} if (!$_[0]->eob()); |
229
|
209
|
50
|
|
|
|
655
|
if (defined($_[0]{fh})) { |
230
|
0
|
|
|
|
|
0
|
$_[0]{bufp} = 0; |
231
|
0
|
|
|
|
|
0
|
$_[0]{buf} = $_[0]{fh}->getline; |
232
|
0
|
|
|
|
|
0
|
$_[0]{bufr} = \$_[0]{buf}; |
233
|
0
|
|
|
|
|
0
|
$_[0]{yylineno} = $_[0]{fh}->input_line_number; |
234
|
0
|
0
|
|
|
|
0
|
return defined($_[0]{buf}) ? $_[0]{bufr} : undef; |
235
|
|
|
|
|
|
|
} |
236
|
209
|
|
|
|
|
576
|
return undef; |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
##====================================================================== |
240
|
|
|
|
|
|
|
## Runtime lexer accessors |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
## $yytext = $lex->yytext |
243
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytext} is faster |
244
|
2
|
50
|
|
2
|
0
|
42
|
sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; } |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
## $yytype = $lex->yytype |
247
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytype} is faster |
248
|
2
|
50
|
|
2
|
0
|
22
|
sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; } |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
## $line = $lex->yylineno() |
251
|
|
|
|
|
|
|
## + returns current line number |
252
|
|
|
|
|
|
|
sub yylineno { |
253
|
2
|
|
|
2
|
0
|
23
|
return $_[0]{yylineno}; |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
## $pos = $lex->yycolumn() |
257
|
|
|
|
|
|
|
## + return column at which current token starts (if any) |
258
|
|
|
|
|
|
|
sub yycolumn { |
259
|
0
|
0
|
0
|
0
|
0
|
0
|
return ($_[0]{bufp}||0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0); |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
## $pos = $lex->yypos() |
263
|
|
|
|
|
|
|
## + return byte position in current line (or input string) |
264
|
|
|
|
|
|
|
sub yypos { |
265
|
102
|
|
50
|
102
|
0
|
356
|
return ($_[0]{bufp}||0); |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
## $string = $lex->yyerror(@msg) |
269
|
|
|
|
|
|
|
## + create a helpful error message |
270
|
|
|
|
|
|
|
sub yyerror { |
271
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
272
|
0
|
|
|
|
|
0
|
confess(ref($lex).": error in ".$lex->yywhere().join('',@_)); |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
## $string = $lex->yywhere() |
276
|
|
|
|
|
|
|
## + location string used by yyerror() |
277
|
|
|
|
|
|
|
sub yywhere { |
278
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
279
|
|
|
|
|
|
|
return ("$lex->{src} at " |
280
|
|
|
|
|
|
|
.(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '') |
281
|
|
|
|
|
|
|
."column ".$lex->yycolumn |
282
|
0
|
0
|
|
|
|
0
|
.", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__') |
|
|
0
|
|
|
|
|
|
283
|
|
|
|
|
|
|
); |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
## $q = $lex->yypushq($new_state) |
287
|
|
|
|
|
|
|
sub yypushq { |
288
|
19
|
|
|
19
|
0
|
49
|
my ($lex,$qnew) = @_; |
289
|
19
|
|
|
|
|
56
|
push(@{$lex->{stack}},$lex->{state}); |
|
19
|
|
|
|
|
59
|
|
290
|
19
|
|
|
|
|
45
|
return $lex->{state} = $qnew; |
291
|
|
|
|
|
|
|
} |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
## $q = $lex->yypopq() |
294
|
|
|
|
|
|
|
sub yypopq { |
295
|
29
|
|
|
29
|
0
|
55
|
my $lex = shift; |
296
|
29
|
|
100
|
|
|
49
|
return $lex->{state} = pop(@{$lex->{stack}}) || 'INITIAL'; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
##====================================================================== |
300
|
|
|
|
|
|
|
## Runtime lexer routines |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
## ($typ,$text) = $lex->yylex() |
303
|
|
|
|
|
|
|
## + get next token from input stream |
304
|
|
|
|
|
|
|
sub yylex { |
305
|
1125
|
|
|
1125
|
0
|
1736
|
my $lex = shift; |
306
|
1125
|
|
|
|
|
1804
|
my ($bufr,$type,$text,$match,@part); |
307
|
|
|
|
|
|
|
#use re 'eval'; ##-- dangerous! |
308
|
|
|
|
|
|
|
LEXBUF: |
309
|
1125
|
|
|
|
|
2404
|
while (!$lex->eof()) { |
310
|
942
|
|
|
|
|
1679
|
$bufr = $lex->{bufr}; |
311
|
942
|
|
|
|
|
2334
|
pos($$bufr) = $lex->{bufp}; |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
LEXSKIP: |
314
|
|
|
|
|
|
|
##------------------------------------ |
315
|
|
|
|
|
|
|
## LEXSKIP: main lexer loop |
316
|
942
|
|
|
|
|
1618
|
while (1) { |
317
|
1345
|
|
|
|
|
2312
|
$type = $text = $match = undef; |
318
|
1345
|
|
|
|
|
2114
|
@part = qw(); |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
##------------------------ |
321
|
1345
|
100
|
100
|
|
|
3844
|
if ($lex->{state} eq 'INITIAL' || $lex->{state} eq 'Q_MATCHID') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
##-- end-of-file (should be first pattern) |
324
|
1213
|
100
|
100
|
|
|
58883
|
if ($$bufr =~ m/\G\z/) { $type = '__EOF__'; } |
|
26
|
100
|
100
|
|
|
42
|
|
|
|
100
|
66
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
##-- comments |
327
|
22
|
|
|
|
|
77
|
elsif ($$bufr =~ m/\G\#:[^\n]*/sp) { $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); } |
|
22
|
|
|
|
|
38
|
|
|
22
|
|
|
|
|
97
|
|
328
|
19
|
|
|
|
|
39
|
elsif ($$bufr =~ m/\G\#\[/p) { $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); } |
|
19
|
|
|
|
|
59
|
|
|
19
|
|
|
|
|
64
|
|
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
##-- operators |
331
|
21
|
|
|
|
|
39
|
elsif ($$bufr =~ m/\G\&\&/p) { $type = 'OP_BOOL_AND'; } |
332
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\|\|/p) { $type = 'OP_BOOL_OR'; } |
333
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gnear/pi) { $type = 'NEAR'; } |
334
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\!=|\&\!=|\&=\s*\!|\!with|with(?:out|\s*\!))/pi) { $type = 'WITHOUT'; } |
335
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\|=|withor|orwith|wor)/pi) { $type = 'WITHOR'; } |
336
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\G(?:\&=|with)/pi) { $type = 'WITH'; } |
337
|
28
|
|
|
|
|
66
|
elsif ($$bufr =~ m/\Gcount/pi) { $type = 'COUNT'; } |
338
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gkeys/pi) { $type = 'KEYS'; } |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
##-- count-by keywords |
341
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G(?:file|doc)_?id/pi) { $type = 'KW_FILEID'; } |
342
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:file|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; } |
343
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' } |
|
2
|
|
|
|
|
6
|
|
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
##-- query operators |
346
|
17
|
|
|
|
|
40
|
elsif ($$bufr =~ m/\G\#(?:comment|cmt)/pi) { $type = 'KW_COMMENT'; } |
347
|
2
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?|n))/pi) { $type = 'CNTXT'; } |
348
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G\#(?:with)?in/pi) { $type = 'WITHIN'; } |
349
|
3
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?|nojoin)(?:_hits)?/pi) { $type = 'SEPARATE_HITS'; } |
350
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?|join)(?:_hits)?/pi) { $type = 'NOSEPARATE_HITS'; } |
351
|
4
|
|
|
|
|
9
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?date/pi) { $type = 'IS_DATE'; } |
352
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\G\#has(?:_field)?/pi) { $type = 'HAS_FIELD'; } |
353
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#file(?:_?)names/pi) { $type = 'FILENAMES_ONLY'; } |
354
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#debug_rank/pi) { $type = 'DEBUG_RANK'; } |
355
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_rank/pi) { $type = 'GREATER_BY_RANK'; } |
356
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_rank/pi) { $type = 'LESS_BY_RANK'; } |
357
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_date/pi) { $type = 'GREATER_BY_DATE'; } |
358
|
10
|
|
|
|
|
26
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_date/pi) { $type = 'LESS_BY_DATE'; } |
359
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_size/pi) { $type = 'GREATER_BY_SIZE'; } |
360
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_size/pi) { $type = 'LESS_BY_SIZE'; } |
361
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?size/pi) { $type = 'IS_SIZE'; } |
362
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?left/pi) { $type = 'LESS_BY_LEFT'; } |
363
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)left/pi) { $type = 'GREATER_BY_LEFT'; } |
364
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?right/pi) { $type = 'LESS_BY_RIGHT'; } |
365
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)right/pi) { $type = 'GREATER_BY_RIGHT'; } |
366
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?mid(?:dle)?/pi) { $type = 'LESS_BY_MIDDLE'; } |
367
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)mid(?:dle)?/pi) { $type = 'GREATER_BY_MIDDLE'; } |
368
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_key)/pi) { $type = 'LESS_BY_KEY'; } |
369
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_key)/pi) { $type = 'GREATER_BY_KEY'; } |
370
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'LESS_BY_COUNT'; } |
371
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; } |
372
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?/pi) { $type = 'LESS_BY'; } |
373
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?/pi) { $type = 'GREATER_BY'; } |
374
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\#prune(?:_less|_asc)?(?:_by)?/pi) { $type = 'PRUNE_ASC'; } |
375
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#prune(?:_greater|_de?sc)(?:_by)?/pi) { $type = 'PRUNE_DESC'; } |
376
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#rand(?:om)?/pi) { $type = 'RANDOM'; } |
377
|
28
|
|
|
|
|
69
|
elsif ($$bufr =~ m/\G\#by/pi) { $type = 'BY'; } |
378
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G\#samp(?:le)?/pi) { $type = 'SAMPLE'; } |
379
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#clim(?:it)?/pi) { $type = 'CLIMIT'; } |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
##-- regexes |
382
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
383
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po) { $type='NEG_REGEX'; $text=$1; } |
|
0
|
|
|
|
|
0
|
|
384
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
385
|
11
|
|
|
|
|
30
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po) { $type='REGEX'; $text=$1; } |
|
11
|
|
|
|
|
31
|
|
386
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po) { $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; } |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
5
|
|
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
##-- punctutation & special characters |
389
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G#=/p) { $type = 'HASH_EQUAL'; } ##-- hash+equal: exact distance |
390
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#
|
391
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#>/p) { $type = 'HASH_GREATER'; } ##-- hash+greater: min distance |
392
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$\./p) { $type = 'DOLLAR_DOT'; } ##-- positional anchor pseudo-index |
393
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\:\{/p) { $type = 'COLON_LBRACE'; } ##-- theusaurus-query operator |
394
|
4
|
|
|
|
|
11
|
elsif ($$bufr =~ m/\G\@\{/p) { $type = 'AT_LBRACE'; } ##-- literal-set operator |
395
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\{/p) { $type = 'STAR_LBRACE'; } ##-- prefix-set opener |
396
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\}\*/p) { $type = 'RBRACE_STAR'; } ##-- suffix-set closer |
397
|
78
|
|
|
|
|
234
|
elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p) { $type = ${^MATCH}; } ##-- single-char punctuation operators |
398
|
172
|
|
|
|
|
477
|
elsif ($$bufr =~ m/\G[\[\]{}()<>]/p) { $type = ${^MATCH}; } ##-- parentheses |
399
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\"/p) { $type = ${^MATCH}; } ##-- double-quotes |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
##-- subcorpus path-lists |
402
|
34
|
|
|
|
|
158
|
elsif ($$bufr =~ m/\G\:/p) { $type = ${^MATCH}; $lex->{yyDtrsPos}=$lex->yypos(); $lex->{state}='Q_CORPORA'; } |
|
34
|
|
|
|
|
96
|
|
|
34
|
|
|
|
|
73
|
|
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
##-- truncated symbols |
405
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated quoted string (infix symbol) |
|
0
|
|
|
|
|
0
|
|
406
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated quoted string (prefix symbol) |
|
2
|
|
|
|
|
9
|
|
407
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated quoted string (suffix symbol) |
|
0
|
|
|
|
|
0
|
|
408
|
|
|
|
|
|
|
|
409
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated bareword (infix symbol) |
|
0
|
|
|
|
|
0
|
|
410
|
2
|
|
|
|
|
9
|
elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated bareword (prefix symbol) |
|
2
|
|
|
|
|
9
|
|
411
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated bareword (suffix symbol) |
|
0
|
|
|
|
|
0
|
|
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
##-- index names (special handling to allow count(foo) #by[$w-1] |
414
|
24
|
|
|
|
|
71
|
elsif ($$bufr =~ m/\G\$($DEF{index_name})/po) { $type='INDEX'; $text=$1; } |
|
24
|
|
|
|
|
66
|
|
415
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po) { $type='INDEX'; $text=$1; } |
|
0
|
|
|
|
|
0
|
|
416
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ /m\G\$/p) { $type='$'; } |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match) |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
##-- single-term wildcard |
421
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*/p) { $type = '*'; } |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
##-- term expander pipelines |
424
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\|/p) { $lex->{state}='Q_XPIPE'; $type='__SKIP__'; } |
|
4
|
|
|
|
|
5
|
|
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
##-- symbols: quoted |
427
|
|
|
|
|
|
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { |
428
|
43
|
|
|
|
|
152
|
$text = $1; |
429
|
43
|
100
|
|
|
|
222
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
10
|
100
|
|
|
|
22
|
|
430
|
6
|
|
|
|
|
16
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
431
|
27
|
|
|
|
|
66
|
else { $type='SYMBOL'; } |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
##-- symbols: barewords |
434
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) { |
435
|
285
|
|
|
|
|
877
|
$text = ${^MATCH}; |
436
|
285
|
|
|
|
|
452
|
$match = ${^MATCH}; |
437
|
285
|
100
|
|
|
|
1423
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
40
|
100
|
|
|
|
79
|
|
438
|
6
|
|
|
|
|
17
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
439
|
239
|
|
|
|
|
479
|
else { $type='SYMBOL'; } |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
##-- barewords: integers and dates (Q_MATCHID) |
442
|
14
|
|
|
|
|
50
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po) { $type='INTEGER'; } |
443
|
26
|
|
|
|
|
71
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p) { $type='INTEGER'; } |
444
|
|
|
|
|
|
|
|
445
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po) { $type='DATE'; } |
446
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po) { $type='DATE'; } |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
##-- misc |
449
|
300
|
|
|
|
|
763
|
elsif ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } |
450
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G./p) { $type = 'SYMBOL'; } |
451
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G./p) { $type = '__ERROR__'; } |
452
|
|
|
|
|
|
|
|
453
|
1213
|
100
|
|
|
|
3462
|
$match = ${^MATCH} if (!defined($match)); |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
##------------------------ |
456
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_CORPORA') { |
457
|
86
|
100
|
|
|
|
1194
|
if ($$bufr =~ m/\G$DEF{corpus_text}/p) { $type='SYMBOL'; } |
|
28
|
100
|
|
|
|
74
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
458
|
22
|
|
|
|
|
51
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $type='SYMBOL'; $text=$1; } |
|
22
|
|
|
|
|
49
|
|
459
|
18
|
|
|
|
|
53
|
elsif ($$bufr =~ m/\G\,/p) { $type=${^MATCH}; } |
460
|
8
|
|
|
|
|
22
|
elsif ($$bufr =~ m/\G\s+/p) { $type='__SKIP__'; } |
461
|
10
|
|
|
|
|
23
|
else { $type='__SKIP__'; $lex->yypopq(); } |
|
10
|
|
|
|
|
32
|
|
462
|
|
|
|
|
|
|
|
463
|
86
|
|
|
|
|
174
|
$match = ${^MATCH}; |
464
|
86
|
100
|
|
|
|
254
|
$lex->{yyDtrsLen} = $lex->yypos()-$lex->{yyDtrsPos} if ($type ne '__SKIP__'); |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
##------------------------ |
467
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_COMMENT') { |
468
|
38
|
100
|
|
|
|
462
|
if ($$bufr =~ m/\G\]/p) { |
|
|
50
|
|
|
|
|
|
469
|
19
|
|
|
|
|
36
|
$type='__SKIP__'; |
470
|
19
|
|
|
|
|
41
|
$lex->{_cmtbuf} .= ${^MATCH}; |
471
|
19
|
|
|
|
|
29
|
push(@{$lex->{comments}}, $lex->{_cmtbuf}); |
|
19
|
|
|
|
|
60
|
|
472
|
19
|
|
|
|
|
56
|
delete $lex->{_cmtbuf}; |
473
|
19
|
|
|
|
|
60
|
$lex->yypopq(); |
474
|
|
|
|
|
|
|
} |
475
|
19
|
|
|
|
|
50
|
elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; } |
|
19
|
|
|
|
|
54
|
|
476
|
0
|
|
|
|
|
0
|
else { $type='__ERROR__'; } |
477
|
|
|
|
|
|
|
|
478
|
38
|
|
|
|
|
96
|
$match = ${^MATCH}; |
479
|
|
|
|
|
|
|
} |
480
|
|
|
|
|
|
|
##------------------------ |
481
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_DATE') { |
482
|
2
|
50
|
|
|
|
19
|
if ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; } |
|
0
|
50
|
|
|
|
0
|
|
483
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/^\//) { $type = '/'; $lex->{state}='INITIAL'; } |
|
0
|
|
|
|
|
0
|
|
484
|
2
|
|
|
|
|
6
|
else { $lex->{state}='INITIAL'; $type='__SKIP__'; } |
|
2
|
|
|
|
|
5
|
|
485
|
|
|
|
|
|
|
|
486
|
2
|
|
|
|
|
5
|
$match = ${^MATCH}; |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
##------------------------ |
489
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGREP') { |
490
|
2
|
50
|
|
|
|
158
|
if ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
0
|
50
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
491
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G($DEF{regex_text})\//po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL'; } |
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
7
|
|
492
|
|
|
|
|
|
|
|
493
|
2
|
|
|
|
|
7
|
$match = ${^MATCH}; |
494
|
|
|
|
|
|
|
} |
495
|
|
|
|
|
|
|
##------------------------ |
496
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGOPT') { |
497
|
0
|
0
|
|
|
|
0
|
if ($$bufr =~ m/\G$DEF{regex_modifier}+/po) { $type='REGOPT'; } |
|
0
|
|
|
|
|
0
|
|
498
|
0
|
|
|
|
|
0
|
else { $type='__SKIP__'; } |
499
|
0
|
|
|
|
|
0
|
$lex->{state} = 'INITIAL'; |
500
|
|
|
|
|
|
|
|
501
|
0
|
|
|
|
|
0
|
$match = ${^MATCH}; |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
##------------------------ |
504
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_XPIPE') { |
505
|
4
|
50
|
|
|
|
248
|
if ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } ##-- whitespace: skip |
|
0
|
50
|
|
|
|
0
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
506
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\-/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
0
|
|
|
|
|
0
|
|
507
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
508
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
4
|
|
|
|
|
7
|
|
509
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G\z/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
510
|
0
|
|
|
|
|
0
|
else { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
511
|
|
|
|
|
|
|
|
512
|
4
|
|
|
|
|
12
|
$match = ${^MATCH}; |
513
|
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
##------------------------ |
515
|
|
|
|
|
|
|
## END perl-ification of flex sources |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
##-- guts |
518
|
1345
|
100
|
|
|
|
2662
|
$text = $match if (!defined($text)); |
519
|
1345
|
100
|
|
|
|
3572
|
$lex->{bufp} += length($match) if (defined($match)); |
520
|
|
|
|
|
|
|
|
521
|
1345
|
|
|
|
|
3111
|
pos($$bufr) = $lex->{bufp}; |
522
|
1345
|
50
|
|
|
|
3352
|
return if (!defined($type)); |
523
|
|
|
|
|
|
|
|
524
|
1345
|
100
|
|
|
|
2719
|
next LEXSKIP if ($type eq '__SKIP__'); |
525
|
942
|
100
|
|
|
|
1704
|
next LEXBUF if ($type eq '__EOF__'); |
526
|
|
|
|
|
|
|
#elsif ($type eq '__ERROR__') { |
527
|
|
|
|
|
|
|
# return $lex->yyerror(); |
528
|
|
|
|
|
|
|
#} |
529
|
916
|
|
|
|
|
5159
|
return @$lex{qw(yytype yytext)} = ($type,$text); |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
} |
532
|
209
|
|
|
|
|
861
|
return @$lex{qw(yytype yytext)} = ('__EOF__',undef); |
533
|
|
|
|
|
|
|
} |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
##====================================================================== |
536
|
|
|
|
|
|
|
## Testing: dummy lexing |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
## undef = $lex->dummylex(@from_whatever) |
539
|
|
|
|
|
|
|
sub dummylex { |
540
|
0
|
|
|
0
|
0
|
|
my $lex = shift; |
541
|
0
|
|
|
|
|
|
$lex->reset(); |
542
|
0
|
|
|
|
|
|
$lex->from(@_); |
543
|
0
|
|
|
|
|
|
my ($type,$text); |
544
|
|
|
|
|
|
|
TOKEN: |
545
|
0
|
|
|
|
|
|
while(1) { |
546
|
0
|
|
|
|
|
|
($type,$text) = $lex->yylex(); |
547
|
|
|
|
|
|
|
print("-" x 64, "\n", |
548
|
|
|
|
|
|
|
">> Line: ", $lex->yylineno, ", Pos: ", $lex->yypos, "\n", |
549
|
0
|
0
|
|
|
|
|
">> State: ", (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n", |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
550
|
|
|
|
|
|
|
">> Type: ", (defined($type) ? $type : '(undef)'), "\n", |
551
|
|
|
|
|
|
|
">> Text: ", (defined($text) ? $text : '(undef)'), "\n", |
552
|
|
|
|
|
|
|
); |
553
|
0
|
0
|
|
|
|
|
if (!defined($type)) { |
554
|
0
|
|
|
|
|
|
warn(":: undef type!"); |
555
|
0
|
|
|
|
|
|
return; |
556
|
|
|
|
|
|
|
} |
557
|
0
|
0
|
|
|
|
|
if ($type eq '__ERROR__') { |
558
|
0
|
|
|
|
|
|
print(":: ERROR DETECTED\n"); |
559
|
0
|
|
|
|
|
|
$lex->yyerror(); |
560
|
|
|
|
|
|
|
} |
561
|
0
|
0
|
|
|
|
|
last if ($type eq '__EOF__'); |
562
|
|
|
|
|
|
|
} |
563
|
|
|
|
|
|
|
} |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
1; ##-- be happy |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
__END__ |