line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
##-*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
## File: DDC::::yyqlexer.pm |
4
|
|
|
|
|
|
|
## Author: Bryan Jurish |
5
|
|
|
|
|
|
|
## Description: |
6
|
|
|
|
|
|
|
## + lexer for ddc queries (formerly DDC::Query::yylexer) |
7
|
|
|
|
|
|
|
## + last updated for ddc v2.1.15 |
8
|
|
|
|
|
|
|
##====================================================================== |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
package DDC::PP::yyqlexer; |
11
|
20
|
|
|
20
|
|
603
|
use 5.010; ##-- we need at least v5.10.0 for /p regex modifier |
|
20
|
|
|
|
|
76
|
|
12
|
20
|
|
|
20
|
|
117
|
use Encode qw(encode_utf8 decode_utf8); |
|
20
|
|
|
|
|
40
|
|
|
20
|
|
|
|
|
1086
|
|
13
|
20
|
|
|
20
|
|
121
|
use Carp; |
|
20
|
|
|
|
|
39
|
|
|
20
|
|
|
|
|
1222
|
|
14
|
20
|
|
|
20
|
|
136
|
use IO::File; |
|
20
|
|
|
|
|
43
|
|
|
20
|
|
|
|
|
2954
|
|
15
|
20
|
|
|
20
|
|
153
|
use IO::Handle; |
|
20
|
|
|
|
|
41
|
|
|
20
|
|
|
|
|
833
|
|
16
|
20
|
|
|
20
|
|
128
|
use strict; |
|
20
|
|
|
|
|
51
|
|
|
20
|
|
|
|
|
7378
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
##====================================================================== |
19
|
|
|
|
|
|
|
## Globals etc. |
20
|
|
|
|
|
|
|
our @ISA = qw(); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
##---------------------------------------------------------------------- |
23
|
|
|
|
|
|
|
## Globals: regexes for Parse::Lex lexer token regexes |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
## %DEF |
26
|
|
|
|
|
|
|
## + common shared regex definitions |
27
|
|
|
|
|
|
|
our (%DEF); |
28
|
|
|
|
|
|
|
BEGIN { |
29
|
|
|
|
|
|
|
##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\') |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
##-- whitespace |
32
|
20
|
|
|
20
|
|
96
|
$DEF{ws} = '[ \t\n\r\f\x0b]'; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
##-- integer boundary characters |
35
|
20
|
|
|
|
|
48
|
$DEF{int_boundary} = '[ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]'; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
##-- bareword symbols |
38
|
20
|
|
|
|
|
50
|
$DEF{symbol_cfirst} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]'; |
39
|
20
|
|
|
|
|
50
|
$DEF{symbol_crest} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]'; |
40
|
20
|
|
|
|
|
54
|
$DEF{symbol_cescape} = '(?:\\\\.)'; |
41
|
20
|
|
|
|
|
141
|
$DEF{symbol_text} = "(?:$DEF{symbol_cescape}|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}|$DEF{symbol_crest})*"; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here) |
44
|
20
|
|
|
|
|
46
|
$DEF{corpus_cfirst} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"$@]'; |
45
|
20
|
|
|
|
|
51
|
$DEF{corpus_crest} = '[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"]'; |
46
|
20
|
|
|
|
|
78
|
$DEF{corpus_text} = "(?:$DEF{symbol_cescape}|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}|$DEF{corpus_crest})*"; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
##-- bareword index names (underscore and digits ok, but no '.', '-', or '+') |
49
|
20
|
|
|
|
|
67
|
$DEF{index_char} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]'; |
50
|
20
|
|
|
|
|
65
|
$DEF{index_name} = "(?:$DEF{index_char}|$DEF{symbol_cescape})+"; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
##-- single-quoted symbols |
53
|
20
|
|
|
|
|
49
|
$DEF{sq_text} = "(?:[^\']|$DEF{symbol_cescape})*"; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]}) |
56
|
20
|
|
|
|
|
47
|
$DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}"; |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
##-- regexes |
59
|
20
|
|
|
|
|
46
|
$DEF{regex_text} = "(?:(?:\\\\.)|[^\\\\/])*"; |
60
|
20
|
|
|
|
|
34
|
$DEF{regex_modifier} = '[dgimsx]'; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
##-- comments |
63
|
20
|
|
|
|
|
77
|
$DEF{comment_text} = "(?:\\\\.|[^\\]])*"; |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
##-- compile patterns |
66
|
20
|
|
|
|
|
218
|
foreach (keys %DEF) { |
67
|
|
|
|
|
|
|
#print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n"; |
68
|
320
|
|
|
|
|
12096
|
$DEF{$_} = qr/$DEF{$_}/; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
##====================================================================== |
73
|
|
|
|
|
|
|
## $lex = $CLASS_OR_OBJ->new(%args) |
74
|
|
|
|
|
|
|
## + abstract constructor |
75
|
|
|
|
|
|
|
## + %$lex, %args: |
76
|
|
|
|
|
|
|
## { |
77
|
|
|
|
|
|
|
## src => $name, ##-- source name |
78
|
|
|
|
|
|
|
## fh => $srcfh, ##-- source filehandle |
79
|
|
|
|
|
|
|
## bufr => \$buf, ##-- source buffer (string reference) |
80
|
|
|
|
|
|
|
## bufp => $pos, ##-- current pos() in source buffer |
81
|
|
|
|
|
|
|
## buf => $buf, ##-- local buffer (for filehandle input) |
82
|
|
|
|
|
|
|
## state => $q, ##-- symbolic state name (default: 'INITIAL') |
83
|
|
|
|
|
|
|
## stack => \@stack, ##-- state stack |
84
|
|
|
|
|
|
|
## |
85
|
|
|
|
|
|
|
## ##-- utf-8 or byte mode? |
86
|
|
|
|
|
|
|
## utf8 => $bool, ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy)) |
87
|
|
|
|
|
|
|
## |
88
|
|
|
|
|
|
|
## ##-- runtime data |
89
|
|
|
|
|
|
|
## yytext => $text, ##-- current text |
90
|
|
|
|
|
|
|
## yytype => $type, ##-- current token type |
91
|
|
|
|
|
|
|
## yylineno => $line, ##-- current source line (file input only) |
92
|
|
|
|
|
|
|
## } |
93
|
|
|
|
|
|
|
sub new { |
94
|
15
|
|
|
15
|
0
|
46
|
my $that = shift; |
95
|
15
|
|
33
|
|
|
249
|
my $lex = bless({ |
96
|
|
|
|
|
|
|
src =>undef, |
97
|
|
|
|
|
|
|
fh =>undef, |
98
|
|
|
|
|
|
|
bufr =>undef, |
99
|
|
|
|
|
|
|
bufp =>undef, |
100
|
|
|
|
|
|
|
buf =>undef, |
101
|
|
|
|
|
|
|
utf8 =>1, |
102
|
|
|
|
|
|
|
state => 'INITIAL', |
103
|
|
|
|
|
|
|
stack => [], |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
yytext=>undef, |
106
|
|
|
|
|
|
|
yytype=>undef, |
107
|
|
|
|
|
|
|
yylineno=>undef, |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
##-- lexer comment retention hacks |
110
|
|
|
|
|
|
|
comments=>[], |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
##-- user args |
113
|
|
|
|
|
|
|
@_ |
114
|
|
|
|
|
|
|
}, |
115
|
|
|
|
|
|
|
ref($that)||$that |
116
|
|
|
|
|
|
|
); |
117
|
15
|
|
|
|
|
141
|
return $lex; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## $lex = $lex->clear() |
121
|
|
|
|
|
|
|
## + clear lexer buffer, source, etc |
122
|
|
|
|
|
|
|
sub clear { |
123
|
398
|
|
|
398
|
0
|
598
|
my $lex = shift; |
124
|
398
|
|
|
|
|
1298
|
delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)}; |
125
|
398
|
|
|
|
|
680
|
$lex->{state} = 'INITIAL'; |
126
|
398
|
|
|
|
|
568
|
@{$lex->{stack}} = qw(); |
|
398
|
|
|
|
|
772
|
|
127
|
398
|
|
|
|
|
604
|
@{$lex->{comments}} = qw(); |
|
398
|
|
|
|
|
633
|
|
128
|
398
|
|
|
|
|
574
|
delete $lex->{_cmtbuf}; |
129
|
398
|
|
|
|
|
667
|
return $lex; |
130
|
|
|
|
|
|
|
} |
131
|
20
|
|
|
20
|
|
95152
|
BEGIN { *reset = *close = \&clear; } |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
##====================================================================== |
134
|
|
|
|
|
|
|
## I/O |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
## $lex = $lex->from($which,$src, %opts) |
137
|
|
|
|
|
|
|
## + $which is one of qw(fh file string) |
138
|
|
|
|
|
|
|
## + $src is the actual source (default: 'string') |
139
|
|
|
|
|
|
|
sub from { |
140
|
199
|
|
|
199
|
0
|
457
|
my ($lex,$which,$src,%opts) = @_; |
141
|
199
|
50
|
|
|
|
474
|
return $lex->fromFh($src,%opts) if ($which eq 'fh'); |
142
|
199
|
50
|
|
|
|
393
|
return $lex->fromFile($src,%opts) if ($which eq 'file'); |
143
|
199
|
|
|
|
|
563
|
return $lex->fromString($src,%opts); |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
## $lex = $lex->fromFile($filename_or_handle,%opts) |
147
|
|
|
|
|
|
|
sub fromFile { |
148
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$file,%opts) = @_; |
149
|
0
|
0
|
|
|
|
0
|
return $lex->fromFh($file,%opts) if (ref($file)); |
150
|
0
|
0
|
|
|
|
0
|
my $fh = IO::File->new("<$file") |
151
|
|
|
|
|
|
|
or confess("cannot open '$file' for read: $!"); |
152
|
0
|
0
|
|
|
|
0
|
binmode($fh,':encoding(utf8)') if ($lex->{utf8}); |
153
|
0
|
|
|
|
|
0
|
return $lex->fromFh($fh,src=>"file \`$file'",%opts); |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
## $lex = $lex->fromFh($fh,%opts) |
159
|
|
|
|
|
|
|
## + uses native $fh encoding |
160
|
|
|
|
|
|
|
sub fromFh { |
161
|
0
|
|
|
0
|
0
|
0
|
my ($lex,$fh,%opts) = @_; |
162
|
0
|
0
|
|
|
|
0
|
if ($FH_SLURP) { |
163
|
|
|
|
|
|
|
##-- always use string mode |
164
|
0
|
|
|
|
|
0
|
local $/=undef; |
165
|
0
|
|
|
|
|
0
|
my $buf = $fh->getline; |
166
|
0
|
|
|
|
|
0
|
$fh->close(); |
167
|
0
|
|
|
|
|
0
|
return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts); |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
##-- line-wise buffering |
170
|
0
|
|
|
|
|
0
|
$lex->clear(); |
171
|
0
|
|
|
|
|
0
|
@$lex{keys %opts} = values(%opts); |
172
|
0
|
|
|
|
|
0
|
$lex->{fh} = $fh; |
173
|
0
|
|
|
|
|
0
|
$lex->{buf} = undef; |
174
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$lex->{buf}; |
175
|
0
|
|
|
|
|
0
|
$lex->{bufp} = 0; |
176
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src})); |
177
|
0
|
|
|
|
|
0
|
return $lex; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
## $lex = $lex->fromString($str,%opts) |
181
|
|
|
|
|
|
|
## $lex = $lex->fromString(\$str,%opts) |
182
|
|
|
|
|
|
|
sub fromString { |
183
|
199
|
|
|
199
|
0
|
426
|
my ($lex,$str,%opts) = @_; |
184
|
199
|
|
|
|
|
477
|
$lex->clear(); |
185
|
199
|
50
|
|
|
|
522
|
if (ref($str)) { |
186
|
199
|
|
|
|
|
357
|
$lex->{bufr} = $str; |
187
|
199
|
50
|
|
|
|
903
|
$lex->{src} = "buffer \`$str'" if (!defined($lex->{src})); |
188
|
|
|
|
|
|
|
} else { |
189
|
0
|
|
|
|
|
0
|
$lex->{bufr} = \$str; |
190
|
0
|
0
|
|
|
|
0
|
$lex->{src} = "string \`$str'" if (!defined($lex->{src})); |
191
|
|
|
|
|
|
|
} |
192
|
199
|
|
|
|
|
418
|
$lex->{bufp} = 0; |
193
|
199
|
|
|
|
|
321
|
$lex->{yylineno} = 0; |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
##-- utf8 checks |
196
|
199
|
50
|
33
|
|
|
532
|
if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) { |
|
199
|
0
|
0
|
|
|
793
|
|
197
|
|
|
|
|
|
|
##-- lexer:utf8, string:bytes --> assume string is utf8-encoded |
198
|
199
|
50
|
|
|
|
332
|
${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}})); |
|
199
|
|
|
|
|
4864
|
|
|
199
|
|
|
|
|
707
|
|
|
199
|
|
|
|
|
533
|
|
199
|
|
|
|
|
|
|
} |
200
|
0
|
|
|
|
|
0
|
elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) { |
201
|
|
|
|
|
|
|
##-- lexer:bytes, string:utf8 --> encode as utf8 octets |
202
|
0
|
|
|
|
|
0
|
${$lex->{bufr}} = encode_utf8(${$lex->{bufr}}); |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
199
|
|
|
|
|
763
|
return $lex; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
##====================================================================== |
209
|
|
|
|
|
|
|
## Utilities |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
## $bool = $lex->eof() |
212
|
|
|
|
|
|
|
## + true iff at end-of-file |
213
|
|
|
|
|
|
|
sub eof { |
214
|
1075
|
50
|
|
1075
|
0
|
2649
|
return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore(); |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
## $bool = $lex->eob() |
218
|
|
|
|
|
|
|
## + true at end-of-buffer |
219
|
|
|
|
|
|
|
sub eob { |
220
|
1075
|
|
66
|
1075
|
0
|
2978
|
return (!$_[0]{bufr} || !${$_[0]{bufr}} || ($_[0]{bufp}||0) >= length(${$_[0]{bufr}})); |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
## $bufr_or_undef = $lex->getmore() |
224
|
|
|
|
|
|
|
## + returns true iff there is still data in the buffer |
225
|
|
|
|
|
|
|
sub getmore { |
226
|
1075
|
100
|
|
1075
|
0
|
1994
|
return $_[0]{bufr} if (!$_[0]->eob()); |
227
|
197
|
50
|
|
|
|
665
|
if (defined($_[0]{fh})) { |
228
|
0
|
|
|
|
|
0
|
$_[0]{bufp} = 0; |
229
|
0
|
|
|
|
|
0
|
$_[0]{buf} = $_[0]{fh}->getline; |
230
|
0
|
|
|
|
|
0
|
$_[0]{bufr} = \$_[0]{buf}; |
231
|
0
|
|
|
|
|
0
|
$_[0]{yylineno} = $_[0]{fh}->input_line_number; |
232
|
0
|
0
|
|
|
|
0
|
return defined($_[0]{buf}) ? $_[0]{bufr} : undef; |
233
|
|
|
|
|
|
|
} |
234
|
197
|
|
|
|
|
623
|
return undef; |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
##====================================================================== |
238
|
|
|
|
|
|
|
## Runtime lexer accessors |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
## $yytext = $lex->yytext |
241
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytext} is faster |
242
|
2
|
50
|
|
2
|
0
|
42
|
sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; } |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
## $yytype = $lex->yytype |
245
|
|
|
|
|
|
|
## + always defined; otherwise using $lex->{yytype} is faster |
246
|
2
|
50
|
|
2
|
0
|
26
|
sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; } |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
## $line = $lex->yylineno() |
249
|
|
|
|
|
|
|
## + returns current line number |
250
|
|
|
|
|
|
|
sub yylineno { |
251
|
2
|
|
|
2
|
0
|
11
|
return $_[0]{yylineno}; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
## $pos = $lex->yycolumn() |
255
|
|
|
|
|
|
|
## + return column at which current token starts (if any) |
256
|
|
|
|
|
|
|
sub yycolumn { |
257
|
0
|
0
|
0
|
0
|
0
|
0
|
return ($_[0]{bufp}||0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0); |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
## $pos = $lex->yypos() |
261
|
|
|
|
|
|
|
## + return byte position in current line (or input string) |
262
|
|
|
|
|
|
|
sub yypos { |
263
|
0
|
|
0
|
0
|
0
|
0
|
return ($_[0]{bufp}||0); |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
## $string = $lex->yyerror(@msg) |
267
|
|
|
|
|
|
|
## + create a helpful error message |
268
|
|
|
|
|
|
|
sub yyerror { |
269
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
270
|
0
|
|
|
|
|
0
|
confess(ref($lex).": error in ".$lex->yywhere().join('',@_)); |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
## $string = $lex->yywhere() |
274
|
|
|
|
|
|
|
## + location string used by yyerror() |
275
|
|
|
|
|
|
|
sub yywhere { |
276
|
0
|
|
|
0
|
0
|
0
|
my $lex = shift; |
277
|
|
|
|
|
|
|
return ("$lex->{src} at " |
278
|
|
|
|
|
|
|
.(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '') |
279
|
|
|
|
|
|
|
."column ".$lex->yycolumn |
280
|
0
|
0
|
|
|
|
0
|
.", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__') |
|
|
0
|
|
|
|
|
|
281
|
|
|
|
|
|
|
); |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
## $q = $lex->yypushq($new_state) |
285
|
|
|
|
|
|
|
sub yypushq { |
286
|
19
|
|
|
19
|
0
|
44
|
my ($lex,$qnew) = @_; |
287
|
19
|
|
|
|
|
34
|
push(@{$lex->{stack}},$lex->{state}); |
|
19
|
|
|
|
|
53
|
|
288
|
19
|
|
|
|
|
46
|
return $lex->{state} = $qnew; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
## $q = $lex->yypopq() |
292
|
|
|
|
|
|
|
sub yypopq { |
293
|
21
|
|
|
21
|
0
|
42
|
my $lex = shift; |
294
|
21
|
|
100
|
|
|
34
|
return $lex->{state} = pop(@{$lex->{stack}}) || 'INITIAL'; |
295
|
|
|
|
|
|
|
} |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
##====================================================================== |
298
|
|
|
|
|
|
|
## Runtime lexer routines |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
## ($typ,$text) = $lex->yylex() |
301
|
|
|
|
|
|
|
## + get next token from input stream |
302
|
|
|
|
|
|
|
sub yylex { |
303
|
1057
|
|
|
1057
|
0
|
1621
|
my $lex = shift; |
304
|
1057
|
|
|
|
|
1686
|
my ($bufr,$type,$text,$match,@part); |
305
|
|
|
|
|
|
|
#use re 'eval'; ##-- dangerous! |
306
|
|
|
|
|
|
|
LEXBUF: |
307
|
1057
|
|
|
|
|
2167
|
while (!$lex->eof()) { |
308
|
878
|
|
|
|
|
1611
|
$bufr = $lex->{bufr}; |
309
|
878
|
|
|
|
|
2173
|
pos($$bufr) = $lex->{bufp}; |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
LEXSKIP: |
312
|
|
|
|
|
|
|
##------------------------------------ |
313
|
|
|
|
|
|
|
## LEXSKIP: main lexer loop |
314
|
878
|
|
|
|
|
1535
|
while (1) { |
315
|
1241
|
|
|
|
|
2089
|
$type = $text = $match = undef; |
316
|
1241
|
|
|
|
|
1922
|
@part = qw(); |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
##------------------------ |
319
|
1241
|
100
|
100
|
|
|
3410
|
if ($lex->{state} eq 'INITIAL' || $lex->{state} eq 'Q_MATCHID') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
##-- end-of-file (should be first pattern) |
322
|
1137
|
100
|
100
|
|
|
54219
|
if ($$bufr =~ m/\G\z/) { $type = '__EOF__'; } |
|
18
|
100
|
100
|
|
|
38
|
|
|
|
100
|
66
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
##-- comments |
325
|
14
|
|
|
|
|
29
|
elsif ($$bufr =~ m/\G\#:[^\n]*/sp) { $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); } |
|
14
|
|
|
|
|
20
|
|
|
14
|
|
|
|
|
65
|
|
326
|
19
|
|
|
|
|
40
|
elsif ($$bufr =~ m/\G\#\[/p) { $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); } |
|
19
|
|
|
|
|
54
|
|
|
19
|
|
|
|
|
60
|
|
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
##-- operators |
329
|
21
|
|
|
|
|
41
|
elsif ($$bufr =~ m/\G\&\&/p) { $type = 'OP_BOOL_AND'; } |
330
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G\|\|/p) { $type = 'OP_BOOL_OR'; } |
331
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gnear/pi) { $type = 'NEAR'; } |
332
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\!=|\&\!=|\&=\s*\!|\!with|with(?:out|\s*\!))/pi) { $type = 'WITHOUT'; } |
333
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G(?:\|=|withor|orwith|wor)/pi) { $type = 'WITHOR'; } |
334
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:\&=|with)/pi) { $type = 'WITH'; } |
335
|
28
|
|
|
|
|
64
|
elsif ($$bufr =~ m/\Gcount/pi) { $type = 'COUNT'; } |
336
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\Gkeys/pi) { $type = 'KEYS'; } |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
##-- count-by keywords |
339
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G(?:file|doc)_?id/pi) { $type = 'KW_FILEID'; } |
340
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\G(?:file|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; } |
341
|
2
|
|
|
|
|
5
|
elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' } |
|
2
|
|
|
|
|
5
|
|
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
##-- query operators |
344
|
17
|
|
|
|
|
34
|
elsif ($$bufr =~ m/\G\#(?:comment|cmt)/pi) { $type = 'KW_COMMENT'; } |
345
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?|n))/pi) { $type = 'CNTXT'; } |
346
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:with)?in/pi) { $type = 'WITHIN'; } |
347
|
3
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?|nojoin)(?:_hits)?/pi) { $type = 'SEPARATE_HITS'; } |
348
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?|join)(?:_hits)?/pi) { $type = 'NOSEPARATE_HITS'; } |
349
|
4
|
|
|
|
|
12
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?date/pi) { $type = 'IS_DATE'; } |
350
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#has(?:_field)?/pi) { $type = 'HAS_FIELD'; } |
351
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#file(?:_?)names/pi) { $type = 'FILENAMES_ONLY'; } |
352
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#debug_rank/pi) { $type = 'DEBUG_RANK'; } |
353
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_rank/pi) { $type = 'GREATER_BY_RANK'; } |
354
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_rank/pi) { $type = 'LESS_BY_RANK'; } |
355
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_date/pi) { $type = 'GREATER_BY_DATE'; } |
356
|
10
|
|
|
|
|
26
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_date/pi) { $type = 'LESS_BY_DATE'; } |
357
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_size/pi) { $type = 'GREATER_BY_SIZE'; } |
358
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_size/pi) { $type = 'LESS_BY_SIZE'; } |
359
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:is_|has_)?size/pi) { $type = 'IS_SIZE'; } |
360
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?left/pi) { $type = 'LESS_BY_LEFT'; } |
361
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)left/pi) { $type = 'GREATER_BY_LEFT'; } |
362
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?right/pi) { $type = 'LESS_BY_RIGHT'; } |
363
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)right/pi) { $type = 'GREATER_BY_RIGHT'; } |
364
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?mid(?:dle)?/pi) { $type = 'LESS_BY_MIDDLE'; } |
365
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)mid(?:dle)?/pi) { $type = 'GREATER_BY_MIDDLE'; } |
366
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_key)/pi) { $type = 'LESS_BY_KEY'; } |
367
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_key)/pi) { $type = 'GREATER_BY_KEY'; } |
368
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'LESS_BY_COUNT'; } |
369
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; } |
370
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?/pi) { $type = 'LESS_BY'; } |
371
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?/pi) { $type = 'GREATER_BY'; } |
372
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#rand(?:om)?/pi) { $type = 'RANDOM'; } |
373
|
28
|
|
|
|
|
62
|
elsif ($$bufr =~ m/\G\#by/pi) { $type = 'BY'; } |
374
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\#samp(?:le)?/pi) { $type = 'SAMPLE'; } |
375
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\#clim(?:it)?/pi) { $type = 'CLIMIT'; } |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
##-- regexes |
378
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
379
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po) { $type='NEG_REGEX'; $text=$1; } |
|
0
|
|
|
|
|
0
|
|
380
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
381
|
11
|
|
|
|
|
28
|
elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po) { $type='REGEX'; $text=$1; } |
|
11
|
|
|
|
|
31
|
|
382
|
2
|
|
|
|
|
6
|
elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po) { $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; } |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
5
|
|
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
##-- punctutation & special characters |
385
|
2
|
|
|
|
|
85
|
elsif ($$bufr =~ m/\G#=/p) { $type = 'HASH_EQUAL'; } ##-- hash+equal: exact distance |
386
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#
|
387
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G#>/p) { $type = 'HASH_GREATER'; } ##-- hash+greater: min distance |
388
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$\./p) { $type = 'DOLLAR_DOT'; } ##-- positional anchor pseudo-index |
389
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\:\{/p) { $type = 'COLON_LBRACE'; } ##-- theusaurus-query operator |
390
|
4
|
|
|
|
|
12
|
elsif ($$bufr =~ m/\G\@\{/p) { $type = 'AT_LBRACE'; } ##-- literal-set operator |
391
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\{/p) { $type = 'STAR_LBRACE'; } ##-- prefix-set opener |
392
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\}\*/p) { $type = 'RBRACE_STAR'; } ##-- suffix-set closer |
393
|
74
|
|
|
|
|
224
|
elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p) { $type = ${^MATCH}; } ##-- single-char punctuation operators |
394
|
164
|
|
|
|
|
464
|
elsif ($$bufr =~ m/\G[\[\]{}()<>]/p) { $type = ${^MATCH}; } ##-- parentheses |
395
|
4
|
|
|
|
|
14
|
elsif ($$bufr =~ m/\G\"/p) { $type = ${^MATCH}; } ##-- double-quotes |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
##-- subcorpus path-lists |
398
|
26
|
|
|
|
|
73
|
elsif ($$bufr =~ m/\G\:/p) { $type = ${^MATCH}; $lex->{state}='Q_CORPORA'; } |
|
26
|
|
|
|
|
55
|
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
##-- truncated symbols |
401
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated quoted string (infix symbol) |
|
0
|
|
|
|
|
0
|
|
402
|
2
|
|
|
|
|
8
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated quoted string (prefix symbol) |
|
2
|
|
|
|
|
6
|
|
403
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated quoted string (suffix symbol) |
|
0
|
|
|
|
|
0
|
|
404
|
|
|
|
|
|
|
|
405
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})\*/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated bareword (infix symbol) |
|
0
|
|
|
|
|
0
|
|
406
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated bareword (prefix symbol) |
|
2
|
|
|
|
|
7
|
|
407
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated bareword (suffix symbol) |
|
0
|
|
|
|
|
0
|
|
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
##-- index names (special handling to allow count(foo) #by[$w-1] |
410
|
24
|
|
|
|
|
62
|
elsif ($$bufr =~ m/\G\$($DEF{index_name})/po) { $type='INDEX'; $text=$1; } |
|
24
|
|
|
|
|
64
|
|
411
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po) { $type='INDEX'; $text=$1; } |
|
0
|
|
|
|
|
0
|
|
412
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ /m\G\$/p) { $type='$'; } |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match) |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
##-- single-term wildcard |
417
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\*/p) { $type = '*'; } |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
##-- term expander pipelines |
420
|
4
|
|
|
|
|
13
|
elsif ($$bufr =~ m/\G\|/p) { $lex->{state}='Q_XPIPE'; $type='__SKIP__'; } |
|
4
|
|
|
|
|
7
|
|
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
##-- symbols: quoted |
423
|
|
|
|
|
|
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { |
424
|
39
|
|
|
|
|
133
|
$text = $1; |
425
|
39
|
100
|
|
|
|
203
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
10
|
100
|
|
|
|
21
|
|
426
|
6
|
|
|
|
|
20
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
427
|
23
|
|
|
|
|
46
|
else { $type='SYMBOL'; } |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
##-- symbols: barewords |
430
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) { |
431
|
269
|
|
|
|
|
834
|
$text = ${^MATCH}; |
432
|
269
|
|
|
|
|
441
|
$match = ${^MATCH}; |
433
|
269
|
100
|
|
|
|
1307
|
if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; } |
|
36
|
100
|
|
|
|
86
|
|
434
|
6
|
|
|
|
|
16
|
elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; } |
435
|
227
|
|
|
|
|
427
|
else { $type='SYMBOL'; } |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
##-- barewords: integers and dates (Q_MATCHID) |
438
|
14
|
|
|
|
|
48
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po) { $type='INTEGER'; } |
439
|
26
|
|
|
|
|
143
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p) { $type='INTEGER'; } |
440
|
|
|
|
|
|
|
|
441
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po) { $type='DATE'; } |
442
|
0
|
|
|
|
|
0
|
elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po) { $type='DATE'; } |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
##-- misc |
445
|
284
|
|
|
|
|
785
|
elsif ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } |
446
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G./p) { $type = 'SYMBOL'; } |
447
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G./p) { $type = '__ERROR__'; } |
448
|
|
|
|
|
|
|
|
449
|
1137
|
100
|
|
|
|
3013
|
$match = ${^MATCH} if (!defined($match)); |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
##------------------------ |
452
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_CORPORA') { |
453
|
58
|
100
|
|
|
|
946
|
if ($$bufr =~ m/\G$DEF{corpus_text}/p) { $type='SYMBOL'; } |
|
22
|
100
|
|
|
|
53
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
454
|
18
|
|
|
|
|
42
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $type='SYMBOL'; $text=$1; } |
|
18
|
|
|
|
|
41
|
|
455
|
16
|
|
|
|
|
48
|
elsif ($$bufr =~ m/\G\,/p) { $type=${^MATCH}; } |
456
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\s+/p) { $type='__SKIP__'; } |
457
|
2
|
|
|
|
|
8
|
else { $type='__SKIP__'; $lex->yypopq(); } |
|
2
|
|
|
|
|
8
|
|
458
|
|
|
|
|
|
|
|
459
|
58
|
|
|
|
|
124
|
$match = ${^MATCH}; |
460
|
|
|
|
|
|
|
} |
461
|
|
|
|
|
|
|
##------------------------ |
462
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_COMMENT') { |
463
|
38
|
100
|
|
|
|
450
|
if ($$bufr =~ m/\G\]/p) { |
|
|
50
|
|
|
|
|
|
464
|
19
|
|
|
|
|
41
|
$type='__SKIP__'; |
465
|
19
|
|
|
|
|
45
|
$lex->{_cmtbuf} .= ${^MATCH}; |
466
|
19
|
|
|
|
|
41
|
push(@{$lex->{comments}}, $lex->{_cmtbuf}); |
|
19
|
|
|
|
|
50
|
|
467
|
19
|
|
|
|
|
43
|
delete $lex->{_cmtbuf}; |
468
|
19
|
|
|
|
|
50
|
$lex->yypopq(); |
469
|
|
|
|
|
|
|
} |
470
|
19
|
|
|
|
|
46
|
elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; } |
|
19
|
|
|
|
|
54
|
|
471
|
0
|
|
|
|
|
0
|
else { $type='__ERROR__'; } |
472
|
|
|
|
|
|
|
|
473
|
38
|
|
|
|
|
91
|
$match = ${^MATCH}; |
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
##------------------------ |
476
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_DATE') { |
477
|
2
|
50
|
|
|
|
21
|
if ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; } |
|
0
|
50
|
|
|
|
0
|
|
478
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/^\//) { $type = '/'; $lex->{state}='INITIAL'; } |
|
0
|
|
|
|
|
0
|
|
479
|
2
|
|
|
|
|
5
|
else { $lex->{state}='INITIAL'; $type='__SKIP__'; } |
|
2
|
|
|
|
|
6
|
|
480
|
|
|
|
|
|
|
|
481
|
2
|
|
|
|
|
5
|
$match = ${^MATCH}; |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
##------------------------ |
484
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGREP') { |
485
|
2
|
50
|
|
|
|
149
|
if ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; } |
|
0
|
50
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
486
|
2
|
|
|
|
|
7
|
elsif ($$bufr =~ m/\G($DEF{regex_text})\//po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL'; } |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
6
|
|
487
|
|
|
|
|
|
|
|
488
|
2
|
|
|
|
|
8
|
$match = ${^MATCH}; |
489
|
|
|
|
|
|
|
} |
490
|
|
|
|
|
|
|
##------------------------ |
491
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_REGOPT') { |
492
|
0
|
0
|
|
|
|
0
|
if ($$bufr =~ m/\G$DEF{regex_modifier}+/po) { $type='REGOPT'; } |
|
0
|
|
|
|
|
0
|
|
493
|
0
|
|
|
|
|
0
|
else { $type='__SKIP__'; } |
494
|
0
|
|
|
|
|
0
|
$lex->{state} = 'INITIAL'; |
495
|
|
|
|
|
|
|
|
496
|
0
|
|
|
|
|
0
|
$match = ${^MATCH}; |
497
|
|
|
|
|
|
|
} |
498
|
|
|
|
|
|
|
##------------------------ |
499
|
|
|
|
|
|
|
elsif ($lex->{state} eq 'Q_XPIPE') { |
500
|
4
|
50
|
|
|
|
218
|
if ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } ##-- whitespace: skip |
|
0
|
50
|
|
|
|
0
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
501
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\-/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
0
|
|
|
|
|
0
|
|
502
|
0
|
|
|
|
|
0
|
elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
503
|
4
|
|
|
|
|
12
|
elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
|
4
|
|
|
|
|
9
|
|
504
|
|
|
|
|
|
|
#elsif ($$bufr =~ m/\G\z/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; } |
505
|
0
|
|
|
|
|
0
|
else { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; } |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
506
|
|
|
|
|
|
|
|
507
|
4
|
|
|
|
|
11
|
$match = ${^MATCH}; |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
##------------------------ |
510
|
|
|
|
|
|
|
## END perl-ification of flex sources |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
##-- guts |
513
|
1241
|
100
|
|
|
|
2582
|
$text = $match if (!defined($text)); |
514
|
1241
|
100
|
|
|
|
3196
|
$lex->{bufp} += length($match) if (defined($match)); |
515
|
|
|
|
|
|
|
|
516
|
1241
|
|
|
|
|
2852
|
pos($$bufr) = $lex->{bufp}; |
517
|
1241
|
50
|
|
|
|
2888
|
return if (!defined($type)); |
518
|
|
|
|
|
|
|
|
519
|
1241
|
100
|
|
|
|
2573
|
next LEXSKIP if ($type eq '__SKIP__'); |
520
|
878
|
100
|
|
|
|
1667
|
next LEXBUF if ($type eq '__EOF__'); |
521
|
|
|
|
|
|
|
#elsif ($type eq '__ERROR__') { |
522
|
|
|
|
|
|
|
# return $lex->yyerror(); |
523
|
|
|
|
|
|
|
#} |
524
|
860
|
|
|
|
|
4932
|
return @$lex{qw(yytype yytext)} = ($type,$text); |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
} |
527
|
197
|
|
|
|
|
790
|
return @$lex{qw(yytype yytext)} = ('__EOF__',undef); |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
##====================================================================== |
531
|
|
|
|
|
|
|
## Testing: dummy lexing |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
## undef = $lex->dummylex(@from_whatever) |
534
|
|
|
|
|
|
|
sub dummylex { |
535
|
0
|
|
|
0
|
0
|
|
my $lex = shift; |
536
|
0
|
|
|
|
|
|
$lex->reset(); |
537
|
0
|
|
|
|
|
|
$lex->from(@_); |
538
|
0
|
|
|
|
|
|
my ($type,$text); |
539
|
|
|
|
|
|
|
TOKEN: |
540
|
0
|
|
|
|
|
|
while(1) { |
541
|
0
|
|
|
|
|
|
($type,$text) = $lex->yylex(); |
542
|
|
|
|
|
|
|
print("-" x 64, "\n", |
543
|
|
|
|
|
|
|
">> Line: ", $lex->yylineno, ", Pos: ", $lex->yypos, "\n", |
544
|
0
|
0
|
|
|
|
|
">> State: ", (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n", |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
545
|
|
|
|
|
|
|
">> Type: ", (defined($type) ? $type : '(undef)'), "\n", |
546
|
|
|
|
|
|
|
">> Text: ", (defined($text) ? $text : '(undef)'), "\n", |
547
|
|
|
|
|
|
|
); |
548
|
0
|
0
|
|
|
|
|
if (!defined($type)) { |
549
|
0
|
|
|
|
|
|
warn(":: undef type!"); |
550
|
0
|
|
|
|
|
|
return; |
551
|
|
|
|
|
|
|
} |
552
|
0
|
0
|
|
|
|
|
if ($type eq '__ERROR__') { |
553
|
0
|
|
|
|
|
|
print(":: ERROR DETECTED\n"); |
554
|
0
|
|
|
|
|
|
$lex->yyerror(); |
555
|
|
|
|
|
|
|
} |
556
|
0
|
0
|
|
|
|
|
last if ($type eq '__EOF__'); |
557
|
|
|
|
|
|
|
} |
558
|
|
|
|
|
|
|
} |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
1; ##-- be happy |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
__END__ |