File Coverage

blib/lib/DDC/PP/yyqlexer.pm

Criterion	Covered	Total	%
statement	191	316	60.4
branch	180	284	63.3
condition	19	36	52.7
subroutine	21	28	75.0
pod	0	20	0.0
total	411	684	60.0

line	stmt	bran	cond	sub	pod	time	code
1							##-- Mode: CPerl --
2
3							## File: DDC::::yyqlexer.pm
4							## Author: Bryan Jurish
5							## Description:
6							## + lexer for ddc queries (formerly DDC::Query::yylexer)
7							## + last updated for ddc v2.1.15
8							##======================================================================
9
10							package DDC::PP::yyqlexer;
11	20			20		603	use 5.010; ##-- we need at least v5.10.0 for /p regex modifier
	20					76
12	20			20		117	use Encode qw(encode_utf8 decode_utf8);
	20					40
	20					1086
13	20			20		121	use Carp;
	20					39
	20					1222
14	20			20		136	use IO::File;
	20					43
	20					2954
15	20			20		153	use IO::Handle;
	20					41
	20					833
16	20			20		128	use strict;
	20					51
	20					7378
17
18							##======================================================================
19							## Globals etc.
20							our @ISA = qw();
21
22							##----------------------------------------------------------------------
23							## Globals: regexes for Parse::Lex lexer token regexes
24
25							## %DEF
26							## + common shared regex definitions
27							our (%DEF);
28							BEGIN {
29							##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\')
30
31							##-- whitespace
32	20			20		96	$DEF{ws} = '[ \t\n\r\f\x0b]';
33
34							##-- integer boundary characters
35	20					48	$DEF{int_boundary} = '[ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]';
36
37							##-- bareword symbols
38	20					50	$DEF{symbol_cfirst} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]';
39	20					50	$DEF{symbol_crest} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]';
40	20					54	$DEF{symbol_cescape} = '(?:\\\\.)';
41	20					141	$DEF{symbol_text} = "(?:$DEF{symbol_cescape}\|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}\|$DEF{symbol_crest})*";
42
43							##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here)
44	20					46	$DEF{corpus_cfirst} = '[^ \t\n\r\f\v\0&\|?^%,:;#=~(){}<>\[\]\\\'\"$@]';
45	20					51	$DEF{corpus_crest} = '[^ \t\n\r\f\v\0&\|?^%,:;#=~(){}<>\[\]\\\'\"]';
46	20					78	$DEF{corpus_text} = "(?:$DEF{symbol_cescape}\|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}\|$DEF{corpus_crest})*";
47
48							##-- bareword index names (underscore and digits ok, but no '.', '-', or '+')
49	20					67	$DEF{index_char} = '[^ \t\n\r\f\x0b\0&\|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]';
50	20					65	$DEF{index_name} = "(?:$DEF{index_char}\|$DEF{symbol_cescape})+";
51
52							##-- single-quoted symbols
53	20					49	$DEF{sq_text} = "(?:[^\']\|$DEF{symbol_cescape})*";
54
55							##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]})
56	20					47	$DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}";
57
58							##-- regexes
59	20					46	$DEF{regex_text} = "(?:(?:\\\\.)\|[^\\\\/])*";
60	20					34	$DEF{regex_modifier} = '[dgimsx]';
61
62							##-- comments
63	20					77	$DEF{comment_text} = "(?:\\\\.\|[^\\]])*";
64
65							##-- compile patterns
66	20					218	foreach (keys %DEF) {
67							#print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n";
68	320					12096	$DEF{$_} = qr/$DEF{$_}/;
69							}
70							}
71
72							##======================================================================
73							## $lex = $CLASS_OR_OBJ->new(%args)
74							## + abstract constructor
75							## + %$lex, %args:
76							## {
77							## src => $name, ##-- source name
78							## fh => $srcfh, ##-- source filehandle
79							## bufr => \$buf, ##-- source buffer (string reference)
80							## bufp => $pos, ##-- current pos() in source buffer
81							## buf => $buf, ##-- local buffer (for filehandle input)
82							## state => $q, ##-- symbolic state name (default: 'INITIAL')
83							## stack => \@stack, ##-- state stack
84							##
85							## ##-- utf-8 or byte mode?
86							## utf8 => $bool, ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy))
87							##
88							## ##-- runtime data
89							## yytext => $text, ##-- current text
90							## yytype => $type, ##-- current token type
91							## yylineno => $line, ##-- current source line (file input only)
92							## }
93							sub new {
94	15			15	0	46	my $that = shift;
95	15		33			249	my $lex = bless({
96							src =>undef,
97							fh =>undef,
98							bufr =>undef,
99							bufp =>undef,
100							buf =>undef,
101							utf8 =>1,
102							state => 'INITIAL',
103							stack => [],
104
105							yytext=>undef,
106							yytype=>undef,
107							yylineno=>undef,
108
109							##-- lexer comment retention hacks
110							comments=>[],
111
112							##-- user args
113							@_
114							},
115							ref($that)\|\|$that
116							);
117	15					141	return $lex;
118							}
119
120							## $lex = $lex->clear()
121							## + clear lexer buffer, source, etc
122							sub clear {
123	398			398	0	598	my $lex = shift;
124	398					1298	delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)};
125	398					680	$lex->{state} = 'INITIAL';
126	398					568	@{$lex->{stack}} = qw();
	398					772
127	398					604	@{$lex->{comments}} = qw();
	398					633
128	398					574	delete $lex->{_cmtbuf};
129	398					667	return $lex;
130							}
131	20			20		95152	BEGIN { reset = close = \&clear; }
132
133							##======================================================================
134							## I/O
135
136							## $lex = $lex->from($which,$src, %opts)
137							## + $which is one of qw(fh file string)
138							## + $src is the actual source (default: 'string')
139							sub from {
140	199			199	0	457	my ($lex,$which,$src,%opts) = @_;
141	199	50				474	return $lex->fromFh($src,%opts) if ($which eq 'fh');
142	199	50				393	return $lex->fromFile($src,%opts) if ($which eq 'file');
143	199					563	return $lex->fromString($src,%opts);
144							}
145
146							## $lex = $lex->fromFile($filename_or_handle,%opts)
147							sub fromFile {
148	0			0	0	0	my ($lex,$file,%opts) = @_;
149	0	0				0	return $lex->fromFh($file,%opts) if (ref($file));
150	0	0				0	my $fh = IO::File->new("<$file")
151							or confess("cannot open '$file' for read: $!");
152	0	0				0	binmode($fh,':encoding(utf8)') if ($lex->{utf8});
153	0					0	return $lex->fromFh($fh,src=>"file \`$file'",%opts);
154							}
155
156							our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input
157
158							## $lex = $lex->fromFh($fh,%opts)
159							## + uses native $fh encoding
160							sub fromFh {
161	0			0	0	0	my ($lex,$fh,%opts) = @_;
162	0	0				0	if ($FH_SLURP) {
163							##-- always use string mode
164	0					0	local $/=undef;
165	0					0	my $buf = $fh->getline;
166	0					0	$fh->close();
167	0					0	return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts);
168							}
169							##-- line-wise buffering
170	0					0	$lex->clear();
171	0					0	@$lex{keys %opts} = values(%opts);
172	0					0	$lex->{fh} = $fh;
173	0					0	$lex->{buf} = undef;
174	0					0	$lex->{bufr} = \$lex->{buf};
175	0					0	$lex->{bufp} = 0;
176	0	0				0	$lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src}));
177	0					0	return $lex;
178							}
179
180							## $lex = $lex->fromString($str,%opts)
181							## $lex = $lex->fromString(\$str,%opts)
182							sub fromString {
183	199			199	0	426	my ($lex,$str,%opts) = @_;
184	199					477	$lex->clear();
185	199	50				522	if (ref($str)) {
186	199					357	$lex->{bufr} = $str;
187	199	50				903	$lex->{src} = "buffer \`$str'" if (!defined($lex->{src}));
188							} else {
189	0					0	$lex->{bufr} = \$str;
190	0	0				0	$lex->{src} = "string \`$str'" if (!defined($lex->{src}));
191							}
192	199					418	$lex->{bufp} = 0;
193	199					321	$lex->{yylineno} = 0;
194
195							##-- utf8 checks
196	199	50	33			532	if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) {
	199	0	0			793
197							##-- lexer:utf8, string:bytes --> assume string is utf8-encoded
198	199	50				332	${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}}));
	199					4864
	199					707
	199					533
199							}
200	0					0	elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) {
201							##-- lexer:bytes, string:utf8 --> encode as utf8 octets
202	0					0	${$lex->{bufr}} = encode_utf8(${$lex->{bufr}});
	0					0
	0					0
203							}
204
205	199					763	return $lex;
206							}
207
208							##======================================================================
209							## Utilities
210
211							## $bool = $lex->eof()
212							## + true iff at end-of-file
213							sub eof {
214	1075	50		1075	0	2649	return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore();
215							}
216
217							## $bool = $lex->eob()
218							## + true at end-of-buffer
219							sub eob {
220	1075		66	1075	0	2978	return (!$_[0]{bufr} \|\| !${$_[0]{bufr}} \|\| ($_[0]{bufp}\|\|0) >= length(${$_[0]{bufr}}));
221							}
222
223							## $bufr_or_undef = $lex->getmore()
224							## + returns true iff there is still data in the buffer
225							sub getmore {
226	1075	100		1075	0	1994	return $_[0]{bufr} if (!$_[0]->eob());
227	197	50				665	if (defined($_[0]{fh})) {
228	0					0	$_[0]{bufp} = 0;
229	0					0	$_[0]{buf} = $_[0]{fh}->getline;
230	0					0	$_[0]{bufr} = \$_[0]{buf};
231	0					0	$_[0]{yylineno} = $_[0]{fh}->input_line_number;
232	0	0				0	return defined($_[0]{buf}) ? $_[0]{bufr} : undef;
233							}
234	197					623	return undef;
235							}
236
237							##======================================================================
238							## Runtime lexer accessors
239
240							## $yytext = $lex->yytext
241							## + always defined; otherwise using $lex->{yytext} is faster
242	2	50		2	0	42	sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; }
243
244							## $yytype = $lex->yytype
245							## + always defined; otherwise using $lex->{yytype} is faster
246	2	50		2	0	26	sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; }
247
248							## $line = $lex->yylineno()
249							## + returns current line number
250							sub yylineno {
251	2			2	0	11	return $_[0]{yylineno};
252							}
253
254							## $pos = $lex->yycolumn()
255							## + return column at which current token starts (if any)
256							sub yycolumn {
257	0	0	0	0	0	0	return ($_[0]{bufp}\|\|0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0);
258							}
259
260							## $pos = $lex->yypos()
261							## + return byte position in current line (or input string)
262							sub yypos {
263	0		0	0	0	0	return ($_[0]{bufp}\|\|0);
264							}
265
266							## $string = $lex->yyerror(@msg)
267							## + create a helpful error message
268							sub yyerror {
269	0			0	0	0	my $lex = shift;
270	0					0	confess(ref($lex).": error in ".$lex->yywhere().join('',@_));
271							}
272
273							## $string = $lex->yywhere()
274							## + location string used by yyerror()
275							sub yywhere {
276	0			0	0	0	my $lex = shift;
277							return ("$lex->{src} at "
278							.(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '')
279							."column ".$lex->yycolumn
280	0	0				0	.", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__')
		0
281							);
282							}
283
284							## $q = $lex->yypushq($new_state)
285							sub yypushq {
286	19			19	0	44	my ($lex,$qnew) = @_;
287	19					34	push(@{$lex->{stack}},$lex->{state});
	19					53
288	19					46	return $lex->{state} = $qnew;
289							}
290
291							## $q = $lex->yypopq()
292							sub yypopq {
293	21			21	0	42	my $lex = shift;
294	21		100			34	return $lex->{state} = pop(@{$lex->{stack}}) \|\| 'INITIAL';
295							}
296
297							##======================================================================
298							## Runtime lexer routines
299
300							## ($typ,$text) = $lex->yylex()
301							## + get next token from input stream
302							sub yylex {
303	1057			1057	0	1621	my $lex = shift;
304	1057					1686	my ($bufr,$type,$text,$match,@part);
305							#use re 'eval'; ##-- dangerous!
306							LEXBUF:
307	1057					2167	while (!$lex->eof()) {
308	878					1611	$bufr = $lex->{bufr};
309	878					2173	pos($$bufr) = $lex->{bufp};
310
311							LEXSKIP:
312							##------------------------------------
313							## LEXSKIP: main lexer loop
314	878					1535	while (1) {
315	1241					2089	$type = $text = $match = undef;
316	1241					1922	@part = qw();
317
318							##------------------------
319	1241	100	100			3410	if ($lex->{state} eq 'INITIAL' \|\| $lex->{state} eq 'Q_MATCHID') {
		100
		100
		100
		100
		50
		50
320
321							##-- end-of-file (should be first pattern)
322	1137	100	100			54219	if ($$bufr =~ m/\G\z/) { $type = '__EOF__'; }
	18	100	100			38
		100	66
		100	33
		100	33
		50
		100
		100
		100
		100
		50
		50
		100
		100
		100
		100
		100
		100
		50
		100
		100
		50
		50
		50
		50
		50
		100
		50
		50
		50
		50
		50
		50
		50
		50
		50
		100
		50
		50
		100
		50
		50
		50
		100
		100
		50
		50
		50
		50
		100
		100
		100
		50
		50
		50
		50
		100
		50
		50
		100
		100
		100
		100
		50
		100
		50
		50
		100
		50
		100
		50
		50
		50
		100
		100
		100
		100
		100
		50
		50
		50
		0
323
324							##-- comments
325	14					29	elsif ($$bufr =~ m/\G\#:[^\n]*/sp) { $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); }
	14					20
	14					65
326	19					40	elsif ($$bufr =~ m/\G\#\[/p) { $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); }
	19					54
	19					60
327
328							##-- operators
329	21					41	elsif ($$bufr =~ m/\G\&\&/p) { $type = 'OP_BOOL_AND'; }
330	2					6	elsif ($$bufr =~ m/\G\\|\\|/p) { $type = 'OP_BOOL_OR'; }
331	0					0	elsif ($$bufr =~ m/\Gnear/pi) { $type = 'NEAR'; }
332	2					6	elsif ($$bufr =~ m/\G(?:\!=\|\&\!=\|\&=\s\!\|\!with\|with(?:out\|\s\!))/pi) { $type = 'WITHOUT'; }
333	2					7	elsif ($$bufr =~ m/\G(?:\\|=\|withor\|orwith\|wor)/pi) { $type = 'WITHOR'; }
334	2					6	elsif ($$bufr =~ m/\G(?:\&=\|with)/pi) { $type = 'WITH'; }
335	28					64	elsif ($$bufr =~ m/\Gcount/pi) { $type = 'COUNT'; }
336	0					0	elsif ($$bufr =~ m/\Gkeys/pi) { $type = 'KEYS'; }
337
338							##-- count-by keywords
339	0					0	elsif ($$bufr =~ m/\G(?:file\|doc)_?id/pi) { $type = 'KW_FILEID'; }
340	2					6	elsif ($$bufr =~ m/\G(?:file\|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; }
341	2					5	elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' }
	2					5
342
343							##-- query operators
344	17					34	elsif ($$bufr =~ m/\G\#(?:comment\|cmt)/pi) { $type = 'KW_COMMENT'; }
345	2					7	elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?\|n))/pi) { $type = 'CNTXT'; }
346	2					7	elsif ($$bufr =~ m/\G\#(?:with)?in/pi) { $type = 'WITHIN'; }
347	3					7	elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?\|nojoin)(?:_hits)?/pi) { $type = 'SEPARATE_HITS'; }
348	0					0	elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?\|join)(?:_hits)?/pi) { $type = 'NOSEPARATE_HITS'; }
349	4					12	elsif ($$bufr =~ m/\G\#(?:is_\|has_)?date/pi) { $type = 'IS_DATE'; }
350	2					7	elsif ($$bufr =~ m/\G\#has(?:_field)?/pi) { $type = 'HAS_FIELD'; }
351	0					0	elsif ($$bufr =~ m/\G\#file(?:_?)names/pi) { $type = 'FILENAMES_ONLY'; }
352	0					0	elsif ($$bufr =~ m/\G\#debug_rank/pi) { $type = 'DEBUG_RANK'; }
353	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_rank/pi) { $type = 'GREATER_BY_RANK'; }
354	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_rank/pi) { $type = 'LESS_BY_RANK'; }
355	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_date/pi) { $type = 'GREATER_BY_DATE'; }
356	10					26	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_date/pi) { $type = 'LESS_BY_DATE'; }
357	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?_size/pi) { $type = 'GREATER_BY_SIZE'; }
358	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?_size/pi) { $type = 'LESS_BY_SIZE'; }
359	0					0	elsif ($$bufr =~ m/\G\#(?:is_\|has_)?size/pi) { $type = 'IS_SIZE'; }
360	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?left/pi) { $type = 'LESS_BY_LEFT'; }
361	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)left/pi) { $type = 'GREATER_BY_LEFT'; }
362	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?right/pi) { $type = 'LESS_BY_RIGHT'; }
363	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)right/pi) { $type = 'GREATER_BY_RIGHT'; }
364	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)_(?:by_)?)?mid(?:dle)?/pi) { $type = 'LESS_BY_MIDDLE'; }
365	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)_(?:by_)?)mid(?:dle)?/pi) { $type = 'GREATER_BY_MIDDLE'; }
366	2					7	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)(?:_by)?_key)/pi) { $type = 'LESS_BY_KEY'; }
367	0					0	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)(?:_by)?_key)/pi) { $type = 'GREATER_BY_KEY'; }
368	0					0	elsif ($$bufr =~ m/\G\#(?:(?:less\|asc)(?:_by)?_(?:count\|val(?:ue)?))/pi) { $type = 'LESS_BY_COUNT'; }
369	2					8	elsif ($$bufr =~ m/\G\#(?:(?:greater\|de?sc)(?:_by)?_(?:count\|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; }
370	0					0	elsif ($$bufr =~ m/\G\#(?:less\|asc)(?:_by)?/pi) { $type = 'LESS_BY'; }
371	0					0	elsif ($$bufr =~ m/\G\#(?:greater\|de?sc)(?:_by)?/pi) { $type = 'GREATER_BY'; }
372	0					0	elsif ($$bufr =~ m/\G\#rand(?:om)?/pi) { $type = 'RANDOM'; }
373	28					62	elsif ($$bufr =~ m/\G\#by/pi) { $type = 'BY'; }
374	2					8	elsif ($$bufr =~ m/\G\#samp(?:le)?/pi) { $type = 'SAMPLE'; }
375	0					0	elsif ($$bufr =~ m/\G\#clim(?:it)?/pi) { $type = 'CLIMIT'; }
376
377							##-- regexes
378	0					0	elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0					0
	0					0
379	0					0	elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po) { $type='NEG_REGEX'; $text=$1; }
	0					0
380	0					0	elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0					0
	0					0
381	11					28	elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po) { $type='REGEX'; $text=$1; }
	11					31
382	2					6	elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po) { $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; }
	2					7
	2					5
383
384							##-- punctutation & special characters
385	2					85	elsif ($$bufr =~ m/\G#=/p) { $type = 'HASH_EQUAL'; } ##-- hash+equal: exact distance
386	0					0	elsif ($$bufr =~ m/\G#
387	0					0	elsif ($$bufr =~ m/\G#>/p) { $type = 'HASH_GREATER'; } ##-- hash+greater: min distance
388	0					0	elsif ($$bufr =~ m/\G\$\./p) { $type = 'DOLLAR_DOT'; } ##-- positional anchor pseudo-index
389	0					0	elsif ($$bufr =~ m/\G\:\{/p) { $type = 'COLON_LBRACE'; } ##-- theusaurus-query operator
390	4					12	elsif ($$bufr =~ m/\G\@\{/p) { $type = 'AT_LBRACE'; } ##-- literal-set operator
391	0					0	elsif ($$bufr =~ m/\G\*\{/p) { $type = 'STAR_LBRACE'; } ##-- prefix-set opener
392	0					0	elsif ($$bufr =~ m/\G\}\*/p) { $type = 'RBRACE_STAR'; } ##-- suffix-set closer
393	74					224	elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p) { $type = ${^MATCH}; } ##-- single-char punctuation operators
394	164					464	elsif ($$bufr =~ m/\G[\[\]{}()<>]/p) { $type = ${^MATCH}; } ##-- parentheses
395	4					14	elsif ($$bufr =~ m/\G\"/p) { $type = ${^MATCH}; } ##-- double-quotes
396
397							##-- subcorpus path-lists
398	26					73	elsif ($$bufr =~ m/\G\:/p) { $type = ${^MATCH}; $lex->{state}='Q_CORPORA'; }
	26					55
399
400							##-- truncated symbols
401	0					0	elsif ($$bufr =~ m/\G\\'($DEF{sq_text})\'\/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated quoted string (infix symbol)
	0					0
402	2					8	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated quoted string (prefix symbol)
	2					6
403	0					0	elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated quoted string (suffix symbol)
	0					0
404
405	0					0	elsif ($$bufr =~ m/\G\($DEF{symbol_text})\/po) { $type='INFIX'; $text=$1; } ##-- dual-truncated bareword (infix symbol)
	0					0
406	2					7	elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po) { $type='PREFIX'; $text=$1; } ##-- right-truncated bareword (prefix symbol)
	2					7
407	0					0	elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po) { $type='SUFFIX'; $text=$1; } ##-- left-truncated bareword (suffix symbol)
	0					0
408
409							##-- index names (special handling to allow count(foo) #by[$w-1]
410	24					62	elsif ($$bufr =~ m/\G\$($DEF{index_name})/po) { $type='INDEX'; $text=$1; }
	24					64
411	0					0	elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po) { $type='INDEX'; $text=$1; }
	0					0
412	0					0	elsif ($$bufr =~ /m\G\$/p) { $type='$'; }
413
414							##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match)
415
416							##-- single-term wildcard
417	0					0	elsif ($$bufr =~ m/\G\/p) { $type = ''; }
418
419							##-- term expander pipelines
420	4					13	elsif ($$bufr =~ m/\G\\|/p) { $lex->{state}='Q_XPIPE'; $type='__SKIP__'; }
	4					7
421
422							##-- symbols: quoted
423							elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) {
424	39					133	$text = $1;
425	39	100				203	if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; }
	10	100				21
426	6					20	elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; }
427	23					46	else { $type='SYMBOL'; }
428							}
429							##-- symbols: barewords
430							elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) {
431	269					834	$text = ${^MATCH};
432	269					441	$match = ${^MATCH};
433	269	100				1307	if ($text =~ m/^[\+\-]?[0-9]+\z/) { $type='INTEGER'; }
	36	100				86
434	6					16	elsif ($text =~ m/^[0-9-]+\z/) { $type='DATE'; }
435	227					427	else { $type='SYMBOL'; }
436							}
437							##-- barewords: integers and dates (Q_MATCHID)
438	14					48	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po) { $type='INTEGER'; }
439	26					143	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p) { $type='INTEGER'; }
440
441	0					0	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po) { $type='DATE'; }
442	0					0	elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po) { $type='DATE'; }
443
444							##-- misc
445	284					785	elsif ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; }
446							#elsif ($$bufr =~ m/\G./p) { $type = 'SYMBOL'; }
447	0					0	elsif ($$bufr =~ m/\G./p) { $type = '__ERROR__'; }
448
449	1137	100				3013	$match = ${^MATCH} if (!defined($match));
450							}
451							##------------------------
452							elsif ($lex->{state} eq 'Q_CORPORA') {
453	58	100				946	if ($$bufr =~ m/\G$DEF{corpus_text}/p) { $type='SYMBOL'; }
	22	100				53
		100
		50
454	18					42	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $type='SYMBOL'; $text=$1; }
	18					41
455	16					48	elsif ($$bufr =~ m/\G\,/p) { $type=${^MATCH}; }
456	0					0	elsif ($$bufr =~ m/\G\s+/p) { $type='__SKIP__'; }
457	2					8	else { $type='__SKIP__'; $lex->yypopq(); }
	2					8
458
459	58					124	$match = ${^MATCH};
460							}
461							##------------------------
462							elsif ($lex->{state} eq 'Q_COMMENT') {
463	38	100				450	if ($$bufr =~ m/\G\]/p) {
		50
464	19					41	$type='__SKIP__';
465	19					45	$lex->{_cmtbuf} .= ${^MATCH};
466	19					41	push(@{$lex->{comments}}, $lex->{_cmtbuf});
	19					50
467	19					43	delete $lex->{_cmtbuf};
468	19					50	$lex->yypopq();
469							}
470	19					46	elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; }
	19					54
471	0					0	else { $type='__ERROR__'; }
472
473	38					91	$match = ${^MATCH};
474							}
475							##------------------------
476							elsif ($lex->{state} eq 'Q_DATE') {
477	2	50				21	if ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; }
	0	50				0
478	0					0	elsif ($$bufr =~ m/^\//) { $type = '/'; $lex->{state}='INITIAL'; }
	0					0
479	2					5	else { $lex->{state}='INITIAL'; $type='__SKIP__'; }
	2					6
480
481	2					5	$match = ${^MATCH};
482							}
483							##------------------------
484							elsif ($lex->{state} eq 'Q_REGREP') {
485	2	50				149	if ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; }
	0	50				0
	0					0
	0					0
486	2					7	elsif ($$bufr =~ m/\G($DEF{regex_text})\//po) { $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL'; }
	2					7
	2					6
487
488	2					8	$match = ${^MATCH};
489							}
490							##------------------------
491							elsif ($lex->{state} eq 'Q_REGOPT') {
492	0	0				0	if ($$bufr =~ m/\G$DEF{regex_modifier}+/po) { $type='REGOPT'; }
	0					0
493	0					0	else { $type='__SKIP__'; }
494	0					0	$lex->{state} = 'INITIAL';
495
496	0					0	$match = ${^MATCH};
497							}
498							##------------------------
499							elsif ($lex->{state} eq 'Q_XPIPE') {
500	4	50				218	if ($$bufr =~ m/\G\s+/p) { $type = '__SKIP__'; } ##-- whitespace: skip
	0	50				0
		50
		50
501	0					0	elsif ($$bufr =~ m/\G\-/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
	0					0
502	0					0	elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; }
	0					0
	0					0
503	4					12	elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
	4					9
504							#elsif ($$bufr =~ m/\G\z/p) { $lex->{state}='INITIAL'; $type='EXPANDER'; }
505	0					0	else { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; }
	0					0
	0					0
506
507	4					11	$match = ${^MATCH};
508							}
509							##------------------------
510							## END perl-ification of flex sources
511
512							##-- guts
513	1241	100				2582	$text = $match if (!defined($text));
514	1241	100				3196	$lex->{bufp} += length($match) if (defined($match));
515
516	1241					2852	pos($$bufr) = $lex->{bufp};
517	1241	50				2888	return if (!defined($type));
518
519	1241	100				2573	next LEXSKIP if ($type eq '__SKIP__');
520	878	100				1667	next LEXBUF if ($type eq '__EOF__');
521							#elsif ($type eq '__ERROR__') {
522							# return $lex->yyerror();
523							#}
524	860					4932	return @$lex{qw(yytype yytext)} = ($type,$text);
525							}
526							}
527	197					790	return @$lex{qw(yytype yytext)} = ('__EOF__',undef);
528							}
529
530							##======================================================================
531							## Testing: dummy lexing
532
533							## undef = $lex->dummylex(@from_whatever)
534							sub dummylex {
535	0			0	0		my $lex = shift;
536	0						$lex->reset();
537	0						$lex->from(@_);
538	0						my ($type,$text);
539							TOKEN:
540	0						while(1) {
541	0						($type,$text) = $lex->yylex();
542							print("-" x 64, "\n",
543							">> Line: ", $lex->yylineno, ", Pos: ", $lex->yypos, "\n",
544	0	0					">> State: ", (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n",
		0
		0
545							">> Type: ", (defined($type) ? $type : '(undef)'), "\n",
546							">> Text: ", (defined($text) ? $text : '(undef)'), "\n",
547							);
548	0	0					if (!defined($type)) {
549	0						warn(":: undef type!");
550	0						return;
551							}
552	0	0					if ($type eq '__ERROR__') {
553	0						print(":: ERROR DETECTED\n");
554	0						$lex->yyerror();
555							}
556	0	0					last if ($type eq '__EOF__');
557							}
558							}
559
560
561							1; ##-- be happy
562
563							__END__