| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | ##-*- Mode: CPerl -*- | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | ## File: DDC::::yyqlexer.pm | 
| 4 |  |  |  |  |  |  | ## Author: Bryan Jurish | 
| 5 |  |  |  |  |  |  | ## Description: | 
| 6 |  |  |  |  |  |  | ##  + lexer for ddc queries (formerly DDC::Query::yylexer) | 
| 7 |  |  |  |  |  |  | ##  + last updated for ddc v2.1.15 | 
| 8 |  |  |  |  |  |  | ##====================================================================== | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | package DDC::PP::yyqlexer; | 
| 11 | 20 |  |  | 20 |  | 603 | use 5.010;  ##-- we need at least v5.10.0 for /p regex modifier | 
|  | 20 |  |  |  |  | 76 |  | 
| 12 | 20 |  |  | 20 |  | 117 | use Encode qw(encode_utf8 decode_utf8); | 
|  | 20 |  |  |  |  | 40 |  | 
|  | 20 |  |  |  |  | 1086 |  | 
| 13 | 20 |  |  | 20 |  | 121 | use Carp; | 
|  | 20 |  |  |  |  | 39 |  | 
|  | 20 |  |  |  |  | 1222 |  | 
| 14 | 20 |  |  | 20 |  | 136 | use IO::File; | 
|  | 20 |  |  |  |  | 43 |  | 
|  | 20 |  |  |  |  | 2954 |  | 
| 15 | 20 |  |  | 20 |  | 153 | use IO::Handle; | 
|  | 20 |  |  |  |  | 41 |  | 
|  | 20 |  |  |  |  | 833 |  | 
| 16 | 20 |  |  | 20 |  | 128 | use strict; | 
|  | 20 |  |  |  |  | 51 |  | 
|  | 20 |  |  |  |  | 7378 |  | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | ##====================================================================== | 
| 19 |  |  |  |  |  |  | ## Globals etc. | 
| 20 |  |  |  |  |  |  | our @ISA = qw(); | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | ##---------------------------------------------------------------------- | 
| 23 |  |  |  |  |  |  | ## Globals: regexes for Parse::Lex lexer token regexes | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | ## %DEF | 
| 26 |  |  |  |  |  |  | ##   + common shared regex definitions | 
| 27 |  |  |  |  |  |  | our (%DEF); | 
| 28 |  |  |  |  |  |  | BEGIN { | 
| 29 |  |  |  |  |  |  | ##-- adapted from ddc-2.1.1/src/ConcordLib/yyQLexer.l ; extra escapes needed for backslashes ('\\' -> '\\\\') | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | ##-- whitespace | 
| 32 | 20 |  |  | 20 |  | 96 | $DEF{ws} = '[ \t\n\r\f\x0b]'; | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | ##-- integer boundary characters | 
| 35 | 20 |  |  |  |  | 48 | $DEF{int_boundary} = '[ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\/\\\\\'\".$@_+-]'; | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | ##-- bareword symbols | 
| 38 | 20 |  |  |  |  | 50 | $DEF{symbol_cfirst}  = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\".$@]'; | 
| 39 | 20 |  |  |  |  | 50 | $DEF{symbol_crest}   = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\\/\'\"]'; | 
| 40 | 20 |  |  |  |  | 54 | $DEF{symbol_cescape} = '(?:\\\\.)'; | 
| 41 | 20 |  |  |  |  | 141 | $DEF{symbol_text}    = "(?:$DEF{symbol_cescape}|$DEF{symbol_cfirst})(?:$DEF{symbol_cescape}|$DEF{symbol_crest})*"; | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | ##-- subcorpus symbols (ddc >= v2.2.0; also allow '!' here) | 
| 44 | 20 |  |  |  |  | 46 | $DEF{corpus_cfirst} =	'[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"$@]'; | 
| 45 | 20 |  |  |  |  | 51 | $DEF{corpus_crest}  =	'[^ \t\n\r\f\v\0&|?^%,:;#=~(){}<>\[\]\\\'\"]'; | 
| 46 | 20 |  |  |  |  | 78 | $DEF{corpus_text}   =	"(?:$DEF{symbol_cescape}|$DEF{corpus_cfirst})(?:$DEF{symbol_cescape}|$DEF{corpus_crest})*"; | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | ##-- bareword index names (underscore and digits ok, but no '.', '-', or '+') | 
| 49 | 20 |  |  |  |  | 67 | $DEF{index_char} = '[^ \t\n\r\f\x0b\0&|!?^%,:;#*=~(){}<>\[\]\\\\/\'\".$@+-]'; | 
| 50 | 20 |  |  |  |  | 65 | $DEF{index_name} = "(?:$DEF{index_char}|$DEF{symbol_cescape})+"; | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | ##-- single-quoted symbols | 
| 53 | 20 |  |  |  |  | 49 | $DEF{sq_text} = "(?:[^\']|$DEF{symbol_cescape})*"; | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | ##-- bareword dates (>= 4 digits of year, otherwise breaks {count(...) #by[$w=1-1]}) | 
| 56 | 20 |  |  |  |  | 47 | $DEF{date_bare} = "[+-]?[0-9]{4,}(-[0-9]{1,2}){1,2}"; | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | ##-- regexes | 
| 59 | 20 |  |  |  |  | 46 | $DEF{regex_text}     = "(?:(?:\\\\.)|[^\\\\/])*"; | 
| 60 | 20 |  |  |  |  | 34 | $DEF{regex_modifier} = '[dgimsx]'; | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  | ##-- comments | 
| 63 | 20 |  |  |  |  | 77 | $DEF{comment_text}   = "(?:\\\\.|[^\\]])*"; | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | ##-- compile patterns | 
| 66 | 20 |  |  |  |  | 218 | foreach (keys %DEF) { | 
| 67 |  |  |  |  |  |  | #print STDERR __PACKAGE__, ": compiling regex macro: $_ ~ /$DEF{$_}/\n"; | 
| 68 | 320 |  |  |  |  | 12096 | $DEF{$_} = qr/$DEF{$_}/; | 
| 69 |  |  |  |  |  |  | } | 
| 70 |  |  |  |  |  |  | } | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | ##====================================================================== | 
| 73 |  |  |  |  |  |  | ## $lex = $CLASS_OR_OBJ->new(%args) | 
| 74 |  |  |  |  |  |  | ## + abstract constructor | 
| 75 |  |  |  |  |  |  | ## + %$lex, %args: | 
| 76 |  |  |  |  |  |  | ##   { | 
| 77 |  |  |  |  |  |  | ##    src  => $name,    ##-- source name | 
| 78 |  |  |  |  |  |  | ##    fh   => $srcfh,   ##-- source filehandle | 
| 79 |  |  |  |  |  |  | ##    bufr => \$buf,    ##-- source buffer (string reference) | 
| 80 |  |  |  |  |  |  | ##    bufp => $pos,     ##-- current pos() in source buffer | 
| 81 |  |  |  |  |  |  | ##    buf  => $buf,     ##-- local buffer (for filehandle input) | 
| 82 |  |  |  |  |  |  | ##    state => $q,      ##-- symbolic state name (default: 'INITIAL') | 
| 83 |  |  |  |  |  |  | ##    stack => \@stack, ##-- state stack | 
| 84 |  |  |  |  |  |  | ## | 
| 85 |  |  |  |  |  |  | ##    ##-- utf-8 or byte mode? | 
| 86 |  |  |  |  |  |  | ##    utf8 => $bool,    ##-- whether to use utf8 or byte-mode (default: true (non-compatible but handy)) | 
| 87 |  |  |  |  |  |  | ## | 
| 88 |  |  |  |  |  |  | ##    ##-- runtime data | 
| 89 |  |  |  |  |  |  | ##    yytext => $text,  ##-- current text | 
| 90 |  |  |  |  |  |  | ##    yytype => $type,  ##-- current token type | 
| 91 |  |  |  |  |  |  | ##    yylineno => $line, ##-- current source line (file input only) | 
| 92 |  |  |  |  |  |  | ##   } | 
| 93 |  |  |  |  |  |  | sub new { | 
| 94 | 15 |  |  | 15 | 0 | 46 | my $that = shift; | 
| 95 | 15 |  | 33 |  |  | 249 | my $lex = bless({ | 
| 96 |  |  |  |  |  |  | src =>undef, | 
| 97 |  |  |  |  |  |  | fh   =>undef, | 
| 98 |  |  |  |  |  |  | bufr =>undef, | 
| 99 |  |  |  |  |  |  | bufp =>undef, | 
| 100 |  |  |  |  |  |  | buf  =>undef, | 
| 101 |  |  |  |  |  |  | utf8 =>1, | 
| 102 |  |  |  |  |  |  | state => 'INITIAL', | 
| 103 |  |  |  |  |  |  | stack => [], | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | yytext=>undef, | 
| 106 |  |  |  |  |  |  | yytype=>undef, | 
| 107 |  |  |  |  |  |  | yylineno=>undef, | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | ##-- lexer comment retention hacks | 
| 110 |  |  |  |  |  |  | comments=>[], | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  | ##-- user args | 
| 113 |  |  |  |  |  |  | @_ | 
| 114 |  |  |  |  |  |  | }, | 
| 115 |  |  |  |  |  |  | ref($that)||$that | 
| 116 |  |  |  |  |  |  | ); | 
| 117 | 15 |  |  |  |  | 141 | return $lex; | 
| 118 |  |  |  |  |  |  | } | 
| 119 |  |  |  |  |  |  |  | 
| 120 |  |  |  |  |  |  | ## $lex = $lex->clear() | 
| 121 |  |  |  |  |  |  | ##  + clear lexer buffer, source, etc | 
| 122 |  |  |  |  |  |  | sub clear { | 
| 123 | 398 |  |  | 398 | 0 | 598 | my $lex = shift; | 
| 124 | 398 |  |  |  |  | 1298 | delete @$lex{qw(src fh bufr bufp buf yytext yytype yylineno)}; | 
| 125 | 398 |  |  |  |  | 680 | $lex->{state} = 'INITIAL'; | 
| 126 | 398 |  |  |  |  | 568 | @{$lex->{stack}} = qw(); | 
|  | 398 |  |  |  |  | 772 |  | 
| 127 | 398 |  |  |  |  | 604 | @{$lex->{comments}} = qw(); | 
|  | 398 |  |  |  |  | 633 |  | 
| 128 | 398 |  |  |  |  | 574 | delete $lex->{_cmtbuf}; | 
| 129 | 398 |  |  |  |  | 667 | return $lex; | 
| 130 |  |  |  |  |  |  | } | 
| 131 | 20 |  |  | 20 |  | 95152 | BEGIN { *reset = *close = \&clear; } | 
| 132 |  |  |  |  |  |  |  | 
| 133 |  |  |  |  |  |  | ##====================================================================== | 
| 134 |  |  |  |  |  |  | ## I/O | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | ## $lex = $lex->from($which,$src, %opts) | 
| 137 |  |  |  |  |  |  | ##  + $which is one of qw(fh file string) | 
| 138 |  |  |  |  |  |  | ##  + $src is the actual source (default: 'string') | 
| 139 |  |  |  |  |  |  | sub from { | 
| 140 | 199 |  |  | 199 | 0 | 457 | my ($lex,$which,$src,%opts) = @_; | 
| 141 | 199 | 50 |  |  |  | 474 | return $lex->fromFh($src,%opts) if ($which eq 'fh'); | 
| 142 | 199 | 50 |  |  |  | 393 | return $lex->fromFile($src,%opts) if ($which eq 'file'); | 
| 143 | 199 |  |  |  |  | 563 | return $lex->fromString($src,%opts); | 
| 144 |  |  |  |  |  |  | } | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | ## $lex = $lex->fromFile($filename_or_handle,%opts) | 
| 147 |  |  |  |  |  |  | sub fromFile { | 
| 148 | 0 |  |  | 0 | 0 | 0 | my ($lex,$file,%opts) = @_; | 
| 149 | 0 | 0 |  |  |  | 0 | return $lex->fromFh($file,%opts) if (ref($file)); | 
| 150 | 0 | 0 |  |  |  | 0 | my $fh = IO::File->new("<$file") | 
| 151 |  |  |  |  |  |  | or confess("cannot open '$file' for read: $!"); | 
| 152 | 0 | 0 |  |  |  | 0 | binmode($fh,':encoding(utf8)') if ($lex->{utf8}); | 
| 153 | 0 |  |  |  |  | 0 | return $lex->fromFh($fh,src=>"file \`$file'",%opts); | 
| 154 |  |  |  |  |  |  | } | 
| 155 |  |  |  |  |  |  |  | 
| 156 |  |  |  |  |  |  | our $FH_SLURP=0; ##-- DEBUG: slurp whole files instead of line-wise input | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | ## $lex = $lex->fromFh($fh,%opts) | 
| 159 |  |  |  |  |  |  | ##  + uses native $fh encoding | 
| 160 |  |  |  |  |  |  | sub fromFh { | 
| 161 | 0 |  |  | 0 | 0 | 0 | my ($lex,$fh,%opts) = @_; | 
| 162 | 0 | 0 |  |  |  | 0 | if ($FH_SLURP) { | 
| 163 |  |  |  |  |  |  | ##-- always use string mode | 
| 164 | 0 |  |  |  |  | 0 | local $/=undef; | 
| 165 | 0 |  |  |  |  | 0 | my $buf = $fh->getline; | 
| 166 | 0 |  |  |  |  | 0 | $fh->close(); | 
| 167 | 0 |  |  |  |  | 0 | return $lex->fromString(\$buf,src=>"filehandle \`$fh'",%opts); | 
| 168 |  |  |  |  |  |  | } | 
| 169 |  |  |  |  |  |  | ##-- line-wise buffering | 
| 170 | 0 |  |  |  |  | 0 | $lex->clear(); | 
| 171 | 0 |  |  |  |  | 0 | @$lex{keys %opts} = values(%opts); | 
| 172 | 0 |  |  |  |  | 0 | $lex->{fh}   = $fh; | 
| 173 | 0 |  |  |  |  | 0 | $lex->{buf}  = undef; | 
| 174 | 0 |  |  |  |  | 0 | $lex->{bufr} = \$lex->{buf}; | 
| 175 | 0 |  |  |  |  | 0 | $lex->{bufp} = 0; | 
| 176 | 0 | 0 |  |  |  | 0 | $lex->{src} = "filehandle \`$fh'" if (!defined($lex->{src})); | 
| 177 | 0 |  |  |  |  | 0 | return $lex; | 
| 178 |  |  |  |  |  |  | } | 
| 179 |  |  |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | ## $lex = $lex->fromString($str,%opts) | 
| 181 |  |  |  |  |  |  | ## $lex = $lex->fromString(\$str,%opts) | 
| 182 |  |  |  |  |  |  | sub fromString { | 
| 183 | 199 |  |  | 199 | 0 | 426 | my ($lex,$str,%opts) = @_; | 
| 184 | 199 |  |  |  |  | 477 | $lex->clear(); | 
| 185 | 199 | 50 |  |  |  | 522 | if (ref($str)) { | 
| 186 | 199 |  |  |  |  | 357 | $lex->{bufr} = $str; | 
| 187 | 199 | 50 |  |  |  | 903 | $lex->{src} = "buffer \`$str'" if (!defined($lex->{src})); | 
| 188 |  |  |  |  |  |  | } else { | 
| 189 | 0 |  |  |  |  | 0 | $lex->{bufr} = \$str; | 
| 190 | 0 | 0 |  |  |  | 0 | $lex->{src} = "string \`$str'" if (!defined($lex->{src})); | 
| 191 |  |  |  |  |  |  | } | 
| 192 | 199 |  |  |  |  | 418 | $lex->{bufp} = 0; | 
| 193 | 199 |  |  |  |  | 321 | $lex->{yylineno} = 0; | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | ##-- utf8 checks | 
| 196 | 199 | 50 | 33 |  |  | 532 | if ($lex->{utf8} && !utf8::is_utf8(${$lex->{bufr}})) { | 
|  | 199 | 0 | 0 |  |  | 793 |  | 
| 197 |  |  |  |  |  |  | ##-- lexer:utf8, string:bytes --> assume string is utf8-encoded | 
| 198 | 199 | 50 |  |  |  | 332 | ${$lex->{bufr}} = decode_utf8(${$lex->{bufr}}) if (!utf8::is_utf8(${$lex->{bufr}})); | 
|  | 199 |  |  |  |  | 4864 |  | 
|  | 199 |  |  |  |  | 707 |  | 
|  | 199 |  |  |  |  | 533 |  | 
| 199 |  |  |  |  |  |  | } | 
| 200 | 0 |  |  |  |  | 0 | elsif (!$lex->{utf8} && utf8::is_utf8(${$lex->{bufr}})) { | 
| 201 |  |  |  |  |  |  | ##-- lexer:bytes, string:utf8 --> encode as utf8 octets | 
| 202 | 0 |  |  |  |  | 0 | ${$lex->{bufr}} = encode_utf8(${$lex->{bufr}}); | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  |  | 
| 205 | 199 |  |  |  |  | 763 | return $lex; | 
| 206 |  |  |  |  |  |  | } | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | ##====================================================================== | 
| 209 |  |  |  |  |  |  | ## Utilities | 
| 210 |  |  |  |  |  |  |  | 
| 211 |  |  |  |  |  |  | ## $bool = $lex->eof() | 
| 212 |  |  |  |  |  |  | ##  + true iff at end-of-file | 
| 213 |  |  |  |  |  |  | sub eof { | 
| 214 | 1075 | 50 |  | 1075 | 0 | 2649 | return $FH_SLURP ? $_[0]->eob() : !$_[0]->getmore(); | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | ## $bool = $lex->eob() | 
| 218 |  |  |  |  |  |  | ##  + true at end-of-buffer | 
| 219 |  |  |  |  |  |  | sub eob { | 
| 220 | 1075 |  | 66 | 1075 | 0 | 2978 | return (!$_[0]{bufr} || !${$_[0]{bufr}} || ($_[0]{bufp}||0) >= length(${$_[0]{bufr}})); | 
| 221 |  |  |  |  |  |  | } | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | ## $bufr_or_undef = $lex->getmore() | 
| 224 |  |  |  |  |  |  | ##   + returns true iff there is still data in the buffer | 
| 225 |  |  |  |  |  |  | sub getmore { | 
| 226 | 1075 | 100 |  | 1075 | 0 | 1994 | return $_[0]{bufr} if (!$_[0]->eob()); | 
| 227 | 197 | 50 |  |  |  | 665 | if (defined($_[0]{fh})) { | 
| 228 | 0 |  |  |  |  | 0 | $_[0]{bufp}     = 0; | 
| 229 | 0 |  |  |  |  | 0 | $_[0]{buf}      = $_[0]{fh}->getline; | 
| 230 | 0 |  |  |  |  | 0 | $_[0]{bufr}     = \$_[0]{buf}; | 
| 231 | 0 |  |  |  |  | 0 | $_[0]{yylineno} = $_[0]{fh}->input_line_number; | 
| 232 | 0 | 0 |  |  |  | 0 | return defined($_[0]{buf}) ? $_[0]{bufr} : undef; | 
| 233 |  |  |  |  |  |  | } | 
| 234 | 197 |  |  |  |  | 623 | return undef; | 
| 235 |  |  |  |  |  |  | } | 
| 236 |  |  |  |  |  |  |  | 
| 237 |  |  |  |  |  |  | ##====================================================================== | 
| 238 |  |  |  |  |  |  | ## Runtime lexer accessors | 
| 239 |  |  |  |  |  |  |  | 
| 240 |  |  |  |  |  |  | ## $yytext = $lex->yytext | 
| 241 |  |  |  |  |  |  | ##  + always defined; otherwise using $lex->{yytext} is faster | 
| 242 | 2 | 50 |  | 2 | 0 | 42 | sub yytext { return defined($_[0]{yytext}) ? $_[0]{yytext} : ''; } | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | ## $yytype = $lex->yytype | 
| 245 |  |  |  |  |  |  | ##  + always defined; otherwise using $lex->{yytype} is faster | 
| 246 | 2 | 50 |  | 2 | 0 | 26 | sub yytype { return defined($_[0]{yytype}) ? $_[0]{yytype} : '__EOF__'; } | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | ## $line = $lex->yylineno() | 
| 249 |  |  |  |  |  |  | ##  + returns current line number | 
| 250 |  |  |  |  |  |  | sub yylineno { | 
| 251 | 2 |  |  | 2 | 0 | 11 | return $_[0]{yylineno}; | 
| 252 |  |  |  |  |  |  | } | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  | ## $pos = $lex->yycolumn() | 
| 255 |  |  |  |  |  |  | ##  + return column at which current token starts (if any) | 
| 256 |  |  |  |  |  |  | sub yycolumn { | 
| 257 | 0 | 0 | 0 | 0 | 0 | 0 | return ($_[0]{bufp}||0) - (defined($_[0]{yytext}) ? length($_[0]{yytext}) : 0); | 
| 258 |  |  |  |  |  |  | } | 
| 259 |  |  |  |  |  |  |  | 
| 260 |  |  |  |  |  |  | ## $pos = $lex->yypos() | 
| 261 |  |  |  |  |  |  | ##  + return byte position in current line (or input string) | 
| 262 |  |  |  |  |  |  | sub yypos { | 
| 263 | 0 |  | 0 | 0 | 0 | 0 | return ($_[0]{bufp}||0); | 
| 264 |  |  |  |  |  |  | } | 
| 265 |  |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  | ## $string = $lex->yyerror(@msg) | 
| 267 |  |  |  |  |  |  | ##  + create a helpful error message | 
| 268 |  |  |  |  |  |  | sub yyerror { | 
| 269 | 0 |  |  | 0 | 0 | 0 | my $lex = shift; | 
| 270 | 0 |  |  |  |  | 0 | confess(ref($lex).": error in ".$lex->yywhere().join('',@_)); | 
| 271 |  |  |  |  |  |  | } | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | ## $string = $lex->yywhere() | 
| 274 |  |  |  |  |  |  | ##  + location string used by yyerror() | 
| 275 |  |  |  |  |  |  | sub yywhere { | 
| 276 | 0 |  |  | 0 | 0 | 0 | my $lex = shift; | 
| 277 |  |  |  |  |  |  | return ("$lex->{src} at " | 
| 278 |  |  |  |  |  |  | .(defined($lex->{fh}) ? ("line $lex->{yylineno}, ") : '') | 
| 279 |  |  |  |  |  |  | ."column ".$lex->yycolumn | 
| 280 | 0 | 0 |  |  |  | 0 | .", near ".(defined($lex->{yytext}) ? "\`$lex->{yytext}\'" : '__EOF__') | 
|  |  | 0 |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | ); | 
| 282 |  |  |  |  |  |  | } | 
| 283 |  |  |  |  |  |  |  | 
| 284 |  |  |  |  |  |  | ## $q = $lex->yypushq($new_state) | 
| 285 |  |  |  |  |  |  | sub yypushq { | 
| 286 | 19 |  |  | 19 | 0 | 44 | my ($lex,$qnew) = @_; | 
| 287 | 19 |  |  |  |  | 34 | push(@{$lex->{stack}},$lex->{state}); | 
|  | 19 |  |  |  |  | 53 |  | 
| 288 | 19 |  |  |  |  | 46 | return $lex->{state} = $qnew; | 
| 289 |  |  |  |  |  |  | } | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  | ## $q = $lex->yypopq() | 
| 292 |  |  |  |  |  |  | sub yypopq { | 
| 293 | 21 |  |  | 21 | 0 | 42 | my $lex = shift; | 
| 294 | 21 |  | 100 |  |  | 34 | return $lex->{state} = pop(@{$lex->{stack}}) || 'INITIAL'; | 
| 295 |  |  |  |  |  |  | } | 
| 296 |  |  |  |  |  |  |  | 
| 297 |  |  |  |  |  |  | ##====================================================================== | 
| 298 |  |  |  |  |  |  | ## Runtime lexer routines | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | ## ($typ,$text) = $lex->yylex() | 
| 301 |  |  |  |  |  |  | ##  + get next token from input stream | 
| 302 |  |  |  |  |  |  | sub yylex { | 
| 303 | 1057 |  |  | 1057 | 0 | 1621 | my $lex = shift; | 
| 304 | 1057 |  |  |  |  | 1686 | my ($bufr,$type,$text,$match,@part); | 
| 305 |  |  |  |  |  |  | #use re 'eval'; ##-- dangerous! | 
| 306 |  |  |  |  |  |  | LEXBUF: | 
| 307 | 1057 |  |  |  |  | 2167 | while (!$lex->eof()) { | 
| 308 | 878 |  |  |  |  | 1611 | $bufr       = $lex->{bufr}; | 
| 309 | 878 |  |  |  |  | 2173 | pos($$bufr) = $lex->{bufp}; | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  | LEXSKIP: | 
| 312 |  |  |  |  |  |  | ##------------------------------------ | 
| 313 |  |  |  |  |  |  | ## LEXSKIP: main lexer loop | 
| 314 | 878 |  |  |  |  | 1535 | while (1) { | 
| 315 | 1241 |  |  |  |  | 2089 | $type = $text = $match = undef; | 
| 316 | 1241 |  |  |  |  | 1922 | @part = qw(); | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | ##------------------------ | 
| 319 | 1241 | 100 | 100 |  |  | 3410 | if ($lex->{state} eq 'INITIAL' || $lex->{state} eq 'Q_MATCHID') { | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 320 |  |  |  |  |  |  |  | 
| 321 |  |  |  |  |  |  | ##-- end-of-file (should be first pattern) | 
| 322 | 1137 | 100 | 100 |  |  | 54219 | if    ($$bufr =~ m/\G\z/)	{ $type = '__EOF__'; } | 
|  | 18 | 100 | 100 |  |  | 38 |  | 
|  |  | 100 | 66 |  |  |  |  | 
|  |  | 100 | 33 |  |  |  |  | 
|  |  | 100 | 33 |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 323 |  |  |  |  |  |  |  | 
| 324 |  |  |  |  |  |  | ##-- comments | 
| 325 | 14 |  |  |  |  | 29 | elsif ($$bufr =~ m/\G\#:[^\n]*/sp)	{ $type='__SKIP__'; push(@{$lex->{comments}},${^MATCH}."\n"); } | 
|  | 14 |  |  |  |  | 20 |  | 
|  | 14 |  |  |  |  | 65 |  | 
| 326 | 19 |  |  |  |  | 40 | elsif ($$bufr =~ m/\G\#\[/p)		{ $type='__SKIP__'; $lex->{_cmtbuf}=${^MATCH}; $lex->yypushq('Q_COMMENT'); } | 
|  | 19 |  |  |  |  | 54 |  | 
|  | 19 |  |  |  |  | 60 |  | 
| 327 |  |  |  |  |  |  |  | 
| 328 |  |  |  |  |  |  | ##-- operators | 
| 329 | 21 |  |  |  |  | 41 | elsif ($$bufr =~ m/\G\&\&/p)	{ $type = 'OP_BOOL_AND'; } | 
| 330 | 2 |  |  |  |  | 6 | elsif ($$bufr =~ m/\G\|\|/p)	{ $type = 'OP_BOOL_OR'; } | 
| 331 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\Gnear/pi)	{ $type = 'NEAR'; } | 
| 332 | 2 |  |  |  |  | 6 | elsif ($$bufr =~ m/\G(?:\!=|\&\!=|\&=\s*\!|\!with|with(?:out|\s*\!))/pi)	{ $type = 'WITHOUT'; } | 
| 333 | 2 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G(?:\|=|withor|orwith|wor)/pi) 				{ $type = 'WITHOR'; } | 
| 334 | 2 |  |  |  |  | 6 | elsif ($$bufr =~ m/\G(?:\&=|with)/pi)						{ $type = 'WITH'; } | 
| 335 | 28 |  |  |  |  | 64 | elsif ($$bufr =~ m/\Gcount/pi)	{ $type = 'COUNT'; } | 
| 336 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\Gkeys/pi)	{ $type = 'KEYS'; } | 
| 337 |  |  |  |  |  |  |  | 
| 338 |  |  |  |  |  |  | ##-- count-by keywords | 
| 339 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G(?:file|doc)_?id/pi) { $type = 'KW_FILEID'; } | 
| 340 | 2 |  |  |  |  | 6 | elsif ($$bufr =~ m/\G(?:file|doc)_?(?:name)?/pi) { $type = 'KW_FILENAME'; } | 
| 341 | 2 |  |  |  |  | 5 | elsif ($$bufr =~ m/\Gdate/pi) { $type = 'KW_DATE'; $lex->{state}='Q_DATE' } | 
|  | 2 |  |  |  |  | 5 |  | 
| 342 |  |  |  |  |  |  |  | 
| 343 |  |  |  |  |  |  | ##-- query operators | 
| 344 | 17 |  |  |  |  | 34 | elsif ($$bufr =~ m/\G\#(?:comment|cmt)/pi)			{ $type = 'KW_COMMENT'; } | 
| 345 | 2 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G\#(?:(?:co?n?te?xt?|n))/pi)		{ $type = 'CNTXT'; } | 
| 346 | 2 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G\#(?:with)?in/pi)				{ $type = 'WITHIN'; } | 
| 347 | 3 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G\#(?:sep(?:arate)?|nojoin)(?:_hits)?/pi)	{ $type = 'SEPARATE_HITS'; } | 
| 348 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:nosep(?:arate)?|join)(?:_hits)?/pi)	{ $type = 'NOSEPARATE_HITS'; } | 
| 349 | 4 |  |  |  |  | 12 | elsif ($$bufr =~ m/\G\#(?:is_|has_)?date/pi)			{ $type = 'IS_DATE'; } | 
| 350 | 2 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G\#has(?:_field)?/pi)			{ $type = 'HAS_FIELD'; } | 
| 351 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#file(?:_?)names/pi)			{ $type = 'FILENAMES_ONLY'; } | 
| 352 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#debug_rank/pi)				{ $type = 'DEBUG_RANK'; } | 
| 353 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_rank/pi)	{ $type = 'GREATER_BY_RANK'; } | 
| 354 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_rank/pi)		{ $type = 'LESS_BY_RANK'; } | 
| 355 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_date/pi)	{ $type = 'GREATER_BY_DATE'; } | 
| 356 | 10 |  |  |  |  | 26 | elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_date/pi)		{ $type = 'LESS_BY_DATE'; } | 
| 357 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?_size/pi)	{ $type = 'GREATER_BY_SIZE'; } | 
| 358 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?_size/pi)		{ $type = 'LESS_BY_SIZE'; } | 
| 359 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:is_|has_)?size/pi)			{ $type = 'IS_SIZE'; } | 
| 360 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?left/pi)	{ $type = 'LESS_BY_LEFT'; } | 
| 361 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)left/pi)	{ $type = 'GREATER_BY_LEFT'; } | 
| 362 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?right/pi)	{ $type = 'LESS_BY_RIGHT'; } | 
| 363 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)right/pi)	{ $type = 'GREATER_BY_RIGHT'; } | 
| 364 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:less|asc)_(?:by_)?)?mid(?:dle)?/pi)	{ $type = 'LESS_BY_MIDDLE'; } | 
| 365 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)_(?:by_)?)mid(?:dle)?/pi)	{ $type = 'GREATER_BY_MIDDLE'; } | 
| 366 | 2 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_key)/pi)		{ $type = 'LESS_BY_KEY'; } | 
| 367 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_key)/pi)	{ $type = 'GREATER_BY_KEY'; } | 
| 368 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:(?:less|asc)(?:_by)?_(?:count|val(?:ue)?))/pi)	{ $type = 'LESS_BY_COUNT'; } | 
| 369 | 2 |  |  |  |  | 8 | elsif ($$bufr =~ m/\G\#(?:(?:greater|de?sc)(?:_by)?_(?:count|val(?:ue)?))/pi) { $type = 'GREATER_BY_COUNT'; } | 
| 370 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:less|asc)(?:_by)?/pi)			{ $type = 'LESS_BY'; } | 
| 371 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#(?:greater|de?sc)(?:_by)?/pi)		{ $type = 'GREATER_BY'; } | 
| 372 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#rand(?:om)?/pi)				{ $type = 'RANDOM'; } | 
| 373 | 28 |  |  |  |  | 62 | elsif ($$bufr =~ m/\G\#by/pi)					{ $type = 'BY'; } | 
| 374 | 2 |  |  |  |  | 8 | elsif ($$bufr =~ m/\G\#samp(?:le)?/pi)				{ $type = 'SAMPLE'; } | 
| 375 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\#clim(?:it)?/pi)				{ $type = 'CLIMIT'; } | 
| 376 |  |  |  |  |  |  |  | 
| 377 |  |  |  |  |  |  | ##-- regexes | 
| 378 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po)	{ $type='NEG_REGEX'; $text=$1; $lex->{state}='Q_REGOPT'; } | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 379 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\!\/($DEF{regex_text})\//po)				{ $type='NEG_REGEX'; $text=$1; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 380 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\/($DEF{regex_text})\/(?=$DEF{regex_modifier})/po)		{ $type='REGEX';     $text=$1; $lex->{state}='Q_REGOPT'; } | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 381 | 11 |  |  |  |  | 28 | elsif ($$bufr =~ m/\G\/($DEF{regex_text})\//po)					{ $type='REGEX';     $text=$1; } | 
|  | 11 |  |  |  |  | 31 |  | 
| 382 | 2 |  |  |  |  | 6 | elsif ($$bufr =~ m/\Gs\/($DEF{regex_text})\//po)				{ $type='REGEX_SEARCH'; $text=$1; $lex->{state}='Q_REGREP'; } | 
|  | 2 |  |  |  |  | 7 |  | 
|  | 2 |  |  |  |  | 5 |  | 
| 383 |  |  |  |  |  |  |  | 
| 384 |  |  |  |  |  |  | ##-- punctutation & special characters | 
| 385 | 2 |  |  |  |  | 85 | elsif ($$bufr =~ m/\G#=/p)				{ $type = 'HASH_EQUAL'; }	##-- hash+equal: exact distance | 
| 386 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G# | 
| 387 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G#>/p)				{ $type = 'HASH_GREATER'; }	##-- hash+greater: min distance | 
| 388 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\$\./p)				{ $type = 'DOLLAR_DOT'; }	##-- positional anchor pseudo-index | 
| 389 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\:\{/p)				{ $type = 'COLON_LBRACE'; }	##-- theusaurus-query operator | 
| 390 | 4 |  |  |  |  | 12 | elsif ($$bufr =~ m/\G\@\{/p)				{ $type = 'AT_LBRACE'; }	##-- literal-set operator | 
| 391 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\*\{/p)				{ $type = 'STAR_LBRACE'; }	##-- prefix-set opener | 
| 392 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\}\*/p)				{ $type = 'RBRACE_STAR'; }	##-- suffix-set closer | 
| 393 | 74 |  |  |  |  | 224 | elsif ($$bufr =~ m/\G[!.,;=@%^#~\/]/p)			{ $type = ${^MATCH}; } ##-- single-char punctuation operators | 
| 394 | 164 |  |  |  |  | 464 | elsif ($$bufr =~ m/\G[\[\]{}()<>]/p)			{ $type = ${^MATCH}; } ##-- parentheses | 
| 395 | 4 |  |  |  |  | 14 | elsif ($$bufr =~ m/\G\"/p)				{ $type = ${^MATCH}; } ##-- double-quotes | 
| 396 |  |  |  |  |  |  |  | 
| 397 |  |  |  |  |  |  | ##-- subcorpus path-lists | 
| 398 | 26 |  |  |  |  | 73 | elsif ($$bufr =~ m/\G\:/p)				{ $type = ${^MATCH}; $lex->{state}='Q_CORPORA'; } | 
|  | 26 |  |  |  |  | 55 |  | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | ##-- truncated symbols | 
| 401 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'\*/po) { $type='INFIX';  $text=$1; }	##-- dual-truncated quoted string (infix symbol) | 
|  | 0 |  |  |  |  | 0 |  | 
| 402 | 2 |  |  |  |  | 8 | elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'\*/po)	 { $type='PREFIX'; $text=$1; }	##-- right-truncated quoted string (prefix symbol) | 
|  | 2 |  |  |  |  | 6 |  | 
| 403 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\*\'($DEF{sq_text})\'/po)	 { $type='SUFFIX'; $text=$1; }	##-- left-truncated quoted string (suffix symbol) | 
|  | 0 |  |  |  |  | 0 |  | 
| 404 |  |  |  |  |  |  |  | 
| 405 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\*($DEF{symbol_text})\*/po) { $type='INFIX';  $text=$1; }	##-- dual-truncated bareword (infix symbol) | 
|  | 0 |  |  |  |  | 0 |  | 
| 406 | 2 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G($DEF{symbol_text})\*/po)	 { $type='PREFIX'; $text=$1; }	##-- right-truncated bareword (prefix symbol) | 
|  | 2 |  |  |  |  | 7 |  | 
| 407 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\*($DEF{symbol_text})/po)	 { $type='SUFFIX'; $text=$1; }	##-- left-truncated bareword (suffix symbol) | 
|  | 0 |  |  |  |  | 0 |  | 
| 408 |  |  |  |  |  |  |  | 
| 409 |  |  |  |  |  |  | ##-- index names (special handling to allow count(foo) #by[$w-1] | 
| 410 | 24 |  |  |  |  | 62 | elsif ($$bufr =~ m/\G\$($DEF{index_name})/po)	{ $type='INDEX'; $text=$1; } | 
|  | 24 |  |  |  |  | 64 |  | 
| 411 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\$'($DEF{index_name})'/po)	{ $type='INDEX'; $text=$1; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 412 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ /m\G\$/p)			{ $type='$'; } | 
| 413 |  |  |  |  |  |  |  | 
| 414 |  |  |  |  |  |  | ##-- numeric tokens (handled below with symbols for perl lexer w/o longest-match) | 
| 415 |  |  |  |  |  |  |  | 
| 416 |  |  |  |  |  |  | ##-- single-term wildcard | 
| 417 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\*/p)     			{ $type = '*'; } | 
| 418 |  |  |  |  |  |  |  | 
| 419 |  |  |  |  |  |  | ##-- term expander pipelines | 
| 420 | 4 |  |  |  |  | 13 | elsif ($$bufr =~ m/\G\|/p) 			{ $lex->{state}='Q_XPIPE'; $type='__SKIP__'; } | 
|  | 4 |  |  |  |  | 7 |  | 
| 421 |  |  |  |  |  |  |  | 
| 422 |  |  |  |  |  |  | ##-- symbols: quoted | 
| 423 |  |  |  |  |  |  | elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { | 
| 424 | 39 |  |  |  |  | 133 | $text = $1; | 
| 425 | 39 | 100 |  |  |  | 203 | if    ($text =~ m/^[\+\-]?[0-9]+\z/)			{ $type='INTEGER'; } | 
|  | 10 | 100 |  |  |  | 21 |  | 
| 426 | 6 |  |  |  |  | 20 | elsif ($text =~ m/^[0-9-]+\z/)			{ $type='DATE'; } | 
| 427 | 23 |  |  |  |  | 46 | else							{ $type='SYMBOL'; } | 
| 428 |  |  |  |  |  |  | } | 
| 429 |  |  |  |  |  |  | ##-- symbols: barewords | 
| 430 |  |  |  |  |  |  | elsif ($lex->{state} eq 'INITIAL' && $$bufr =~ m/\G$DEF{symbol_text}/po) { | 
| 431 | 269 |  |  |  |  | 834 | $text  = ${^MATCH}; | 
| 432 | 269 |  |  |  |  | 441 | $match = ${^MATCH}; | 
| 433 | 269 | 100 |  |  |  | 1307 | if    ($text =~ m/^[\+\-]?[0-9]+\z/)			{ $type='INTEGER'; } | 
|  | 36 | 100 |  |  |  | 86 |  | 
| 434 | 6 |  |  |  |  | 16 | elsif ($text =~ m/^[0-9-]+\z/)			{ $type='DATE'; } | 
| 435 | 227 |  |  |  |  | 427 | else							{ $type='SYMBOL'; } | 
| 436 |  |  |  |  |  |  | } | 
| 437 |  |  |  |  |  |  | ##-- barewords: integers and dates (Q_MATCHID) | 
| 438 | 14 |  |  |  |  | 48 | elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+(?=$DEF{int_boundary})/po)	{ $type='INTEGER'; } | 
| 439 | 26 |  |  |  |  | 143 | elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G[\+\-]?[0-9]+\z/p)				{ $type='INTEGER'; } | 
| 440 |  |  |  |  |  |  |  | 
| 441 | 0 |  |  |  |  | 0 | elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}(?=$DEF{int_boundary})/po)	{ $type='DATE'; } | 
| 442 | 0 |  |  |  |  | 0 | elsif ($lex->{state} ne 'INITIAL' && $$bufr =~ m/\G$DEF{date_bare}\z/po)			{ $type='DATE'; } | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | ##-- misc | 
| 445 | 284 |  |  |  |  | 785 | elsif ($$bufr =~ m/\G\s+/p)	{ $type = '__SKIP__'; } | 
| 446 |  |  |  |  |  |  | #elsif ($$bufr =~ m/\G./p)	{ $type = 'SYMBOL'; } | 
| 447 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G./p)	{ $type = '__ERROR__'; } | 
| 448 |  |  |  |  |  |  |  | 
| 449 | 1137 | 100 |  |  |  | 3013 | $match = ${^MATCH} if (!defined($match)); | 
| 450 |  |  |  |  |  |  | } | 
| 451 |  |  |  |  |  |  | ##------------------------ | 
| 452 |  |  |  |  |  |  | elsif ($lex->{state} eq 'Q_CORPORA') { | 
| 453 | 58 | 100 |  |  |  | 946 | if    ($$bufr =~ m/\G$DEF{corpus_text}/p)	{ $type='SYMBOL'; } | 
|  | 22 | 100 |  |  |  | 53 |  | 
|  |  | 100 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 454 | 18 |  |  |  |  | 42 | elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po)	{ $type='SYMBOL'; $text=$1; } | 
|  | 18 |  |  |  |  | 41 |  | 
| 455 | 16 |  |  |  |  | 48 | elsif ($$bufr =~ m/\G\,/p)			{ $type=${^MATCH}; } | 
| 456 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\s+/p)			{ $type='__SKIP__'; } | 
| 457 | 2 |  |  |  |  | 8 | else						{ $type='__SKIP__'; $lex->yypopq(); } | 
|  | 2 |  |  |  |  | 8 |  | 
| 458 |  |  |  |  |  |  |  | 
| 459 | 58 |  |  |  |  | 124 | $match = ${^MATCH}; | 
| 460 |  |  |  |  |  |  | } | 
| 461 |  |  |  |  |  |  | ##------------------------ | 
| 462 |  |  |  |  |  |  | elsif ($lex->{state} eq 'Q_COMMENT') { | 
| 463 | 38 | 100 |  |  |  | 450 | if    ($$bufr =~ m/\G\]/p)		    { | 
|  |  | 50 |  |  |  |  |  | 
| 464 | 19 |  |  |  |  | 41 | $type='__SKIP__'; | 
| 465 | 19 |  |  |  |  | 45 | $lex->{_cmtbuf} .= ${^MATCH}; | 
| 466 | 19 |  |  |  |  | 41 | push(@{$lex->{comments}}, $lex->{_cmtbuf}); | 
|  | 19 |  |  |  |  | 50 |  | 
| 467 | 19 |  |  |  |  | 43 | delete $lex->{_cmtbuf}; | 
| 468 | 19 |  |  |  |  | 50 | $lex->yypopq(); | 
| 469 |  |  |  |  |  |  | } | 
| 470 | 19 |  |  |  |  | 46 | elsif ($$bufr =~ m/\G$DEF{comment_text}/sp) { $type='__SKIP__'; $lex->{_cmtbuf} .= ${^MATCH}; } | 
|  | 19 |  |  |  |  | 54 |  | 
| 471 | 0 |  |  |  |  | 0 | else                                        { $type='__ERROR__'; } | 
| 472 |  |  |  |  |  |  |  | 
| 473 | 38 |  |  |  |  | 91 | $match = ${^MATCH}; | 
| 474 |  |  |  |  |  |  | } | 
| 475 |  |  |  |  |  |  | ##------------------------ | 
| 476 |  |  |  |  |  |  | elsif ($lex->{state} eq 'Q_DATE') { | 
| 477 | 2 | 50 |  |  |  | 21 | if    ($$bufr =~ m/^\s+/) { $type = '__SKIP__'; } | 
|  | 0 | 50 |  |  |  | 0 |  | 
| 478 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/^\//)  { $type = '/'; $lex->{state}='INITIAL'; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 479 | 2 |  |  |  |  | 5 | else                      { $lex->{state}='INITIAL'; $type='__SKIP__'; } | 
|  | 2 |  |  |  |  | 6 |  | 
| 480 |  |  |  |  |  |  |  | 
| 481 | 2 |  |  |  |  | 5 | $match = ${^MATCH}; | 
| 482 |  |  |  |  |  |  | } | 
| 483 |  |  |  |  |  |  | ##------------------------ | 
| 484 |  |  |  |  |  |  | elsif ($lex->{state} eq 'Q_REGREP') { | 
| 485 | 2 | 50 |  |  |  | 149 | if    ($$bufr =~ m/\G($DEF{regex_text})\/(?=$DEF{regex_modifier})/po)	{ $type='REGEX_REPLACE'; $text=$1; $lex->{state}='Q_REGOPT'; } | 
|  | 0 | 50 |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 486 | 2 |  |  |  |  | 7 | elsif ($$bufr =~ m/\G($DEF{regex_text})\//po)				{ $type='REGEX_REPLACE'; $text=$1; $lex->{state}='INITIAL';  } | 
|  | 2 |  |  |  |  | 7 |  | 
|  | 2 |  |  |  |  | 6 |  | 
| 487 |  |  |  |  |  |  |  | 
| 488 | 2 |  |  |  |  | 8 | $match = ${^MATCH}; | 
| 489 |  |  |  |  |  |  | } | 
| 490 |  |  |  |  |  |  | ##------------------------ | 
| 491 |  |  |  |  |  |  | elsif ($lex->{state} eq 'Q_REGOPT') { | 
| 492 | 0 | 0 |  |  |  | 0 | if ($$bufr =~ m/\G$DEF{regex_modifier}+/po)	{ $type='REGOPT'; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 493 | 0 |  |  |  |  | 0 | else						{ $type='__SKIP__'; } | 
| 494 | 0 |  |  |  |  | 0 | $lex->{state} = 'INITIAL'; | 
| 495 |  |  |  |  |  |  |  | 
| 496 | 0 |  |  |  |  | 0 | $match = ${^MATCH}; | 
| 497 |  |  |  |  |  |  | } | 
| 498 |  |  |  |  |  |  | ##------------------------ | 
| 499 |  |  |  |  |  |  | elsif ($lex->{state} eq 'Q_XPIPE') { | 
| 500 | 4 | 50 |  |  |  | 218 | if ($$bufr =~ m/\G\s+/p)		 { $type = '__SKIP__'; }	##-- whitespace: skip | 
|  | 0 | 50 |  |  |  | 0 |  | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 501 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\-/p)		 { $lex->{state}='INITIAL'; $type='EXPANDER'; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 502 | 0 |  |  |  |  | 0 | elsif ($$bufr =~ m/\G\'($DEF{sq_text})\'/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=$1; } | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 503 | 4 |  |  |  |  | 12 | elsif ($$bufr =~ m/\G$DEF{symbol_text}/po) { $lex->{state}='INITIAL'; $type='EXPANDER'; } | 
|  | 4 |  |  |  |  | 9 |  | 
| 504 |  |  |  |  |  |  | #elsif ($$bufr =~ m/\G\z/p)		 { $lex->{state}='INITIAL'; $type='EXPANDER'; } | 
| 505 | 0 |  |  |  |  | 0 | else					 { $lex->{state}='INITIAL'; $type='EXPANDER'; $text=''; } | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 506 |  |  |  |  |  |  |  | 
| 507 | 4 |  |  |  |  | 11 | $match = ${^MATCH}; | 
| 508 |  |  |  |  |  |  | } | 
| 509 |  |  |  |  |  |  | ##------------------------ | 
| 510 |  |  |  |  |  |  | ## END perl-ification of flex sources | 
| 511 |  |  |  |  |  |  |  | 
| 512 |  |  |  |  |  |  | ##-- guts | 
| 513 | 1241 | 100 |  |  |  | 2582 | $text = $match if (!defined($text)); | 
| 514 | 1241 | 100 |  |  |  | 3196 | $lex->{bufp} += length($match) if (defined($match)); | 
| 515 |  |  |  |  |  |  |  | 
| 516 | 1241 |  |  |  |  | 2852 | pos($$bufr) = $lex->{bufp}; | 
| 517 | 1241 | 50 |  |  |  | 2888 | return if (!defined($type)); | 
| 518 |  |  |  |  |  |  |  | 
| 519 | 1241 | 100 |  |  |  | 2573 | next LEXSKIP if ($type eq '__SKIP__'); | 
| 520 | 878 | 100 |  |  |  | 1667 | next LEXBUF  if ($type eq '__EOF__'); | 
| 521 |  |  |  |  |  |  | #elsif ($type eq '__ERROR__') { | 
| 522 |  |  |  |  |  |  | #  return $lex->yyerror(); | 
| 523 |  |  |  |  |  |  | #} | 
| 524 | 860 |  |  |  |  | 4932 | return @$lex{qw(yytype yytext)} = ($type,$text); | 
| 525 |  |  |  |  |  |  | } | 
| 526 |  |  |  |  |  |  | } | 
| 527 | 197 |  |  |  |  | 790 | return @$lex{qw(yytype yytext)} = ('__EOF__',undef); | 
| 528 |  |  |  |  |  |  | } | 
| 529 |  |  |  |  |  |  |  | 
| 530 |  |  |  |  |  |  | ##====================================================================== | 
| 531 |  |  |  |  |  |  | ## Testing: dummy lexing | 
| 532 |  |  |  |  |  |  |  | 
| 533 |  |  |  |  |  |  | ## undef = $lex->dummylex(@from_whatever) | 
| 534 |  |  |  |  |  |  | sub dummylex { | 
| 535 | 0 |  |  | 0 | 0 |  | my $lex = shift; | 
| 536 | 0 |  |  |  |  |  | $lex->reset(); | 
| 537 | 0 |  |  |  |  |  | $lex->from(@_); | 
| 538 | 0 |  |  |  |  |  | my ($type,$text); | 
| 539 |  |  |  |  |  |  | TOKEN: | 
| 540 | 0 |  |  |  |  |  | while(1) { | 
| 541 | 0 |  |  |  |  |  | ($type,$text) = $lex->yylex(); | 
| 542 |  |  |  |  |  |  | print("-" x 64, "\n", | 
| 543 |  |  |  |  |  |  | ">>  Line: ",  $lex->yylineno, ", Pos: ", $lex->yypos, "\n", | 
| 544 | 0 | 0 |  |  |  |  | ">>  State: ",  (defined($lex->{state}) ? $lex->{state} : '(undef)'), "\n", | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 545 |  |  |  |  |  |  | ">>  Type: ",  (defined($type) ? $type : '(undef)'), "\n", | 
| 546 |  |  |  |  |  |  | ">>  Text: ", (defined($text) ? $text : '(undef)'), "\n", | 
| 547 |  |  |  |  |  |  | ); | 
| 548 | 0 | 0 |  |  |  |  | if (!defined($type)) { | 
| 549 | 0 |  |  |  |  |  | warn(":: undef type!"); | 
| 550 | 0 |  |  |  |  |  | return; | 
| 551 |  |  |  |  |  |  | } | 
| 552 | 0 | 0 |  |  |  |  | if ($type eq '__ERROR__') { | 
| 553 | 0 |  |  |  |  |  | print(":: ERROR DETECTED\n"); | 
| 554 | 0 |  |  |  |  |  | $lex->yyerror(); | 
| 555 |  |  |  |  |  |  | } | 
| 556 | 0 | 0 |  |  |  |  | last if ($type eq '__EOF__'); | 
| 557 |  |  |  |  |  |  | } | 
| 558 |  |  |  |  |  |  | } | 
| 559 |  |  |  |  |  |  |  | 
| 560 |  |  |  |  |  |  |  | 
| 561 |  |  |  |  |  |  | 1; ##-- be happy | 
| 562 |  |  |  |  |  |  |  | 
| 563 |  |  |  |  |  |  | __END__ |