File Coverage

blib/lib/Parser/MGC.pm
Criterion Covered Total %
statement 321 324 99.0
branch 103 116 88.7
condition 29 37 78.3
subroutine 61 63 96.8
pod 35 37 94.5
total 549 577 95.1


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2010-2024 -- leonerd@leonerd.org.uk
5              
6             package Parser::MGC 0.23;
7              
8 33     33   6500385 use v5.14;
  33         165  
9 33     33   237 use warnings;
  33         252  
  33         2025  
10              
11 33     33   205 use Carp;
  33         63  
  33         2991  
12 33     33   20496 use Feature::Compat::Try;
  33         18153  
  33         178  
13              
14 33     33   6936 use Scalar::Util qw( blessed );
  33         81  
  33         4942  
15              
16             =head1 NAME
17              
18             C - build simple recursive-descent parsers
19              
20             =head1 SYNOPSIS
21              
22             =for highlighter language=perl
23              
24             package My::Grammar::Parser;
25             use base qw( Parser::MGC );
26              
27             sub parse
28             {
29             my $self = shift;
30              
31             $self->sequence_of( sub {
32             $self->any_of(
33             sub { $self->token_int },
34             sub { $self->token_string },
35             sub { \$self->token_ident },
36             sub { $self->scope_of( "(", \&parse, ")" ) }
37             );
38             } );
39             }
40              
41             my $parser = My::Grammar::Parser->new;
42              
43             my $tree = $parser->from_file( $ARGV[0] );
44              
45             ...
46              
47             =head1 DESCRIPTION
48              
49             This base class provides a low-level framework for building recursive-descent
50             parsers that consume a given input string from left to right, returning a
51             parse structure. It takes its name from the C regexps used to implement
52             the token parsing behaviour.
53              
54             It provides a number of token-parsing methods, which each extract a
55             grammatical token from the string. It also provides wrapping methods that can
56             be used to build up a possibly-recursive grammar structure, by applying a
57             structure around other parts of parsing code.
58              
59             =head2 Backtracking
60              
61             Each method, both token and structural, atomically either consumes a prefix of
62             the string and returns its result, or fails and consumes nothing. This makes
63             it simple to implement grammars that require backtracking.
64              
65             Several structure-forming methods have some form of "optional" behaviour; they
66             can optionally consume some amount of input or take some particular choice,
67             but if the code invoked inside that subsequently fails, the structure can
68             backtrack and take some different behaviour. This is usually what is required
69             when testing whether the structure of the input string matches some part of
70             the grammar that is optional, or has multiple choices.
71              
72             However, once the choice of grammar has been made, it is often useful to be
73             able to fix on that one choice, thus making subsequent failures propagate up
74             rather than taking that alternative behaviour. Control of this backtracking
75             is given by the C method; and careful use of this method is one of the
76             key advantages that C has over more simple parsing using single
77             regexps alone.
78              
79             =head2 Stall Detection
80              
81             Most of the methods in this class have bounded execution time, but some
82             methods (L and L) repeatedly recuse into other code
83             to build up a list of results until some ending condition is reached. A
84             possible class of bug is that whatever they recurse into might successfully
85             match an empty string, and thus make no progress.
86              
87             These methods will automatically detect this situation if they repeatedly
88             encounter the same string position more than a certain number of times (given
89             by the C argument). If this count is reached, the entire parse
90             attempt will be aborted by the L method.
91              
92             =cut
93              
94             =head1 CONSTRUCTOR
95              
96             =cut
97              
98             =head2 new
99              
100             $parser = Parser::MGC->new( %args );
101              
102             Returns a new instance of a C object. This must be called on a
103             subclass that provides method of the name provided as C, by default
104             called C.
105              
106             Takes the following named arguments
107              
108             =over 8
109              
110             =item toplevel => STRING
111              
112             Name of the toplevel method to use to start the parse from. If not supplied,
113             will try to use a method called C.
114              
115             =item patterns => HASH
116              
117             Keys in this hash should map to quoted regexp (C) references, to
118             override the default patterns used to match tokens. See C below
119              
120             =item accept_0o_oct => BOOL
121              
122             If true, the C method will also accept integers with a C<0o> prefix
123             as octal.
124              
125             =item stallcount => INT
126              
127             I
128              
129             The number of times that the stall-detector would have to see the same
130             position before it aborts the parse attempt. If not supplied, a default of
131             C<10> will apply.
132              
133             =back
134              
135             =cut
136              
137             =head1 PATTERNS
138              
139             The following pattern names are recognised. They may be passed to the
140             constructor in the C hash, or provided as a class method under the
141             name C>.
142              
143             =over 4
144              
145             =item * ws
146              
147             Pattern used to skip whitespace between tokens. Defaults to C
148              
149             =item * comment
150              
151             Pattern used to skip comments between tokens. Undefined by default.
152              
153             =item * int
154              
155             Pattern used to parse an integer by C. Defaults to
156             C. If C is given, then
157             this will be expanded to match C as well.
158              
159             =item * float
160              
161             Pattern used to parse a floating-point number by C. Defaults to
162             C.
163              
164             =item * ident
165              
166             Pattern used to parse an identifier by C. Defaults to
167             C
168              
169             =item * string_delim
170              
171             Pattern used to delimit a string by C. Defaults to C.
172              
173             =back
174              
175             =cut
176              
177             my @patterns = qw(
178             ws
179             comment
180             int
181             float
182             ident
183             string_delim
184             );
185              
186 33     33   230 use constant pattern_ws => qr/[\s\n\t]+/;
  33         66  
  33         2615  
187 33     33   192 use constant pattern_comment => undef;
  33         142  
  33         5840  
188 33     33   235 use constant pattern_int => qr/-?(?:0x[[:xdigit:]]+|[[:digit:]]+)/;
  33         110  
  33         5189  
189 33     33   230 use constant pattern_float => qr/-?(?:\d*\.\d+|\d+\.)(?:e-?\d+)?|-?\d+e-?\d+/i;
  33         72  
  33         6021  
190 33     33   230 use constant pattern_ident => qr/[[:alpha:]_]\w*/;
  33         65  
  33         2896  
191 33     33   186 use constant pattern_string_delim => qr/["']/;
  33         115  
  33         1734  
192              
193 33     33   175 use constant DEFAULT_STALLCOUNT => 10;
  33         63  
  33         232702  
194              
195             sub new
196             {
197 48     48 1 5619698 my $class = shift;
198 48         184 my %args = @_;
199              
200 48   100     334 my $toplevel = $args{toplevel} || "parse";
201              
202 48 50       524 $class->can( $toplevel ) or
203             croak "Expected to be a subclass that can ->$toplevel";
204              
205             my $self = bless {
206             toplevel => $toplevel,
207             patterns => {},
208             scope_level => 0,
209 48   50     504 stallcount => $args{stallcount} // DEFAULT_STALLCOUNT,
210             }, $class;
211              
212 48   100     315 $self->{patterns}{$_} = $args{patterns}{$_} || $self->${\"pattern_$_"} for @patterns;
213              
214 48 100       200 if( $args{accept_0o_oct} ) {
215 1         41 $self->{patterns}{int} = qr/0o[0-7]+|$self->{patterns}{int}/;
216             }
217              
218 48 100       182 if( defined $self->{patterns}{comment} ) {
219 1         82 $self->{patterns}{_skip} = qr/$self->{patterns}{ws}|$self->{patterns}{comment}/;
220             }
221             else {
222 47         164 $self->{patterns}{_skip} = $self->{patterns}{ws};
223             }
224              
225 48         283 return $self;
226             }
227              
228             =head1 SUBCLASSING METHODS
229              
230             The following optional methods may be defined by subclasses, to customise
231             their parsing.
232              
233             =head2 on_parse_start
234              
235             $parser->on_parse_start;
236              
237             I
238              
239             If defined, is invoked by the C method that begins a new parse
240             operation, just before invoking the toplevel structure method.
241              
242             =head2 on_parse_end
243              
244             $result = $parser->on_parse_end( $result );
245              
246             I
247              
248             If defined, is invoked by the C method once it has finished the
249             toplevel structure method. This is passed the tentative result from the
250             structure method, and whatever it returns becomes the result of the C
251             method itself.
252              
253             =cut
254              
255             =head1 METHODS
256              
257             =cut
258              
259             =head2 from_string
260              
261             $result = $parser->from_string( $str );
262              
263             Parse the given literal string and return the result from the toplevel method.
264              
265             =cut
266              
267             sub from_string
268             {
269 169     169 1 124287 my $self = shift;
270 169         418 my ( $str ) = @_;
271              
272 169         456 $self->{str} = $str;
273              
274 169         598 pos $self->{str} = 0;
275              
276 169 100       1046 if( my $code = $self->can( "on_parse_start" ) ) {
277 2         9 $self->$code;
278             }
279              
280 169         382 my $toplevel = $self->{toplevel};
281 169         683 my $result = $self->$toplevel;
282              
283 149 100       686 $self->at_eos or
284             $self->fail( "Expected end of input" );
285              
286 144 100       1021 if( my $code = $self->can( "on_parse_end" ) ) {
287 2         11 $result = $self->$code( $result );
288             }
289              
290 144         1223 return $result;
291             }
292              
293             =head2 from_file
294              
295             $result = $parser->from_file( $file, %opts );
296              
297             Parse the given file, which may be a pathname in a string, or an opened IO
298             handle, and return the result from the toplevel method.
299              
300             The following options are recognised:
301              
302             =over 8
303              
304             =item binmode => STRING
305              
306             If set, applies the given binmode to the filehandle before reading. Typically
307             this can be used to set the encoding of the file.
308              
309             $parser->from_file( $file, binmode => ":encoding(UTF-8)" );
310              
311             =back
312              
313             =cut
314              
315             sub from_file
316             {
317 3     3 1 5276 my $self = shift;
318 3         10 my ( $file, %opts ) = @_;
319              
320 3 50       12 defined $file or croak "Expected a filename to ->from_file";
321              
322 3         12 $self->{filename} = $file;
323              
324 3         5 my $fh;
325 3 100       11 if( ref $file ) {
326 2         4 $fh = $file;
327             }
328             else {
329 1 50       34 open $fh, "<", $file or die "Cannot open $file for reading - $!";
330             }
331              
332 3 50       10 binmode $fh, $opts{binmode} if $opts{binmode};
333              
334 3         7 $self->from_string( do { local $/; <$fh>; } );
  3         13  
  3         125  
335             }
336              
337             =head2 filename
338              
339             $filename = $parser->filename;
340              
341             I
342              
343             Returns the name of the file currently being parsed, if invoked from within
344             L.
345              
346             =cut
347              
348             sub filename
349             {
350 70     70 1 127 my $self = shift;
351 70         153 return $self->{filename};
352             }
353              
354             =head2 from_reader
355              
356             $result = $parser->from_reader( \&reader );
357              
358             I
359              
360             Parse the input which is read by the C function. This function will be
361             called in scalar context to generate portions of string to parse, being passed
362             the C<$parser> object. The function should return C when it has no more
363             string to return.
364              
365             $reader->( $parser );
366              
367             Note that because it is not generally possible to detect exactly when more
368             input may be required due to failed regexp parsing, the reader function is
369             only invoked during searching for skippable whitespace. This makes it suitable
370             for reading lines of a file in the common case where lines are considered as
371             skippable whitespace, or for reading lines of input interactively from a
372             user. It cannot be used in all cases (for example, reading fixed-size buffers
373             from a file) because two successive invocations may split a single token
374             across the buffer boundaries, and cause parse failures.
375              
376             =cut
377              
378             sub from_reader
379             {
380 1     1 1 18 my $self = shift;
381 1         4 my ( $reader ) = @_;
382              
383 1         4 local $self->{reader} = $reader;
384              
385 1         3 $self->{str} = "";
386 1         6 pos $self->{str} = 0;
387              
388 1         5 my $result = $self->parse;
389              
390 1 50       5 $self->at_eos or
391             $self->fail( "Expected end of input" );
392              
393 1         16 return $result;
394             }
395              
396             =head2 pos
397              
398             $pos = $parser->pos;
399              
400             I
401              
402             Returns the current parse position, as a character offset from the beginning
403             of the file or string.
404              
405             =cut
406              
407             sub pos
408             {
409 299     299 1 432 my $self = shift;
410 299         864 return pos $self->{str};
411             }
412              
413             =head2 take
414              
415             $str = $parser->take( $len );
416              
417             I
418              
419             Returns the next C<$len> characters directly from the input, prior to any
420             whitespace or comment skipping. This does I take account of any
421             end-of-scope marker that may be pending. It is intended for use by parsers of
422             partially-binary protocols, or other situations in which it would be incorrect
423             for the end-of-scope marker to take effect at this time.
424              
425             =cut
426              
427             sub take
428             {
429 57     57 1 124 my $self = shift;
430 57         107 my ( $len ) = @_;
431              
432 57         168 my $start = pos( $self->{str} );
433              
434 57         183 pos( $self->{str} ) += $len;
435              
436 57         443 return substr( $self->{str}, $start, $len );
437             }
438              
439             =head2 where
440              
441             ( $lineno, $col, $text ) = $parser->where;
442              
443             Returns the current parse position, as a line and column number, and
444             the entire current line of text. The first line is numbered 1, and the first
445             column is numbered 0.
446              
447             =cut
448              
449             sub where
450             {
451 83     83 1 145 my $self = shift;
452 83         184 my ( $pos ) = @_;
453              
454 83 100       294 defined $pos or $pos = pos $self->{str};
455              
456 83         170 my $str = $self->{str};
457              
458 83         165 my $sol = $pos;
459 83 100 100     393 $sol-- if $sol > 0 and substr( $str, $sol, 1 ) =~ m/^[\r\n]$/;
460 83   100     625 $sol-- while $sol > 0 and substr( $str, $sol-1, 1 ) !~ m/^[\r\n]$/;
461              
462 83         139 my $eol = $pos;
463 83   100     1053 $eol++ while $eol < length($str) and substr( $str, $eol, 1 ) !~ m/^[\r\n]$/;
464              
465 83         256 my $line = substr( $str, $sol, $eol - $sol );
466              
467 83         140 my $col = $pos - $sol;
468 83         304 my $lineno = ( () = substr( $str, 0, $pos ) =~ m/\n/g ) + 1;
469              
470 83         321 return ( $lineno, $col, $line );
471             }
472              
473             =head2 fail
474              
475             =head2 fail_from
476              
477             $parser->fail( $message );
478              
479             $parser->fail_from( $pos, $message );
480              
481             I since version 0.09.>
482              
483             Aborts the current parse attempt with the given message string. The failure
484             message will include the line and column position, and the line of input that
485             failed at the current parse position (C), or a position earlier obtained
486             using the C method (C).
487              
488             This failure will propagate up to the inner-most structure parsing method that
489             has not been committed; or will cause the entire parser to fail if there are
490             no further options to take.
491              
492             =cut
493              
494             sub fail
495             {
496 283     283 1 444 my $self = shift;
497 283         538 my ( $message ) = @_;
498 283         769 $self->fail_from( $self->pos, $message );
499             }
500              
501             sub fail_from
502             {
503 283     283 1 379 my $self = shift;
504 283         524 my ( $pos, $message ) = @_;
505 283         998 die Parser::MGC::Failure->new( $message, $self, $pos );
506             }
507              
508             # On perl 5.32 onwards we can use the nicer `isa` infix operator
509             # Problem is it won't even parse correctly on older perls so we'll have to go
510             # the long way around
511             *_isa_failure = ( $^V ge v5.32 )
512 33     33   21536 ? do { eval 'use experimental "isa"; sub { $_[0] isa Parser::MGC::Failure }' // die $@ }
  33     264   147294  
  33         269  
  264         3040  
513             : do { require Scalar::Util;
514             sub { Scalar::Util::blessed($_[0]) and $_[0]->isa( "Parser::MGC::Failure" ) } };
515              
516             =head2 die
517              
518             =head2 die_from
519              
520             $parser->die( $message );
521              
522             $parser->die_from( $pos, $message );
523              
524             I
525              
526             Throws an exception that propagates as normal for C, entirely out of the
527             entire parser and to the caller of the toplevel C method that invoked
528             it, bypassing all of the back-tracking logic.
529              
530             This is much like using core's C directly, except that the message string
531             will include the line and column position, and the line of input that the
532             parser was working on, as it does in the L method.
533              
534             This method is intended for reporting fatal errors where the parsed input was
535             correctly recognised at a grammar level, but is requesting something that
536             cannot be fulfilled semantically.
537              
538             =cut
539              
540             sub die :method
541             {
542 3     3 1 17 my $self = shift;
543 3         9 my ( $message ) = @_;
544 3         12 $self->die_from( $self->pos, $message );
545             }
546              
547             sub die_from
548             {
549 3     3 1 4 my $self = shift;
550 3         8 my ( $pos, $message ) = @_;
551             # Convenient just to use the ->STRING method of a Failure object but don't
552             # throw it directly
553 3         22 die Parser::MGC::Failure->new( $message, $self, $pos )->STRING;
554             }
555              
556             =head2 warn
557              
558             =head2 warn_from
559              
560             $parser->warn( $message );
561              
562             $parser->warn_from( $pos, $message );
563              
564             I
565              
566             Issues a warning as normal for the C core Perl function, appending
567             information to the message string giving the current line and column position
568             and the line of input the parser was working on, as it does in the L
569             method.
570              
571             =cut
572              
573             sub warn :method
574             {
575 1     1 1 3 my $self = shift;
576 1         3 my ( $message ) = @_;
577 1         4 $self->warn_from( $self->pos, $message );
578             }
579              
580             sub warn_from
581             {
582 1     1 1 2 my $self = shift;
583 1         3 my ( $pos, $message ) = @_;
584             # Convenient just to use the ->STRING method of a Failure object but don't
585             # throw it directly
586 1         5 warn Parser::MGC::Failure->new( $message, $self, $pos )->STRING;
587             }
588              
589             =head2 at_eos
590              
591             $eos = $parser->at_eos;
592              
593             Returns true if the input string is at the end of the string.
594              
595             =cut
596              
597             sub at_eos
598             {
599 642     642 1 1027 my $self = shift;
600              
601             # Save pos() before skipping ws so we don't break the substring_before method
602 642         1767 my $pos = pos $self->{str};
603              
604 642         1717 $self->skip_ws;
605              
606 642         927 my $at_eos;
607 642 100       1910 if( pos( $self->{str} ) >= length $self->{str} ) {
    100          
608 178         332 $at_eos = 1;
609             }
610             elsif( defined $self->{endofscope} ) {
611 114         633 $at_eos = $self->{str} =~ m/\G$self->{endofscope}/;
612             }
613             else {
614 350         567 $at_eos = 0;
615             }
616              
617 642         1746 pos( $self->{str} ) = $pos;
618              
619 642         1995 return $at_eos;
620             }
621              
622             =head2 scope_level
623              
624             $level = $parser->scope_level;
625              
626             I
627              
628             Returns the number of nested C calls that have been made.
629              
630             =cut
631              
632             sub scope_level
633             {
634 5     5 1 8 my $self = shift;
635 5         117 return $self->{scope_level};
636             }
637              
638             =head2 include_string
639              
640             $result = $parser->include_string( $str, %opts );
641              
642             I
643              
644             Parses a given string into the existing parser object.
645              
646             The current parser state is moved aside from the duration of this method, and
647             is replaced by the given string. Then the toplevel parser method (or a
648             different as specified) is invoked over it. Its result is returned by this
649             method.
650              
651             This would typically be used to handle some sort of "include" or "macro
652             expansion" ability, by injecting new content in as if the current parse
653             location had encountered it. Other than the internal parser state, other
654             object fields are not altered, so whatever effects the invoked parsing methods
655             will have on it can continue to inspect and alter it as required.
656              
657             The following options are recognised:
658              
659             =over 8
660              
661             =item filename => STRING
662              
663             If set, provides a filename (or other descriptive text) to pretend for the
664             source of this string. It need not be a real file on the filesystem; it could
665             for example explain the source of the string in some other way. It is the
666             value reported by the L method and printed in failure messages.
667              
668             =item toplevel => STRING | CODE
669              
670             If set, provides the toplevel parser method to use within this inclusion,
671             overriding the object's defined default.
672              
673             =back
674              
675             =cut
676              
677             sub include_string
678             {
679 5     5 1 35 my $self = shift;
680 5         13 my ( $str, %opts ) = @_;
681              
682             # local'ize everything out of the way
683 5         8 local @{$self}{qw( str filename reader )};
  5         22  
684              
685 5         10 $self->{str} = $str;
686 5         17 pos($self->{str}) = 0;
687              
688 5         12 $self->{filename} = $opts{filename};
689              
690 5   33     52 my $toplevel = $opts{toplevel} // $self->{toplevel};
691 5         22 my $result = $self->$toplevel;
692              
693 5         63 return $result;
694             }
695              
696             =head1 STRUCTURE-FORMING METHODS
697              
698             The following methods may be used to build a grammatical structure out of the
699             defined basic token-parsing methods. Each takes at least one code reference,
700             which will be passed the actual C<$parser> object as its first argument.
701              
702             Anywhere that a code reference is expected also permits a plain string giving
703             the name of a method to invoke. This is sufficient in many simple cases, such
704             as
705              
706             $self->any_of(
707             'token_int',
708             'token_string',
709             ...
710             );
711              
712             =cut
713              
714             =head2 maybe
715              
716             $ret = $parser->maybe( $code );
717              
718             Attempts to execute the given C<$code> in scalar context, and returns what it
719             returned, accepting that it might fail. C<$code> may either be a CODE
720             reference or a method name given as a string.
721              
722             $ret = $parser->maybe( $code, @args );
723              
724             I this method passes any additional arguments into the
725             invoked code. This is especially useful if the C<$code> is a method name.
726              
727             If the code fails (either by calling C itself, or by propagating a
728             failure from another method it invoked) before it has invoked C, then
729             none of the input string will be consumed; the current parsing position will
730             be restored. C will be returned in this case.
731              
732             If it calls C then any subsequent failure will be propagated to the
733             caller, rather than returning C.
734              
735             This may be considered to be similar to the C regexp qualifier.
736              
737             sub parse_declaration
738             {
739             my $self = shift;
740              
741             [ $self->parse_type,
742             $self->token_ident,
743             $self->maybe( sub {
744             $self->expect( "=" );
745             $self->parse_expression
746             } ),
747             ];
748             }
749              
750             =cut
751              
752             sub maybe
753             {
754 4     4 1 27 my $self = shift;
755 4         6 my ( $code, @args ) = @_;
756              
757 4         36 my $pos = pos $self->{str};
758              
759 4         6 my $committed = 0;
760 4     0   17 local $self->{committer} = sub { $committed++ };
  0         0  
761              
762 4         10 try {
763 4         10 return $self->$code( @args );
764             }
765             catch ( $e ) {
766 2         8 pos($self->{str}) = $pos;
767              
768 2 100 66     68 die $e if $committed or not _isa_failure( $e );
769 1         11 return undef;
770             }
771             }
772              
773             =head2 scope_of
774              
775             $ret = $parser->scope_of( $start, $code, $stop );
776              
777             Expects to find the C<$start> pattern, then attempts to execute the given
778             C<$code>, then expects to find the C<$stop> pattern. Returns whatever the
779             code returned. C<$code> may either be a CODE reference of a method name given
780             as a string.
781              
782             While the code is being executed, the C<$stop> pattern will be used by the
783             token parsing methods as an end-of-scope marker; causing them to raise a
784             failure if called at the end of a scope.
785              
786             sub parse_block
787             {
788             my $self = shift;
789              
790             $self->scope_of( "{", 'parse_statements', "}" );
791             }
792              
793             If the C<$start> pattern is undefined, it is presumed the caller has already
794             checked for this. This is useful when the stop pattern needs to be calculated
795             based on the start pattern.
796              
797             sub parse_bracketed
798             {
799             my $self = shift;
800              
801             my $delim = $self->expect( qr/[\(\[\<\{]/ );
802             $delim =~ tr/([<{/)]>}/;
803              
804             $self->scope_of( undef, 'parse_body', $delim );
805             }
806              
807             This method does not have any optional parts to it; any failures are
808             immediately propagated to the caller.
809              
810             =cut
811              
812             sub scope_of
813             {
814 19     19 1 97 my $self = shift;
815 19         81 $self->_scope_of( 0, @_ );
816             }
817              
818             sub _scope_of
819             {
820 73     73   103 my $self = shift;
821 73         201 my ( $commit_if_started, $start, $code, $stop ) = @_;
822              
823 73 50       701 ref $stop or $stop = qr/\Q$stop/;
824              
825 73 100       308 $self->expect( $start ) if defined $start;
826              
827 34 100       123 $self->commit if $commit_if_started;
828              
829 34         88 local $self->{endofscope} = $stop;
830 34         93 local $self->{scope_level} = $self->{scope_level} + 1;
831              
832 34         145 my $ret = $self->$code;
833              
834 31         121 $self->expect( $stop );
835              
836 30         242 return $ret;
837             }
838              
839             =head2 committed_scope_of
840              
841             $ret = $parser->committed_scope_of( $start, $code, $stop );
842              
843             I
844              
845             A variant of L that calls L after a successful match of
846             the start pattern. This is usually what you want if using C from
847             within an C choice, if no other alternative following this one could
848             possibly match if the start pattern has.
849              
850             =cut
851              
852             sub committed_scope_of
853             {
854 54     54 1 102 my $self = shift;
855 54         129 $self->_scope_of( 1, @_ );
856             }
857              
858             =head2 list_of
859              
860             $ret = $parser->list_of( $sep, $code );
861              
862             Expects to find a list of instances of something parsed by C<$code>,
863             separated by the C<$sep> pattern. Returns an ARRAY ref containing a list of
864             the return values from the C<$code>. A single trailing delimiter is allowed,
865             and does not affect the return value. C<$code> may either be a CODE reference
866             or a method name given as a string. It is called in list context, and whatever
867             values it returns are appended to the eventual result - similar to perl's
868             C.
869              
870             This method does not consider it an error if the returned list is empty; that
871             is, that the scope ended before any item instances were parsed from it.
872              
873             sub parse_numbers
874             {
875             my $self = shift;
876              
877             $self->list_of( ",", 'token_int' );
878             }
879              
880             If the code fails (either by invoking C itself, or by propagating a
881             failure from another method it invoked) before it has invoked C on a
882             particular item, then the item is aborted and the parsing position will be
883             restored to the beginning of that failed item. The list of results from
884             previous successful attempts will be returned.
885              
886             If it calls C within an item then any subsequent failure for that item
887             will cause the entire C to fail, propagating that to the caller.
888              
889             =cut
890              
891             sub list_of
892             {
893 83     83 1 179 my $self = shift;
894 83         2665 my ( $sep, $code ) = @_;
895              
896 83 100 33     422 ref $sep or $sep = qr/\Q$sep/ if defined $sep;
897              
898 83         131 my $committed;
899 83     14   374 local $self->{committer} = sub { $committed++ };
  14         26  
900              
901 83         187 my @ret;
902              
903             my @lastpos;
904              
905 83         239 while( !$self->at_eos ) {
906 157         230 $committed = 0;
907 157         1183 my $pos = pos $self->{str};
908              
909 157         309 push @lastpos, $pos;
910 157 100       430 if( @lastpos > $self->{stallcount} ) {
911 1         2 shift @lastpos;
912 1 50       16 $self->die( ref($self) . " failed to make progress" ) if $lastpos[0] == $pos;
913             }
914              
915 156         324 try {
916 156         493 push @ret, $self->$code;
917 132         680 next;
918             }
919             catch ( $e ) {
920 24         73 pos($self->{str}) = $pos;
921 24 100 66     800 die $e if $committed or not _isa_failure( $e );
922              
923 23         65 last;
924             }
925             }
926             continue {
927 132 100       467 if( defined $sep ) {
928 32         69 $self->skip_ws;
929 32 100       206 $self->{str} =~ m/\G$sep/gc or last;
930             }
931             }
932              
933 81         639 return \@ret;
934             }
935              
936             =head2 sequence_of
937              
938             $ret = $parser->sequence_of( $code );
939              
940             A shortcut for calling C with an empty string as separator; expects
941             to find at least one instance of something parsed by C<$code>, separated only
942             by skipped whitespace.
943              
944             This may be considered to be similar to the C<+> or C<*> regexp qualifiers.
945              
946             sub parse_statements
947             {
948             my $self = shift;
949              
950             $self->sequence_of( 'parse_statement' );
951             }
952              
953             The interaction of failures in the code and the C method is identical
954             to that of C.
955              
956             =cut
957              
958             sub sequence_of
959             {
960 67     67 1 369 my $self = shift;
961 67         153 my ( $code ) = @_;
962              
963 67         249 $self->list_of( undef, $code );
964             }
965              
966             =head2 any_of
967              
968             $ret = $parser->any_of( @codes );
969              
970             I
971              
972             Expects that one of the given code instances can parse something from the
973             input, returning what it returned. Each code instance may indicate a failure
974             to parse by calling the C method or otherwise propagating a failure.
975             Each code instance may either be a CODE reference or a method name given as a
976             string.
977              
978             This may be considered to be similar to the C<|> regexp operator for forming
979             alternations of possible parse trees.
980              
981             sub parse_statement
982             {
983             my $self = shift;
984              
985             $self->any_of(
986             sub { $self->parse_declaration; $self->expect(";") },
987             sub { $self->parse_expression; $self->expect(";") },
988             sub { $self->parse_block },
989             );
990             }
991              
992             If the code for a given choice fails (either by invoking C itself, or by
993             propagating a failure from another method it invoked) before it has invoked
994             C itself, then the parsing position restored and the next choice will
995             be attempted.
996              
997             If it calls C then any subsequent failure for that choice will cause
998             the entire C to fail, propagating that to the caller and no further
999             choices will be attempted.
1000              
1001             If none of the choices match then a simple failure message is printed:
1002              
1003             =for highlighter
1004              
1005             Found nothing parseable
1006              
1007             =for highlighter language=perl
1008              
1009             As this is unlikely to be helpful to users, a better message can be provided
1010             by the final choice instead. Don't forget to C before printing the
1011             failure message, or it won't count.
1012              
1013             $self->any_of(
1014             'token_int',
1015             'token_string',
1016             ...,
1017              
1018             sub { $self->commit; $self->fail( "Expected an int or string" ) }
1019             );
1020              
1021             =cut
1022              
1023             sub any_of
1024             {
1025 234     234 1 740 my $self = shift;
1026              
1027 234         560 while( @_ ) {
1028 460         718 my $code = shift;
1029 460         1370 my $pos = pos $self->{str};
1030              
1031 460         601 my $committed = 0;
1032 460     55   1746 local $self->{committer} = sub { $committed++ };
  55         107  
1033              
1034 460         770 try {
1035 460         1127 return $self->$code;
1036             }
1037             catch ( $e ) {
1038 241         603 pos( $self->{str} ) = $pos;
1039              
1040 241 100 100     6875 die $e if $committed or not _isa_failure( $e );
1041             }
1042             }
1043              
1044 12         39 $self->fail( "Found nothing parseable" );
1045             }
1046              
1047             sub one_of {
1048 0     0 0 0 croak "Parser::MGC->one_of is deprecated; use ->any_of instead";
1049             }
1050              
1051             =head2 commit
1052              
1053             $parser->commit;
1054              
1055             Calling this method will cancel the backtracking behaviour of the innermost
1056             C, C, C, or C structure forming method.
1057             That is, if later code then calls C, the exception will be propagated
1058             out of C, no further list items will be attempted by C or
1059             C, and no further code blocks will be attempted by C.
1060              
1061             Typically this will be called once the grammatical structure alter has been
1062             determined, ensuring that any further failures are raised as real exceptions,
1063             rather than by attempting other alternatives.
1064              
1065             sub parse_statement
1066             {
1067             my $self = shift;
1068              
1069             $self->any_of(
1070             ...
1071             sub {
1072             $self->scope_of( "{",
1073             sub { $self->commit; $self->parse_statements; },
1074             "}" ),
1075             },
1076             );
1077             }
1078              
1079             Though in this common pattern, L may be used instead.
1080              
1081             =cut
1082              
1083             sub commit
1084             {
1085 69     69 1 137 my $self = shift;
1086 69 50       152 if( $self->{committer} ) {
1087 69         162 $self->{committer}->();
1088             }
1089             else {
1090 0         0 croak "Cannot commit except within a backtrack-able structure";
1091             }
1092             }
1093              
1094             =head1 TOKEN PARSING METHODS
1095              
1096             The following methods attempt to consume some part of the input string, to be
1097             used as part of the parsing process.
1098              
1099             =cut
1100              
1101             sub skip_ws
1102             {
1103 1362     1362 0 1948 my $self = shift;
1104              
1105 1362         2366 my $pattern = $self->{patterns}{_skip};
1106              
1107             {
1108 1362         1785 1 while $self->{str} =~ m/\G$pattern/gc;
  1365         9234  
1109              
1110 1365 100       3959 return if pos( $self->{str} ) < length $self->{str};
1111              
1112 238 100       689 return unless $self->{reader};
1113              
1114 4         12 my $more = $self->{reader}->( $self );
1115 4 100       21 if( defined $more ) {
1116 3         6 my $pos = pos( $self->{str} );
1117 3         11 $self->{str} .= $more;
1118 3         9 pos( $self->{str} ) = $pos;
1119              
1120 3         7 redo;
1121             }
1122              
1123 1         3 undef $self->{reader};
1124 1         3 return;
1125             }
1126             }
1127              
1128             =head2 expect
1129              
1130             $str = $parser->expect( $literal );
1131              
1132             $str = $parser->expect( qr/pattern/ );
1133              
1134             @groups = $parser->expect( qr/pattern/ );
1135              
1136             Expects to find a literal string or regexp pattern match, and consumes it.
1137             In scalar context, this method returns the string that was captured. In list
1138             context it returns the matching substring and the contents of any subgroups
1139             contained in the pattern.
1140              
1141             This method will raise a parse error (by calling C) if the regexp fails
1142             to match. Note that if the pattern could match an empty string (such as for
1143             example C), the pattern will always match, even if it has to match an
1144             empty string. This method will not consider a failure if the regexp matches
1145             with zero-width.
1146              
1147             =head2 maybe_expect
1148              
1149             $str = $parser->maybe_expect( ... );
1150              
1151             @groups = $parser->maybe_expect( ... );
1152              
1153             I
1154              
1155             A convenient shortcut equivalent to calling C within C, but
1156             implemented more efficiently, avoiding the exception-handling set up by
1157             C. Returns C or an empty list if the match fails.
1158              
1159             =cut
1160              
1161             sub maybe_expect
1162             {
1163 404     404 1 589 my $self = shift;
1164 404         665 my ( $expect ) = @_;
1165              
1166 404 100       860 ref $expect or $expect = qr/\Q$expect/;
1167              
1168 404         988 $self->skip_ws;
1169 404 100       5969 $self->{str} =~ m/\G$expect/gc or return;
1170              
1171 232 100       1728 return substr( $self->{str}, $-[0], $+[0]-$-[0] ) if !wantarray;
1172 40 100       126 return map { defined $-[$_] ? substr( $self->{str}, $-[$_], $+[$_]-$-[$_] ) : undef } 0 .. $#+;
  60         539  
1173             }
1174              
1175             sub expect
1176             {
1177 388     388 1 848 my $self = shift;
1178 388         1471 my ( $expect ) = @_;
1179              
1180 388 100       3439 ref $expect or $expect = qr/\Q$expect/;
1181              
1182 388 100       884 if( wantarray ) {
1183 60 100       182 my @ret = $self->maybe_expect( $expect ) or
1184             $self->fail( "Expected $expect" );
1185 38         178 return @ret;
1186             }
1187             else {
1188 328 100       795 defined( my $ret = $self->maybe_expect( $expect ) ) or
1189             $self->fail( "Expected $expect" );
1190 189         629 return $ret;
1191             }
1192             }
1193              
1194             =head2 substring_before
1195              
1196             $str = $parser->substring_before( $literal );
1197              
1198             $str = $parser->substring_before( qr/pattern/ );
1199              
1200             I
1201              
1202             Expects to possibly find a literal string or regexp pattern match. If it finds
1203             such, consume all the input text before but excluding this match, and return
1204             it. If it fails to find a match before the end of the current scope, consumes
1205             all the input text until the end of scope and return it.
1206              
1207             This method does not consume the part of input that matches, only the text
1208             before it. It is not considered a failure if the substring before this match
1209             is empty. If a non-empty match is required, use the C method:
1210              
1211             sub token_nonempty_part
1212             {
1213             my $self = shift;
1214              
1215             my $str = $parser->substring_before( "," );
1216             length $str or $self->fail( "Expected a string fragment before ," );
1217              
1218             return $str;
1219             }
1220              
1221             Note that unlike most of the other token parsing methods, this method does not
1222             consume either leading or trailing whitespace around the substring. It is
1223             expected that this method would be used as part a parser to read quoted
1224             strings, or similar cases where whitespace should be preserved.
1225              
1226             =head2 nonempty_substring_before
1227              
1228             $str = $parser->nonempty_substring_before( $literal );
1229              
1230             $str = $parser->nonempty_substring_before( qr/pattern/ );
1231              
1232             I
1233              
1234             A variant of L which fails if the matched part is empty.
1235              
1236             The example above could have been written:
1237              
1238             sub token_nonempty_part
1239             {
1240             my $self = shift;
1241              
1242             return $parser->nonempty_substring_before( "," );
1243             }
1244              
1245             This is often useful for breaking out of repeating loops; e.g.
1246              
1247             sub token_escaped_string
1248             {
1249             my $self = shift;
1250             $self->expect( '"' );
1251              
1252             my $ret = "";
1253             1 while $self->any_of(
1254             sub { $ret .= $self->nonempty_substring_before( qr/%|$/m ); 1 }
1255             sub { my $escape = ( $self->expect( qr/%(.)/ ) )[1];
1256             $ret .= _handle_escape( $escape );
1257             1 },
1258             sub { 0 },
1259             )
1260              
1261             return $ret;
1262             }
1263              
1264             =cut
1265              
1266             sub _substring_before
1267             {
1268 52     52   108 my $self = shift;
1269 52         142 my ( $expect, $fail_if_empty ) = @_;
1270              
1271 52 100       335 ref $expect or $expect = qr/\Q$expect/;
1272              
1273 52 100       220 my $endre = ( defined $self->{endofscope} ) ?
1274             qr/$expect|$self->{endofscope}/ :
1275             $expect;
1276              
1277             # NO skip_ws
1278              
1279 52         102 my $start = pos $self->{str};
1280 52         75 my $end;
1281 52 100       762 if( $self->{str} =~ m/\G(?s:.*?)($endre)/ ) {
1282 42         189 $end = $-[1];
1283             }
1284             else {
1285 10         24 $end = length $self->{str};
1286             }
1287              
1288 52 100 66     198 $self->fail( "Expected to find a non-empty substring before $expect" )
1289             if $fail_if_empty and $end == $start;
1290              
1291 51         183 return $self->take( $end - $start );
1292             }
1293              
1294             sub substring_before
1295             {
1296 51     51 1 128 my $self = shift;
1297 51         151 return $self->_substring_before( $_[0], 0 );
1298             }
1299              
1300             sub nonempty_substring_before
1301             {
1302 1     1 1 14 my $self = shift;
1303 1         4 return $self->_substring_before( $_[0], 1 );
1304             }
1305              
1306             =head2 generic_token
1307              
1308             $val = $parser->generic_token( $name, $re, $convert );
1309              
1310             I
1311              
1312             Expects to find a token matching the precompiled regexp C<$re>. If provided,
1313             the C<$convert> CODE reference can be used to convert the string into a more
1314             convenient form. C<$name> is used in the failure message if the pattern fails
1315             to match.
1316              
1317             If provided, the C<$convert> function will be passed the parser and the
1318             matching substring; the value it returns is returned from C.
1319              
1320             $convert->( $parser, $substr )
1321              
1322             If not provided, the substring will be returned as it stands.
1323              
1324             This method is mostly provided for subclasses to define their own token types.
1325             For example:
1326              
1327             sub token_hex
1328             {
1329             my $self = shift;
1330             $self->generic_token( hex => qr/[0-9A-F]{2}h/, sub { hex $_[1] } );
1331             }
1332              
1333             =cut
1334              
1335             sub generic_token
1336             {
1337 230     230 1 359 my $self = shift;
1338 230         536 my ( $name, $re, $convert ) = @_;
1339              
1340 230 50       643 $self->fail( "Expected $name" ) if $self->at_eos;
1341              
1342 230         648 $self->skip_ws;
1343 230 100       9785 $self->{str} =~ m/\G$re/gc or
1344             $self->fail( "Expected $name" );
1345              
1346 169         1088 my $match = substr( $self->{str}, $-[0], $+[0] - $-[0] );
1347              
1348 169 100       732 return $convert ? $convert->( $self, $match ) : $match;
1349             }
1350              
1351             sub _token_generic
1352             {
1353 226     226   337 my $self = shift;
1354 226         889 my %args = @_;
1355              
1356 226         422 my $name = $args{name};
1357 226 50       753 my $re = $args{pattern} ? $self->{patterns}{ $args{pattern} } : $args{re};
1358 226         378 my $convert = $args{convert};
1359              
1360 226         624 $self->generic_token( $name, $re, $convert );
1361             }
1362              
1363             =head2 token_int
1364              
1365             $int = $parser->token_int;
1366              
1367             Expects to find an integer in decimal, octal or hexadecimal notation, and
1368             consumes it. Negative integers, preceded by C<->, are also recognised.
1369              
1370             =cut
1371              
1372             sub token_int
1373             {
1374 134     134 1 375 my $self = shift;
1375             $self->_token_generic(
1376             name => "int",
1377              
1378             pattern => "int",
1379             convert => sub {
1380 104     104   168 my $int = $_[1];
1381 104 100       280 my $sign = ( $int =~ s/^-// ) ? -1 : 1;
1382              
1383 104         163 $int =~ s/^0o/0/;
1384              
1385 104 100       299 return $sign * oct $int if $int =~ m/^0/;
1386 99         963 return $sign * $int;
1387             },
1388 134         764 );
1389             }
1390              
1391             =head2 token_float
1392              
1393             $float = $parser->token_float;
1394              
1395             I
1396              
1397             Expects to find a number expressed in floating-point notation; a sequence of
1398             digits possibly prefixed by C<->, possibly containing a decimal point,
1399             possibly followed by an exponent specified by C followed by an integer. The
1400             numerical value is then returned.
1401              
1402             =cut
1403              
1404             sub token_float
1405             {
1406 20     20 1 79 my $self = shift;
1407             $self->_token_generic(
1408             name => "float",
1409              
1410             pattern => "float",
1411 18     18   194 convert => sub { $_[1] + 0 },
1412 20         195 );
1413             }
1414              
1415             =head2 token_number
1416              
1417             $number = $parser->token_number;
1418              
1419             I
1420              
1421             Expects to find a number expressed in either of the above forms.
1422              
1423             =cut
1424              
1425             sub token_number
1426             {
1427 7     7 1 40 my $self = shift;
1428 7         29 $self->any_of( \&token_float, \&token_int );
1429             }
1430              
1431             =head2 token_string
1432              
1433             $str = $parser->token_string;
1434              
1435             Expects to find a quoted string, and consumes it. The string should be quoted
1436             using C<"> or C<'> quote marks.
1437              
1438             The content of the quoted string can contain character escapes similar to
1439             those accepted by C or Perl. Specifically, the following forms are recognised:
1440              
1441             =for highlighter
1442              
1443             \a Bell ("alert")
1444             \b Backspace
1445             \e Escape
1446             \f Form feed
1447             \n Newline
1448             \r Return
1449             \t Horizontal Tab
1450             \0, \012 Octal character
1451             \x34, \x{5678} Hexadecimal character
1452              
1453             =for highlighter language=perl
1454              
1455             C's C<\v> for vertical tab is not supported as it is rarely used in practice
1456             and it collides with Perl's C<\v> regexp escape. Perl's C<\c> for forming other
1457             control characters is also not supported.
1458              
1459             =cut
1460              
1461             my %escapes = (
1462             a => "\a",
1463             b => "\b",
1464             e => "\e",
1465             f => "\f",
1466             n => "\n",
1467             r => "\r",
1468             t => "\t",
1469             );
1470              
1471             sub token_string
1472             {
1473 53     53 1 158 my $self = shift;
1474              
1475 53 100       115 $self->fail( "Expected string" ) if $self->at_eos;
1476              
1477 52         92 my $pos = pos $self->{str};
1478              
1479 52         119 $self->skip_ws;
1480 52 100       653 $self->{str} =~ m/\G($self->{patterns}{string_delim})/gc or
1481             $self->fail( "Expected string delimiter" );
1482              
1483 32         84 my $delim = $1;
1484              
1485             $self->{str} =~ m/
1486             \G(
1487             (?:
1488             \\[0-7]{1,3} # octal escape
1489             |\\x[0-9A-F]{2} # 2-digit hex escape
1490             |\\x\{[0-9A-F]+\} # {}-delimited hex escape
1491             |\\. # symbolic escape
1492             |[^\\$delim]+ # plain chunk
1493             )*?
1494             )$delim/gcix or
1495 32 50       1230 pos($self->{str}) = $pos, $self->fail( "Expected contents of string" );
1496              
1497 32         90 my $string = $1;
1498              
1499 32         85 $string =~ s<\\(?:([0-7]{1,3})|x([0-9A-F]{2})|x\{([0-9A-F]+)\}|(.))>
1500             [defined $1 ? chr oct $1 :
1501             defined $2 ? chr hex $2 :
1502             defined $3 ? chr hex $3 :
1503 11 50       59 exists $escapes{$4} ? $escapes{$4} : $4]egi;
    100          
    100          
    100          
1504              
1505 32         114 return $string;
1506             }
1507              
1508             =head2 token_ident
1509              
1510             $ident = $parser->token_ident;
1511              
1512             Expects to find an identifier, and consumes it.
1513              
1514             =cut
1515              
1516             sub token_ident
1517             {
1518 72     72 1 189 my $self = shift;
1519 72         206 $self->_token_generic(
1520             name => "ident",
1521              
1522             pattern => "ident",
1523             );
1524             }
1525              
1526             =head2 token_kw
1527              
1528             $keyword = $parser->token_kw( @keywords );
1529              
1530             Expects to find a keyword, and consumes it. A keyword is defined as an
1531             identifier which is exactly one of the literal values passed in.
1532              
1533             =cut
1534              
1535             sub token_kw
1536             {
1537 2     2 1 24 my $self = shift;
1538 2         12 my @acceptable = @_;
1539              
1540 2         15 $self->skip_ws;
1541              
1542 2         6 my $pos = pos $self->{str};
1543              
1544 2 50       14 defined( my $kw = $self->token_ident ) or
1545             return undef;
1546              
1547 4         33 grep { $_ eq $kw } @acceptable or
1548 2 100       8 pos($self->{str}) = $pos, $self->fail( "Expected any of ".join( ", ", @acceptable ) );
1549              
1550 1         4 return $kw;
1551             }
1552              
1553             package # hide from indexer
1554             Parser::MGC::Failure;
1555              
1556             sub new
1557             {
1558 287     287   481 my $class = shift;
1559 287         636 my $self = bless {}, $class;
1560 287         579 @{$self}{qw( message parser pos )} = @_;
  287         1789  
1561 287         2040 return $self;
1562             }
1563              
1564 33     33   387 use overload '""' => "STRING";
  33         122  
  33         431  
1565             sub STRING
1566             {
1567 70     70   6295 my $self = shift;
1568              
1569 70         134 my $parser = $self->{parser};
1570 70         294 my ( $linenum, $col, $text ) = $parser->where( $self->{pos} );
1571              
1572             # Column number only counts characters. There may be tabs in there.
1573             # Rather than trying to calculate the visual column number, just print the
1574             # indentation as it stands.
1575              
1576 70         178 my $indent = substr( $text, 0, $col );
1577 70         243 $indent =~ s/[^ \t]/ /g; # blank out all the non-whitespace
1578              
1579 70         296 my $filename = $parser->filename;
1580 70 100 100     259 my $in_file = ( defined $filename and !ref $filename )
1581             ? "in $filename " : "";
1582              
1583 70         566 return "$self->{message} ${in_file}on line $linenum at:\n" .
1584             "$text\n" .
1585             "$indent^\n";
1586             }
1587              
1588             # Provide fallback operators for cmp, eq, etc...
1589 33     33   11376 use overload fallback => 1;
  33         177  
  33         263  
1590              
1591             =head1 EXAMPLES
1592              
1593             =head2 Accumulating Results Using Variables
1594              
1595             Although the structure-forming methods all return a value, obtained from their
1596             nested parsing code, it can sometimes be more convenient to use a variable to
1597             accumulate a result in instead. For example, consider the following parser
1598             method, designed to parse a set of C assignments, such as might
1599             be found in a configuration file, or YAML/JSON-style mapping value.
1600              
1601             sub parse_dict
1602             {
1603             my $self = shift;
1604              
1605             my %ret;
1606             $self->list_of( ",", sub {
1607             my $key = $self->token_ident;
1608             exists $ret{$key} and $self->fail( "Already have a mapping for '$key'" );
1609              
1610             $self->expect( ":" );
1611              
1612             $ret{$key} = $self->parse_value;
1613             } );
1614              
1615             return \%ret
1616             }
1617              
1618             Instead of using the return value from C, this method accumulates
1619             values in the C<%ret> hash, eventually returning a reference to it as its
1620             result. Because of this, it can perform some error checking while it parses;
1621             namely, rejecting duplicate keys.
1622              
1623             =head1 TODO
1624              
1625             =for highlighter
1626              
1627             =over 4
1628              
1629             =item *
1630              
1631             Make unescaping of string constants more customisable. Possibly consider
1632             instead a C using a loop over C.
1633              
1634             =item *
1635              
1636             Easy ability for subclasses to define more token types as methods. Perhaps
1637             provide a class method such as
1638              
1639             __PACKAGE__->has_token( hex => qr/[0-9A-F]+/i, sub { hex $_[1] } );
1640              
1641             =item *
1642              
1643             Investigate how well C can cope with buffer splitting across
1644             other tokens than simply skippable whitespace
1645              
1646             =back
1647              
1648             =head1 AUTHOR
1649              
1650             Paul Evans
1651              
1652             =cut
1653              
1654             0x55AA;