File Coverage

blib/lib/App/Mowyw/Lexer.pm
Criterion Covered Total %
statement 55 56 98.2
branch 22 24 91.6
condition n/a
subroutine 4 4 100.0
pod 0 1 0.0
total 81 85 95.2


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2 12     12   1980 use warnings;
  12         23  
  12         347  
3 12     12   56 use strict;
  12         24  
  12         8428  
4             package App::Mowyw::Lexer;
5              
6             =pod
7             =head1 NAME
8              
9             App::Mowyw::Lexer - Simple Lexer
10              
11             =head1 SYNOPSIS
12            
13             use App::Mowyw::Lexer qw(lex);
14             # suppose you want to parse simple math expressions
15             my @input_tokens = (
16             ['Int', qr/(?:-|\+)?\d+/],
17             ['Op', qr/\+|\*|-|\//],
18             ['Brace_Open', qr/\(/],
19             ['Brace_Close', qr/\)/],
20             ['Whitespace', qr/\s/, sub { return undef; }],
21             );
22             my $text = "-12 * (3+4)";
23             foreach (lex($text, \@input_tokens){
24             my ($name, $text, $position, $line) = @$_;
25             print "Found Token $name: '$text'\n"
26             print " at position $position line $line\n";
27             }
28              
29             =head1 DESCRIPTION
30              
31             App::Mowyw::Lexer is a simple lexer that breaks up a text into tokens according to
32             regexes you provide.
33              
34             The only exported subroutine is C, which expects input text as its first
35             argument, and a array references as second argument, which contains arrays of
36             token names and regexes.
37              
38             Each input token consists of a token name (which you can choose freely), a
39             regexwhich matches the desired token, and optionally a reference to a
40             functions that takes the matched token text as its argument. The token text is
41             replaced by the return value of that function. If the function returns undef,
42             that token will not be included in the list of output tokens.
43              
44             C returns a list of output tokens, each output token is a reference to a
45             list which contains the token name, matched text, position of the match in the
46             input string (zero-based, suitable for passing to C), and line number
47             of the start of the match (one-based, suitable for humans).
48              
49             If there is unmatched text, it is returned with the token name C.
50              
51             =head1 COPYRIGHT AND LICENSE
52              
53             Copyright (C) 2007,2009 by Moritz Lenz, http://perlgeek.de/, moritz@faui2k3.org
54              
55             This Program and its Documentation is free software. You may distribute it
56             under the terms of the Artistic License 2.0 as published by The Perl
57             Foundation.
58              
59             However all code examples are public domain, so you can use it in any way you
60             want to.
61              
62             =cut
63              
64             require Exporter;
65             our @ISA = qw(Exporter);
66             our @EXPORT = qw(lex);
67              
68             our %EXPORT_TAGS = (":all" => \@EXPORT);
69              
70             sub lex {
71 52     52 0 122 my ($text, $tokens) = @_;
72              
73 52         82 my ($last_line_number, $last_pos) = (0, 0);
74             my $pos_and_line_number = sub {
75 398     398   494 my $pos = shift;
76 398         855 $last_line_number +=
77             (substr($text, $last_pos, $pos - $last_pos) =~ tr/\n//);
78 398         448 $last_pos = $pos;
79 398         1300 return ($pos, $last_line_number + 1);
80 52         292 };
81              
82 52 100       183 return () unless length $text;
83 51         66 my @res;
84 51         145 pos($text) = 0;
85 51         177 while (pos($text) < length($text)){
86 305         352 my $matched = 0;
87             # try to match at the start of $text
88 305         515 foreach (@$tokens){
89 1463         2057 my $re = $_->[1];
90 1463 100       46448 if ($text =~ m#\G($re)#gc){
91 174         221 $matched = 1;
92 174         321 my $match = $1;
93 174 50       409 die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
94 174         295 my $token_pos = pos($text) - length($match);
95 174 100       376 if (my $fun = $_->[2]){
96 10         25 $match = $fun->($match);
97             }
98 174 100       347 if (defined $match){
99 168         341 push @res, [$_->[0],
100             $match,
101             $pos_and_line_number->($token_pos),
102             ];
103             }
104 174         429 last;
105             }
106             }
107 305 100       1221 unless ($matched){
108 131         149 my $next_token;
109             my $next_token_match;
110 0         0 my $match;
111              
112 131         185 my $min = length($text);
113 131         173 my $pos = pos($text);
114              
115             # find the token that matches first
116 131         135 my $token_pos;
117 131         215 foreach(@$tokens){
118 913         1210 my $re = $_->[1];
119 913 100       26039 if ($text =~ m#\G((?s:.)*?)($re)#gc){
120 289 100       1019 if ($+[1] < $min){
121 173         359 $min = $+[1];
122 173         254 $next_token = $_;
123 173         393 $next_token_match = $2;
124 173         325 $match = $1;
125 173         335 $token_pos = pos($text) - length($match);
126             }
127             }
128 913         3467 pos($text) = $pos;
129             }
130 131 100       300 if (defined $match){
131 100         264 push @res, ['UNMATCHED',
132             $match,
133             $pos_and_line_number->($token_pos - length($match)),
134             ];
135 100 50       272 die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n"
136             unless (length($next_token_match) > 0);
137 100 100       304 if (my $fun = $next_token->[2]){
138 1         4 $match = $fun->($match);
139             }
140 100 100       850 push @res, [$next_token->[0],
141             $next_token_match,
142             $pos_and_line_number->($min),
143             ] if defined $match;
144 100         557 pos($text) = $min + length($next_token_match);
145             } else {
146 31         93 push @res, ['UNMATCHED',
147             substr($text, $pos),
148             $pos_and_line_number->($pos)
149             ];
150 31         163 pos($text) = length($text);
151             }
152             }
153             }
154 51         435 return @res;
155             }
156             -1;
157              
158             # vim: sw=4 ts=4 expandtab