File Coverage

blib/lib/App/Mowyw/Lexer.pm
Criterion Covered Total %
statement 55 56 98.2
branch 22 24 91.6
condition n/a
subroutine 4 4 100.0
pod 0 1 0.0
total 81 85 95.2


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2 12     12   1205 use warnings;
  12         13  
  12         287  
3 12     12   32 use strict;
  12         13  
  12         5186  
4             package App::Mowyw::Lexer;
5              
6             =pod
7             =head1 NAME
8              
9             App::Mowyw::Lexer - Simple Lexer
10              
11             =head1 SYNOPSIS
12            
13             use App::Mowyw::Lexer qw(lex);
14             # suppose you want to parse simple math expressions
15             my @input_tokens = (
16             ['Int', qr/(?:-|\+)?\d+/],
17             ['Op', qr/\+|\*|-|\//],
18             ['Brace_Open', qr/\(/],
19             ['Brace_Close', qr/\)/],
20             ['Whitespace', qr/\s/, sub { return undef; }],
21             );
22             my $text = "-12 * (3+4)";
23             foreach (lex($text, \@input_tokens){
24             my ($name, $text, $position, $line) = @$_;
25             print "Found Token $name: '$text'\n"
26             print " at position $position line $line\n";
27             }
28              
29             =head1 DESCRIPTION
30              
31             App::Mowyw::Lexer is a simple lexer that breaks up a text into tokens according to
32             regexes you provide.
33              
34             The only exported subroutine is C, which expects input text as its first
35             argument, and a array references as second argument, which contains arrays of
36             token names and regexes.
37              
38             Each input token consists of a token name (which you can choose freely), a
39             regexwhich matches the desired token, and optionally a reference to a
40             functions that takes the matched token text as its argument. The token text is
41             replaced by the return value of that function. If the function returns undef,
42             that token will not be included in the list of output tokens.
43              
44             C returns a list of output tokens, each output token is a reference to a
45             list which contains the token name, matched text, position of the match in the
46             input string (zero-based, suitable for passing to C), and line number
47             of the start of the match (one-based, suitable for humans).
48              
49             If there is unmatched text, it is returned with the token name C.
50              
51             =head1 COPYRIGHT AND LICENSE
52              
53             Copyright (C) 2007,2009 by Moritz Lenz, http://perlgeek.de/, moritz@faui2k3.org
54              
55             This Program and its Documentation is free software. You may distribute it
56             under the terms of the Artistic License 2.0 as published by The Perl
57             Foundation.
58              
59             However all code examples are public domain, so you can use it in any way you
60             want to.
61              
62             =cut
63              
64             require Exporter;
65             our @ISA = qw(Exporter);
66             our @EXPORT = qw(lex);
67              
68             our %EXPORT_TAGS = (":all" => \@EXPORT);
69              
70             sub lex {
71 52     52 0 82 my ($text, $tokens) = @_;
72              
73 52         57 my ($last_line_number, $last_pos) = (0, 0);
74             my $pos_and_line_number = sub {
75 398     398   316 my $pos = shift;
76 398         527 $last_line_number +=
77             (substr($text, $last_pos, $pos - $last_pos) =~ tr/\n//);
78 398         359 $last_pos = $pos;
79 398         782 return ($pos, $last_line_number + 1);
80 52         175 };
81              
82 52 100       120 return () unless length $text;
83 51         47 my @res;
84 51         92 pos($text) = 0;
85 51         127 while (pos($text) < length($text)){
86 305         216 my $matched = 0;
87             # try to match at the start of $text
88 305         351 foreach (@$tokens){
89 1463         1167 my $re = $_->[1];
90 1463 100       21256 if ($text =~ m#\G($re)#gc){
91 174         142 $matched = 1;
92 174         236 my $match = $1;
93 174 50       285 die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
94 174         192 my $token_pos = pos($text) - length($match);
95 174 100       253 if (my $fun = $_->[2]){
96 10         18 $match = $fun->($match);
97             }
98 174 100       243 if (defined $match){
99 168         234 push @res, [$_->[0],
100             $match,
101             $pos_and_line_number->($token_pos),
102             ];
103             }
104 174         247 last;
105             }
106             }
107 305 100       758 unless ($matched){
108 131         102 my $next_token;
109             my $next_token_match;
110 0         0 my $match;
111              
112 131         120 my $min = length($text);
113 131         108 my $pos = pos($text);
114              
115             # find the token that matches first
116 131         95 my $token_pos;
117 131         152 foreach(@$tokens){
118 913         703 my $re = $_->[1];
119 913 100       12639 if ($text =~ m#\G((?s:.)*?)($re)#gc){
120 289 100       653 if ($+[1] < $min){
121 173         239 $min = $+[1];
122 173         175 $next_token = $_;
123 173         199 $next_token_match = $2;
124 173         174 $match = $1;
125 173         206 $token_pos = pos($text) - length($match);
126             }
127             }
128 913         1849 pos($text) = $pos;
129             }
130 131 100       195 if (defined $match){
131 100         197 push @res, ['UNMATCHED',
132             $match,
133             $pos_and_line_number->($token_pos - length($match)),
134             ];
135 100 50       186 die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n"
136             unless (length($next_token_match) > 0);
137 100 100       181 if (my $fun = $next_token->[2]){
138 1         2 $match = $fun->($match);
139             }
140 100 100       206 push @res, [$next_token->[0],
141             $next_token_match,
142             $pos_and_line_number->($min),
143             ] if defined $match;
144 100         301 pos($text) = $min + length($next_token_match);
145             } else {
146 31         64 push @res, ['UNMATCHED',
147             substr($text, $pos),
148             $pos_and_line_number->($pos)
149             ];
150 31         98 pos($text) = length($text);
151             }
152             }
153             }
154 51         312 return @res;
155             }
156             -1;
157              
158             # vim: sw=4 ts=4 expandtab