| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | #!/usr/bin/perl | 
| 2 | 12 |  |  | 12 |  | 1980 | use warnings; | 
|  | 12 |  |  |  |  | 23 |  | 
|  | 12 |  |  |  |  | 347 |  | 
| 3 | 12 |  |  | 12 |  | 56 | use strict; | 
|  | 12 |  |  |  |  | 24 |  | 
|  | 12 |  |  |  |  | 8428 |  | 
| 4 |  |  |  |  |  |  | package App::Mowyw::Lexer; | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | =pod | 
| 7 |  |  |  |  |  |  | =head1 NAME | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | App::Mowyw::Lexer - Simple Lexer | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | use App::Mowyw::Lexer qw(lex); | 
| 14 |  |  |  |  |  |  | # suppose you want to parse simple math expressions | 
| 15 |  |  |  |  |  |  | my @input_tokens = ( | 
| 16 |  |  |  |  |  |  | ['Int',     qr/(?:-|\+)?\d+/], | 
| 17 |  |  |  |  |  |  | ['Op',      qr/\+|\*|-|\//], | 
| 18 |  |  |  |  |  |  | ['Brace_Open',  qr/\(/], | 
| 19 |  |  |  |  |  |  | ['Brace_Close', qr/\)/], | 
| 20 |  |  |  |  |  |  | ['Whitespace',  qr/\s/, sub { return undef; }], | 
| 21 |  |  |  |  |  |  | ); | 
| 22 |  |  |  |  |  |  | my $text = "-12 * (3+4)"; | 
| 23 |  |  |  |  |  |  | foreach (lex($text, \@input_tokens){ | 
| 24 |  |  |  |  |  |  | my ($name, $text, $position, $line) = @$_; | 
| 25 |  |  |  |  |  |  | print "Found Token $name: '$text'\n" | 
| 26 |  |  |  |  |  |  | print "    at position $position line $line\n"; | 
| 27 |  |  |  |  |  |  | } | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | App::Mowyw::Lexer is a simple lexer that breaks up a text into tokens according to | 
| 32 |  |  |  |  |  |  | regexes you provide. | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | The only exported subroutine is C, which expects input text as its first | 
| 35 |  |  |  |  |  |  | argument, and a array references as second argument, which contains arrays of | 
| 36 |  |  |  |  |  |  | token names and regexes. | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | Each input token consists of a token name (which you can choose freely), a | 
| 39 |  |  |  |  |  |  | regexwhich matches the desired token, and optionally a reference to a | 
| 40 |  |  |  |  |  |  | functions that takes the matched token text as its argument. The token text is | 
| 41 |  |  |  |  |  |  | replaced by the return value of that function. If the function returns undef, | 
| 42 |  |  |  |  |  |  | that token will not be included in the list of output tokens. | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  | C returns a list of output tokens, each output token is a reference to a | 
| 45 |  |  |  |  |  |  | list which contains the token name, matched text, position of the match in the | 
| 46 |  |  |  |  |  |  | input string (zero-based, suitable for passing to C), and line number | 
| 47 |  |  |  |  |  |  | of the start of the match (one-based, suitable for humans). | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | If there is unmatched text, it is returned with the token name C. | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | Copyright (C) 2007,2009 by Moritz Lenz, http://perlgeek.de/, moritz@faui2k3.org | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | This Program and its Documentation is free software. You may distribute it | 
| 56 |  |  |  |  |  |  | under the terms of the Artistic License 2.0 as published by The Perl | 
| 57 |  |  |  |  |  |  | Foundation. | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | However all code examples are public domain, so you can use it in any way you | 
| 60 |  |  |  |  |  |  | want to. | 
| 61 |  |  |  |  |  |  |  | 
| 62 |  |  |  |  |  |  | =cut | 
| 63 |  |  |  |  |  |  |  | 
| 64 |  |  |  |  |  |  | require Exporter; | 
| 65 |  |  |  |  |  |  | our @ISA = qw(Exporter); | 
| 66 |  |  |  |  |  |  | our @EXPORT = qw(lex); | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | our %EXPORT_TAGS = (":all" => \@EXPORT); | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | sub lex { | 
| 71 | 52 |  |  | 52 | 0 | 122 | my ($text, $tokens) = @_; | 
| 72 |  |  |  |  |  |  |  | 
| 73 | 52 |  |  |  |  | 82 | my ($last_line_number, $last_pos) = (0, 0); | 
| 74 |  |  |  |  |  |  | my $pos_and_line_number = sub { | 
| 75 | 398 |  |  | 398 |  | 494 | my $pos = shift; | 
| 76 | 398 |  |  |  |  | 855 | $last_line_number += | 
| 77 |  |  |  |  |  |  | (substr($text, $last_pos, $pos - $last_pos) =~ tr/\n//); | 
| 78 | 398 |  |  |  |  | 448 | $last_pos = $pos; | 
| 79 | 398 |  |  |  |  | 1300 | return ($pos, $last_line_number + 1); | 
| 80 | 52 |  |  |  |  | 292 | }; | 
| 81 |  |  |  |  |  |  |  | 
| 82 | 52 | 100 |  |  |  | 183 | return () unless length $text; | 
| 83 | 51 |  |  |  |  | 66 | my @res; | 
| 84 | 51 |  |  |  |  | 145 | pos($text) = 0; | 
| 85 | 51 |  |  |  |  | 177 | while (pos($text) < length($text)){ | 
| 86 | 305 |  |  |  |  | 352 | my $matched = 0; | 
| 87 |  |  |  |  |  |  | # try to match at the start of $text | 
| 88 | 305 |  |  |  |  | 515 | foreach (@$tokens){ | 
| 89 | 1463 |  |  |  |  | 2057 | my $re = $_->[1]; | 
| 90 | 1463 | 100 |  |  |  | 46448 | if ($text =~ m#\G($re)#gc){ | 
| 91 | 174 |  |  |  |  | 221 | $matched = 1; | 
| 92 | 174 |  |  |  |  | 321 | my $match = $1; | 
| 93 | 174 | 50 |  |  |  | 409 | die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0); | 
| 94 | 174 |  |  |  |  | 295 | my $token_pos = pos($text)  - length($match); | 
| 95 | 174 | 100 |  |  |  | 376 | if (my $fun = $_->[2]){ | 
| 96 | 10 |  |  |  |  | 25 | $match = $fun->($match); | 
| 97 |  |  |  |  |  |  | } | 
| 98 | 174 | 100 |  |  |  | 347 | if (defined $match){ | 
| 99 | 168 |  |  |  |  | 341 | push @res, [$_->[0], | 
| 100 |  |  |  |  |  |  | $match, | 
| 101 |  |  |  |  |  |  | $pos_and_line_number->($token_pos), | 
| 102 |  |  |  |  |  |  | ]; | 
| 103 |  |  |  |  |  |  | } | 
| 104 | 174 |  |  |  |  | 429 | last; | 
| 105 |  |  |  |  |  |  | } | 
| 106 |  |  |  |  |  |  | } | 
| 107 | 305 | 100 |  |  |  | 1221 | unless ($matched){ | 
| 108 | 131 |  |  |  |  | 149 | my $next_token; | 
| 109 |  |  |  |  |  |  | my $next_token_match; | 
| 110 | 0 |  |  |  |  | 0 | my $match; | 
| 111 |  |  |  |  |  |  |  | 
| 112 | 131 |  |  |  |  | 185 | my $min = length($text); | 
| 113 | 131 |  |  |  |  | 173 | my $pos = pos($text); | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | # find the token that matches first | 
| 116 | 131 |  |  |  |  | 135 | my $token_pos; | 
| 117 | 131 |  |  |  |  | 215 | foreach(@$tokens){ | 
| 118 | 913 |  |  |  |  | 1210 | my $re = $_->[1]; | 
| 119 | 913 | 100 |  |  |  | 26039 | if ($text =~ m#\G((?s:.)*?)($re)#gc){ | 
| 120 | 289 | 100 |  |  |  | 1019 | if ($+[1] < $min){ | 
| 121 | 173 |  |  |  |  | 359 | $min              = $+[1]; | 
| 122 | 173 |  |  |  |  | 254 | $next_token       = $_; | 
| 123 | 173 |  |  |  |  | 393 | $next_token_match = $2; | 
| 124 | 173 |  |  |  |  | 325 | $match            = $1; | 
| 125 | 173 |  |  |  |  | 335 | $token_pos        = pos($text) - length($match); | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  | } | 
| 128 | 913 |  |  |  |  | 3467 | pos($text) = $pos; | 
| 129 |  |  |  |  |  |  | } | 
| 130 | 131 | 100 |  |  |  | 300 | if (defined $match){ | 
| 131 | 100 |  |  |  |  | 264 | push @res, ['UNMATCHED', | 
| 132 |  |  |  |  |  |  | $match, | 
| 133 |  |  |  |  |  |  | $pos_and_line_number->($token_pos - length($match)), | 
| 134 |  |  |  |  |  |  | ]; | 
| 135 | 100 | 50 |  |  |  | 272 | die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" | 
| 136 |  |  |  |  |  |  | unless (length($next_token_match) > 0); | 
| 137 | 100 | 100 |  |  |  | 304 | if (my $fun = $next_token->[2]){ | 
| 138 | 1 |  |  |  |  | 4 | $match = $fun->($match); | 
| 139 |  |  |  |  |  |  | } | 
| 140 | 100 | 100 |  |  |  | 850 | push @res, [$next_token->[0], | 
| 141 |  |  |  |  |  |  | $next_token_match, | 
| 142 |  |  |  |  |  |  | $pos_and_line_number->($min), | 
| 143 |  |  |  |  |  |  | ] if defined $match; | 
| 144 | 100 |  |  |  |  | 557 | pos($text) = $min + length($next_token_match); | 
| 145 |  |  |  |  |  |  | } else { | 
| 146 | 31 |  |  |  |  | 93 | push @res, ['UNMATCHED', | 
| 147 |  |  |  |  |  |  | substr($text, $pos), | 
| 148 |  |  |  |  |  |  | $pos_and_line_number->($pos) | 
| 149 |  |  |  |  |  |  | ]; | 
| 150 | 31 |  |  |  |  | 163 | pos($text) = length($text); | 
| 151 |  |  |  |  |  |  | } | 
| 152 |  |  |  |  |  |  | } | 
| 153 |  |  |  |  |  |  | } | 
| 154 | 51 |  |  |  |  | 435 | return @res; | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  | -1; | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | # vim: sw=4 ts=4 expandtab |