File Coverage

blib/lib/App/Mowyw/Lexer.pm

Criterion	Covered	Total	%
statement	55	56	98.2
branch	22	24	91.6
condition			n/a
subroutine	4	4	100.0
pod	0	1	0.0
total	81	85	95.2

line	stmt	bran	sub	pod	time	code
1						#!/usr/bin/perl
2	12		12		1205	use warnings;
	12				13
	12				287
3	12		12		32	use strict;
	12				13
	12				5186
4						package App::Mowyw::Lexer;
5
6						=pod
7						=head1 NAME
8
9						App::Mowyw::Lexer - Simple Lexer
10
11						=head1 SYNOPSIS
12
13						use App::Mowyw::Lexer qw(lex);
14						# suppose you want to parse simple math expressions
15						my @input_tokens = (
16						['Int', qr/(?:-\|\+)?\d+/],
17						['Op', qr/\+\|\*\|-\|\//],
18						['Brace_Open', qr/\(/],
19						['Brace_Close', qr/\)/],
20						['Whitespace', qr/\s/, sub { return undef; }],
21						);
22						my $text = "-12 * (3+4)";
23						foreach (lex($text, \@input_tokens){
24						my ($name, $text, $position, $line) = @$_;
25						print "Found Token $name: '$text'\n"
26						print " at position $position line $line\n";
27						}
28
29						=head1 DESCRIPTION
30
31						App::Mowyw::Lexer is a simple lexer that breaks up a text into tokens according to
32						regexes you provide.
33
34						The only exported subroutine is C, which expects input text as its first
35						argument, and a array references as second argument, which contains arrays of
36						token names and regexes.
37
38						Each input token consists of a token name (which you can choose freely), a
39						regexwhich matches the desired token, and optionally a reference to a
40						functions that takes the matched token text as its argument. The token text is
41						replaced by the return value of that function. If the function returns undef,
42						that token will not be included in the list of output tokens.
43
44						C returns a list of output tokens, each output token is a reference to a
45						list which contains the token name, matched text, position of the match in the
46						input string (zero-based, suitable for passing to C), and line number
47						of the start of the match (one-based, suitable for humans).
48
49						If there is unmatched text, it is returned with the token name C.
50
51						=head1 COPYRIGHT AND LICENSE
52
53						Copyright (C) 2007,2009 by Moritz Lenz, http://perlgeek.de/, moritz@faui2k3.org
54
55						This Program and its Documentation is free software. You may distribute it
56						under the terms of the Artistic License 2.0 as published by The Perl
57						Foundation.
58
59						However all code examples are public domain, so you can use it in any way you
60						want to.
61
62						=cut
63
64						require Exporter;
65						our @ISA = qw(Exporter);
66						our @EXPORT = qw(lex);
67
68						our %EXPORT_TAGS = (":all" => \@EXPORT);
69
70						sub lex {
71	52		52	0	82	my ($text, $tokens) = @_;
72
73	52				57	my ($last_line_number, $last_pos) = (0, 0);
74						my $pos_and_line_number = sub {
75	398		398		316	my $pos = shift;
76	398				527	$last_line_number +=
77						(substr($text, $last_pos, $pos - $last_pos) =~ tr/\n//);
78	398				359	$last_pos = $pos;
79	398				782	return ($pos, $last_line_number + 1);
80	52				175	};
81
82	52	100			120	return () unless length $text;
83	51				47	my @res;
84	51				92	pos($text) = 0;
85	51				127	while (pos($text) < length($text)){
86	305				216	my $matched = 0;
87						# try to match at the start of $text
88	305				351	foreach (@$tokens){
89	1463				1167	my $re = $_->[1];
90	1463	100			21256	if ($text =~ m#\G($re)#gc){
91	174				142	$matched = 1;
92	174				236	my $match = $1;
93	174	50			285	die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
94	174				192	my $token_pos = pos($text) - length($match);
95	174	100			253	if (my $fun = $_->[2]){
96	10				18	$match = $fun->($match);
97						}
98	174	100			243	if (defined $match){
99	168				234	push @res, [$_->[0],
100						$match,
101						$pos_and_line_number->($token_pos),
102						];
103						}
104	174				247	last;
105						}
106						}
107	305	100			758	unless ($matched){
108	131				102	my $next_token;
109						my $next_token_match;
110	0				0	my $match;
111
112	131				120	my $min = length($text);
113	131				108	my $pos = pos($text);
114
115						# find the token that matches first
116	131				95	my $token_pos;
117	131				152	foreach(@$tokens){
118	913				703	my $re = $_->[1];
119	913	100			12639	if ($text =~ m#\G((?s:.)*?)($re)#gc){
120	289	100			653	if ($+[1] < $min){
121	173				239	$min = $+[1];
122	173				175	$next_token = $_;
123	173				199	$next_token_match = $2;
124	173				174	$match = $1;
125	173				206	$token_pos = pos($text) - length($match);
126						}
127						}
128	913				1849	pos($text) = $pos;
129						}
130	131	100			195	if (defined $match){
131	100				197	push @res, ['UNMATCHED',
132						$match,
133						$pos_and_line_number->($token_pos - length($match)),
134						];
135	100	50			186	die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n"
136						unless (length($next_token_match) > 0);
137	100	100			181	if (my $fun = $next_token->[2]){
138	1				2	$match = $fun->($match);
139						}
140	100	100			206	push @res, [$next_token->[0],
141						$next_token_match,
142						$pos_and_line_number->($min),
143						] if defined $match;
144	100				301	pos($text) = $min + length($next_token_match);
145						} else {
146	31				64	push @res, ['UNMATCHED',
147						substr($text, $pos),
148						$pos_and_line_number->($pos)
149						];
150	31				98	pos($text) = length($text);
151						}
152						}
153						}
154	51				312	return @res;
155						}
156						-1;
157
158						# vim: sw=4 ts=4 expandtab