File Coverage

blib/lib/App/Mowyw/Lexer.pm

Criterion	Covered	Total	%
statement	55	56	98.2
branch	22	24	91.6
condition			n/a
subroutine	4	4	100.0
pod	0	1	0.0
total	81	85	95.2

line	stmt	bran	sub	pod	time	code
1						#!/usr/bin/perl
2	12		12		1980	use warnings;
	12				23
	12				347
3	12		12		56	use strict;
	12				24
	12				8428
4						package App::Mowyw::Lexer;
5
6						=pod
7						=head1 NAME
8
9						App::Mowyw::Lexer - Simple Lexer
10
11						=head1 SYNOPSIS
12
13						use App::Mowyw::Lexer qw(lex);
14						# suppose you want to parse simple math expressions
15						my @input_tokens = (
16						['Int', qr/(?:-\|\+)?\d+/],
17						['Op', qr/\+\|\*\|-\|\//],
18						['Brace_Open', qr/\(/],
19						['Brace_Close', qr/\)/],
20						['Whitespace', qr/\s/, sub { return undef; }],
21						);
22						my $text = "-12 * (3+4)";
23						foreach (lex($text, \@input_tokens){
24						my ($name, $text, $position, $line) = @$_;
25						print "Found Token $name: '$text'\n"
26						print " at position $position line $line\n";
27						}
28
29						=head1 DESCRIPTION
30
31						App::Mowyw::Lexer is a simple lexer that breaks up a text into tokens according to
32						regexes you provide.
33
34						The only exported subroutine is C, which expects input text as its first
35						argument, and a array references as second argument, which contains arrays of
36						token names and regexes.
37
38						Each input token consists of a token name (which you can choose freely), a
39						regexwhich matches the desired token, and optionally a reference to a
40						functions that takes the matched token text as its argument. The token text is
41						replaced by the return value of that function. If the function returns undef,
42						that token will not be included in the list of output tokens.
43
44						C returns a list of output tokens, each output token is a reference to a
45						list which contains the token name, matched text, position of the match in the
46						input string (zero-based, suitable for passing to C), and line number
47						of the start of the match (one-based, suitable for humans).
48
49						If there is unmatched text, it is returned with the token name C.
50
51						=head1 COPYRIGHT AND LICENSE
52
53						Copyright (C) 2007,2009 by Moritz Lenz, http://perlgeek.de/, moritz@faui2k3.org
54
55						This Program and its Documentation is free software. You may distribute it
56						under the terms of the Artistic License 2.0 as published by The Perl
57						Foundation.
58
59						However all code examples are public domain, so you can use it in any way you
60						want to.
61
62						=cut
63
64						require Exporter;
65						our @ISA = qw(Exporter);
66						our @EXPORT = qw(lex);
67
68						our %EXPORT_TAGS = (":all" => \@EXPORT);
69
70						sub lex {
71	52		52	0	122	my ($text, $tokens) = @_;
72
73	52				82	my ($last_line_number, $last_pos) = (0, 0);
74						my $pos_and_line_number = sub {
75	398		398		494	my $pos = shift;
76	398				855	$last_line_number +=
77						(substr($text, $last_pos, $pos - $last_pos) =~ tr/\n//);
78	398				448	$last_pos = $pos;
79	398				1300	return ($pos, $last_line_number + 1);
80	52				292	};
81
82	52	100			183	return () unless length $text;
83	51				66	my @res;
84	51				145	pos($text) = 0;
85	51				177	while (pos($text) < length($text)){
86	305				352	my $matched = 0;
87						# try to match at the start of $text
88	305				515	foreach (@$tokens){
89	1463				2057	my $re = $_->[1];
90	1463	100			46448	if ($text =~ m#\G($re)#gc){
91	174				221	$matched = 1;
92	174				321	my $match = $1;
93	174	50			409	die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n" unless (length($match) > 0);
94	174				295	my $token_pos = pos($text) - length($match);
95	174	100			376	if (my $fun = $_->[2]){
96	10				25	$match = $fun->($match);
97						}
98	174	100			347	if (defined $match){
99	168				341	push @res, [$_->[0],
100						$match,
101						$pos_and_line_number->($token_pos),
102						];
103						}
104	174				429	last;
105						}
106						}
107	305	100			1221	unless ($matched){
108	131				149	my $next_token;
109						my $next_token_match;
110	0				0	my $match;
111
112	131				185	my $min = length($text);
113	131				173	my $pos = pos($text);
114
115						# find the token that matches first
116	131				135	my $token_pos;
117	131				215	foreach(@$tokens){
118	913				1210	my $re = $_->[1];
119	913	100			26039	if ($text =~ m#\G((?s:.)*?)($re)#gc){
120	289	100			1019	if ($+[1] < $min){
121	173				359	$min = $+[1];
122	173				254	$next_token = $_;
123	173				393	$next_token_match = $2;
124	173				325	$match = $1;
125	173				335	$token_pos = pos($text) - length($match);
126						}
127						}
128	913				3467	pos($text) = $pos;
129						}
130	131	100			300	if (defined $match){
131	100				264	push @res, ['UNMATCHED',
132						$match,
133						$pos_and_line_number->($token_pos - length($match)),
134						];
135	100	50			272	die "Each token has to require at least one character; Rule $_->[0] matched Zero!\n"
136						unless (length($next_token_match) > 0);
137	100	100			304	if (my $fun = $next_token->[2]){
138	1				4	$match = $fun->($match);
139						}
140	100	100			850	push @res, [$next_token->[0],
141						$next_token_match,
142						$pos_and_line_number->($min),
143						] if defined $match;
144	100				557	pos($text) = $min + length($next_token_match);
145						} else {
146	31				93	push @res, ['UNMATCHED',
147						substr($text, $pos),
148						$pos_and_line_number->($pos)
149						];
150	31				163	pos($text) = length($text);
151						}
152						}
153						}
154	51				435	return @res;
155						}
156						-1;
157
158						# vim: sw=4 ts=4 expandtab