File Coverage

blib/lib/Asm/Preproc/Lexer.pm

Criterion	Covered	Total	%
statement	84	85	98.8
branch	24	28	85.7
condition			n/a
subroutine	16	16	100.0
pod	4	4	100.0
total	128	133	96.2

line	stmt	bran	sub	pod	time	code
1						# $Id: Lexer.pm,v 1.6 2013/07/26 01:57:26 Paulo Exp $
2
3						package Asm::Preproc::Lexer;
4
5						#------------------------------------------------------------------------------
6
7						=head1 NAME
8
9						Asm::Preproc::Lexer - Iterator to split input in tokens
10
11						=cut
12
13						#------------------------------------------------------------------------------
14
15	1		1		1753	use strict;
	1				3
	1				41
16	1		1		5	use warnings;
	1				2
	1				34
17
18	1		1		6	use Carp;
	1				2
	1				60
19	1		1		673	use Text::Template 'fill_in_string';
	1				3525
	1				66
20	1		1		8	use Asm::Preproc::Line;
	1				2
	1				6
21	1		1		27	use Asm::Preproc::Token;
	1				2
	1				7
22
23						our $VERSION = '1.03';
24
25						#------------------------------------------------------------------------------
26
27						=head1 SYNOPSIS
28
29						use Asm::Preproc::Lexer;
30
31						my @tokens = (
32						BLANKS => qr/\s+/, sub {()},
33						COMMENT => [qr/\/\/, qr/\\//],
34						undef,
35						QSTR => [qr/'/], sub { my($type, $value) = @_;
36						[$type,
37						substr($value, 1, length($value)-2)] },
38						QQSTR => [qr/"/, qr/"/],
39						NUM => qr/\d+/,
40						ID => qr/[a-z]+/, sub { my($type, $value) = @_;
41						[$type, $value] },
42						SYM => qr/(.)/, sub { [$1, $1] },
43						);
44
45						my $lex = Asm::Preproc::Lexer->new;
46						$lex->make_lexer(@tokens);
47
48						my $lex2 = $lex->clone;
49
50						$lex->from(sub {}, @lines); # read Asm::Preproc::Line from iterator
51						my $token = $lex->next; # isa Asm::Preproc::Token
52						my $token = $lex->();
53
54						=head1 DESCRIPTION
55
56						This module implements a sub-class of
57						L
58						to read text from iterators and split the text in tokens,
59						according to the specification given to
60						C constructor.
61
62						The objects are L compatible,
63						i.e. they can be used as an argument to C.
64
65						The tokenizer reads L objects and
66						splits them in L objects on each
67						C call. C returns C on end of input.
68
69						=head1 FUNCTIONS
70
71						=head2 new
72
73						Creates a new tokenizer object, subclass of
74						L.
75
76						C must be called to create the tokenizer code before the
77						iterator can be used.
78
79						=head2 make_lexer
80
81						Creates a new tokenizer object for the given token specification.
82						Each token is specified by the following elements:
83
84						=over 4
85
86						=item type
87
88						String to identify the token type, unused if the token is discarded (see
89						C and C above).
90
91						=item regexp
92
93						One of:
94
95						=over 4
96
97						=item 1
98
99						A single regular expression to match the token at the current input position.
100
101						=item 2
102
103						A list of one regular expression, to match delimited tokens that use the
104						same delimiter for the start and the end.
105						The token can span multiple lines.
106						See see C above for an example for multi-line single-quoted strings.
107
108						=item 3
109
110						A list of two regular expressions, to match the start
111						of the token at the current input position, and the end of the token.
112						The token can span multiple lines.
113						See see C above for an example for multi-line comments.
114
115						=back
116
117						The regular expression is matched where the previous match finished,
118						and each sub-expression cannot span multiple lines.
119						Parentheses may be used to capture sub-expressions in C<$1>, C<$2>, etc.
120
121						It is considered an error, and the tokeninzer dies with an error message
122						when reading input, if some input cannot be recognized by any of the
123						given C espressions. Therefore the C token above contains the
124						catch-all expression C.
125
126						=item transform (optional)
127
128						The optional code reference is a transform subroutine. It receives
129						the C and C of the recognized token, and returns one of:
130
131						=over 4
132
133						=item 1
134
135						An array ref with two elements C<[$type, $value]>,
136						the new C and C to be
137						returned in the L object.
138
139						=item 2
140
141						An empty array C<()> to signal that this token shall be dicarded.
142
143						=back
144
145						As an optimization, the transform subroutine code reference may be
146						set to C, to signal that the token will be dicarded
147						and there is no use in accumulating it while matching.
148						This is usefull to discard comments upfront, instead of
149						collecting the whole comment, and then pass it to the transform subroutine
150						just to be discarded afterwards.
151						See see C above for an example of usage.
152
153						=back
154
155						=head2 clone
156
157						Creates a copy of this tokenizer object without compiling a new
158						lexing subroutine. The copied object has all pending input cleared.
159
160						=cut
161
162						#------------------------------------------------------------------------------
163	1		1		73	use base 'Iterator::Simple::Lookahead', 'Class::Accessor';
	1				2
	1				821
164						__PACKAGE__->mk_accessors(
165						'_lexer', # lexer iterator
166						'_input', # input iterator
167						'_line', # current line being processed
168						'_text', # text being parsed
169						);
170
171						sub new {
172	7		7	1	3570	my($class) = @_;
173	7		1		64	return $class->_new( sub { return } ); # dummy lexer
	1				10
174						}
175
176						sub clone {
177	1		1	1	4	my($self) = @_;
178	1				4	return ref($self)->_new( $self->_lexer );
179						}
180
181						# used by new and clone
182						sub _new {
183	8		8		31	my($class, $lexer) = @_;
184
185	8				42	my $self = $class->SUPER::new; # init iterator
186	8				102	$self->_lexer( $lexer );
187	8				165	$self->_input( Iterator::Simple::Lookahead->new );
188	8				139	$self->_line( undef );
189	8				83	$self->_text( "" );
190
191	8				168	return $self;
192						};
193
194						#------------------------------------------------------------------------------
195						# compile the lexing subroutine
196						sub make_lexer {
197	6		6	1	24	my($self, @tokens) = @_;
198	6	100			203	@tokens or croak "tokens expected";
199
200						# closure for each token attributes, indexed by token sequence nr
201	5				28	my @type; # token type
202						my @start_re; # match start of token
203	5				0	my @end_re; # match end of token
204	5				0	my @transform; # transform subroutine
205	5				0	my @discard; # true to discard multi-line token
206	5				0	my @comment; # comment to show all options of each token branch
207
208						# parse the @tokens list
209	5				16	for (my $id = 0; @tokens; $id++) {
210						# read type
211	15				28	$type[$id] = shift @tokens;
212
213						# read regexp
214	15	100			130	my $re = shift @tokens or croak "regexp expected";
215
216	14	100			39	if (ref $re eq 'Regexp') {
		50
217	11				17	$start_re[$id] = $re;
218						}
219						elsif (ref $re eq 'ARRAY') {
220	3	100			10	@$re == 1 and push @$re, $re->[0];
221	3	50			6	@$re == 2 or croak "invalid regexp list";
222	3				8	($start_re[$id], $end_re[$id]) = @$re;
223						}
224						else {
225	0				0	croak "invalid regexp";
226						}
227
228						# read transform, define discard
229	14	100			28	if (@tokens) {
230	13	100			35	if (! defined($tokens[0])) {
		100
231	1				2	$discard[$id] = 1;
232	1				2	shift @tokens;
233						}
234						elsif (ref($tokens[0]) eq 'CODE') {
235	8				15	$transform[$id] = shift @tokens;
236						}
237						}
238
239						# comment
240	14	100			31	$comment[$id] = join(' ', map {defined($_) ? $_ : ''}
	84				182
241						$id,
242						$type[$id],
243						$start_re[$id],
244						$end_re[$id],
245						$transform[$id],
246						$discard[$id]);
247	14				53	$comment[$id] =~ s/\n/\\n/g;
248
249						}
250
251						# LEXER code
252	4				21	my $template_data = {
253						end_re => \@end_re,
254						transform => \@transform,
255						discard => \@discard,
256						comment => \@comment,
257						};
258	4				12	my @template_args = (
259						DELIMITERS => [ '<%', '%>' ],
260						HASH => $template_data,
261						);
262
263	4				17	my $code = fill_in_string(<<'END_CODE', @template_args);
264
265						sub {
266						my($self) = @_;
267
268						for ($self->{_text}) {
269						LINE:
270						while (1) { # read lines
271						while ((pos()\|\|0) >= length()) { # last line consumed
272						$self->_read_line or return undef;
273						}
274
275						TOKEN:
276						while (1) { # read tokens
277						my $token_line = $self->_line; # start of token line
278						my $pos0 = pos()\|\|0; # position before match
279
280						# need to read new line
281						if (/ \G \z /gcx) {
282						next LINE;
283						}
284						END_CODE
285
286	4				3268	for my $id (0 .. $#type) {
287	14				13745	$template_data->{id} = $id;
288						$template_data->{LINE_BLOCK}
289	14				37	= fill_in_string(<<'END_CODE', @template_args);
290
291						BLOCK:
292						while (1) { # read multi-line block
293						<% $discard[$id] ? '' : '$pos0 = pos()\|\|0;' %>
294
295						# need to read new line
296						if (/ \G \z /gcx) {
297						$self->_read_line
298						or $token_line->error(
299						"unbalanced token at: ".$value);
300						}
301						# end
302						elsif (/ \G (?s: .*?) $end_re[<% $id %>] /gcx) {
303						<% $discard[$id] ? '' :
304						'$value .= $self->_capture($pos0);' %>
305						last BLOCK; # collected whole token
306						}
307						# consume all
308						else {
309						pos() = length();
310						<% $discard[$id] ? '' :
311						'$value .= $self->_capture($pos0);' %>
312						}
313						}
314						END_CODE
315
316						$template_data->{TRANSFORM}
317	14				17707	= fill_in_string(<<'END_CODE', @template_args);
318
319						# call transform routine
320						my $ret = $transform[<% $id %>]->($type, $value);
321						next unless $ret; # discard token
322						($type, $value) = @$ret;
323						END_CODE
324	14				9014	$code .= fill_in_string(<<'END_CODE', @template_args);
325
326						# <% $comment[$id] %>
327						elsif (/ \G $start_re[<% $id %>] /gcx) {
328						my($type, $value) = <%
329						'' %> ($type[<% $id %>], $self->_capture($pos0));
330						<% $end_re[$id] ? $LINE_BLOCK : '' %>
331						<% $transform[$id] ? $TRANSFORM : '' %>
332
333						<% $discard[$id] ? 'next;' : '' %>
334
335						return Asm::Preproc::Token->new(
336						$type, $value, $token_line);
337						}
338						END_CODE
339						}
340
341	4				5352	$code .= fill_in_string(<<'END_CODE', @template_args);
342						# no token recognized, consume rest of line and die
343						else {
344						pos() = length();
345						$token_line->error("no token recognized at: ".
346						substr($_, $pos0));
347						}
348						}
349						}
350						}
351						};
352						END_CODE
353
354						#warn $code;
355	4				4406	my $lexer = eval $code;
356	4	50			17	$@ and croak "$code\n$@";
357
358	4				28	$self->_lexer( $lexer );
359						}
360
361						#------------------------------------------------------------------------------
362						# get the next line from _input, save in _line, _rtext
363						sub _read_line {
364	33		33		61	my($self) = @_;
365
366						# get one line
367	33				74	my $line = $self->_input->next;
368	33				2869	my $text = ""; # default: no text to parse
369
370	33	100			98	if (defined $line) {
371						# convert to Asm::Preproc::Line if needed
372	23	100			91	ref($line) or $line = Asm::Preproc::Line->new($line);
373	23				56	$text = $line->text;
374	23	50			269	$text = "" unless defined $text; # make sure we have something
375						}
376
377	33				87	$self->_line( $line ); # line to return at each token
378	33				379	$self->{_text} = $text; # text to parse - need to reset pos()
379
380	33				821	return $line;
381						}
382						#------------------------------------------------------------------------------
383						# capture the last match
384						sub _capture {
385	73		73		140	my($self, $pos0) = @_;
386	73				1433	return substr($_, $pos0, pos() - $pos0);
387						}
388						#------------------------------------------------------------------------------
389
390						=head2 from
391
392						Inserts the given input at the head of the input queue to the tokenizer.
393						The input is either a list of L
394						objects, or an interator function that returns a
395						L object on each call.
396
397						The input list and interator can also return plain scalar strings, that
398						are converted to L on the fly, but
399						the information on input file location for error messages will not be available.
400
401						The new inserted input is processed before continuing with whatever was
402						already in the queue.
403
404						=cut
405
406						#------------------------------------------------------------------------------
407						sub from {
408	13		13	1	3935	my($self, @input) = @_;
409	13				38	$self->_input->unget(@input);
410	13		52		679	$self->unget( sub { $self->_lexer->($self) } );
	52				5500
411						}
412						#------------------------------------------------------------------------------
413
414						=head2 peek
415
416						Peek the Nth element from the stream, inherited from
417						L.
418
419						=head2 next
420
421						Retrieve the next token from the input strean as a
422						L object, inherited from
423						L.
424
425						=head1 AUTHOR, BUGS, SUPPORT, LICENSE, COPYRIGHT
426
427						See L.
428
429						=cut
430
431						#------------------------------------------------------------------------------
432
433						1;