File Coverage

blib/lib/Asm/Preproc/Lexer.pm

Criterion	Covered	Total	%
statement	83	88	94.3
branch	24	28	85.7
condition			n/a
subroutine	17	17	100.0
pod	4	4	100.0
total	128	137	93.4

line	stmt	bran	sub	pod	time	code
1						# $Id: Lexer.pm,v 1.7 2015/04/18 18:02:05 Paulo Exp $
2
3						package Asm::Preproc::Lexer;
4
5						#------------------------------------------------------------------------------
6
7						=head1 NAME
8
9						Asm::Preproc::Lexer - Iterator to split input in tokens
10
11						=cut
12
13						#------------------------------------------------------------------------------
14
15	1		1		932	use strict;
	1				1
	1				29
16	1		1		3	use warnings;
	1				1
	1				16
17
18	1		1		3	use Carp;
	1				1
	1				47
19	1		1		559	use Text::Template 'fill_in_string';
	1				2525
	1				49
20	1		1		7	use Asm::Preproc::Line;
	1				1
	1				12
21	1		1		3	use Asm::Preproc::Token;
	1				1
	1				36
22
23						our $VERSION = '1.01';
24
25						#------------------------------------------------------------------------------
26
27						=head1 SYNOPSIS
28
29						use Asm::Preproc::Lexer;
30
31						my @tokens = (
32						BLANKS => qr/\s+/, sub {()},
33						COMMENT => [qr/\/\/, qr/\\//],
34						undef,
35						QSTR => [qr/'/], sub { my($type, $value) = @_;
36						[$type,
37						substr($value, 1, length($value)-2)] },
38						QQSTR => [qr/"/, qr/"/],
39						NUM => qr/\d+/,
40						ID => qr/[a-z]+/, sub { my($type, $value) = @_;
41						[$type, $value] },
42						SYM => qr/(.)/, sub { [$1, $1] },
43						);
44
45						my $lex = Asm::Preproc::Lexer->new;
46						$lex->make_lexer(@tokens);
47
48						my $lex2 = $lex->clone;
49
50						$lex->from(sub {}, @lines); # read Asm::Preproc::Line from iterator
51						my $token = $lex->next; # isa Asm::Preproc::Token
52						my $token = $lex->();
53
54						=head1 DESCRIPTION
55
56						This module implements a sub-class of
57						L
58						to read text from iterators and split the text in tokens,
59						according to the specification given to
60						C constructor.
61
62						The objects are L compatible,
63						i.e. they can be used as an argument to C.
64
65						The tokenizer reads L objects and
66						splits them in L objects on each
67						C call. C returns C on end of input.
68
69						=head1 FUNCTIONS
70
71						=head2 new
72
73						Creates a new tokenizer object, subclass of
74						L.
75
76						C must be called to create the tokenizer code before the
77						iterator can be used.
78
79						=head2 make_lexer
80
81						Creates a new tokenizer object for the given token specification.
82						Each token is specified by the following elements:
83
84						=over 4
85
86						=item type
87
88						String to identify the token type, unused if the token is discarded (see
89						C and C above).
90
91						=item regexp
92
93						One of:
94
95						=over 4
96
97						=item 1
98
99						A single regular expression to match the token at the current input position.
100
101						=item 2
102
103						A list of one regular expression, to match delimited tokens that use the
104						same delimiter for the start and the end.
105						The token can span multiple lines.
106						See see C above for an example for multi-line single-quoted strings.
107
108						=item 3
109
110						A list of two regular expressions, to match the start
111						of the token at the current input position, and the end of the token.
112						The token can span multiple lines.
113						See see C above for an example for multi-line comments.
114
115						=back
116
117						The regular expression is matched where the previous match finished,
118						and each sub-expression cannot span multiple lines.
119						Parentheses may be used to capture sub-expressions in C<$1>, C<$2>, etc.
120
121						It is considered an error, and the tokeninzer dies with an error message
122						when reading input, if some input cannot be recognized by any of the
123						given C espressions. Therefore the C token above contains the
124						catch-all expression C.
125
126						=item transform (optional)
127
128						The optional code reference is a transform subroutine. It receives
129						the C and C of the recognized token, and returns one of:
130
131						=over 4
132
133						=item 1
134
135						An array ref with two elements C<[$type, $value]>,
136						the new C and C to be
137						returned in the L object.
138
139						=item 2
140
141						An empty array C<()> to signal that this token shall be dicarded.
142
143						=back
144
145						As an optimization, the transform subroutine code reference may be
146						set to C, to signal that the token will be dicarded
147						and there is no use in accumulating it while matching.
148						This is usefull to discard comments upfront, instead of
149						collecting the whole comment, and then pass it to the transform subroutine
150						just to be discarded afterwards.
151						See see C above for an example of usage.
152
153						=back
154
155						=head2 clone
156
157						Creates a copy of this tokenizer object without compiling a new
158						lexing subroutine. The copied object has all pending input cleared.
159
160						=cut
161
162						#------------------------------------------------------------------------------
163	1		1		16	use base 'Iterator::Simple::Lookahead';
	1				1
	1				419
164						use Class::XSAccessor { # additional attributes
165	1				3	accessors => [
166						'_lexer', # lexer iterator
167						'_input', # input iterator
168						'_line', # current line being processed
169						'_text', # text being parsed
170						],
171	1		1		2934	};
	1				1
172
173						sub new {
174	7		7	1	2042	my($class) = @_;
175	7		1		26	return $class->_new( sub { return } ); # dummy lexer
	1				2
176						}
177
178						sub clone {
179	1		1	1	1	my($self) = @_;
180	1				5	return ref($self)->_new( $self->_lexer );
181						}
182
183						# used by new and clone
184						sub _new {
185	8		8		8	my($class, $lexer) = @_;
186
187	8				27	my $self = $class->SUPER::new; # init iterator
188	8				71	$self->_lexer( $lexer );
189	8				19	$self->_input( Iterator::Simple::Lookahead->new );
190	8				41	$self->_line( undef );
191	8				14	$self->_text( "" );
192
193	8				25	return $self;
194						};
195
196						#------------------------------------------------------------------------------
197						# compile the lexing subroutine
198						sub make_lexer {
199	6		6	1	22	my($self, @tokens) = @_;
200	6	100			172	@tokens or croak "tokens expected";
201
202						# closure for each token attributes, indexed by token sequence nr
203	5				7	my @type; # token type
204						my @start_re; # match start of token
205	0				0	my @end_re; # match end of token
206	0				0	my @transform; # transform subroutine
207	0				0	my @discard; # true to discard multi-line token
208	0				0	my @comment; # comment to show all options of each token branch
209
210						# parse the @tokens list
211	5				11	for (my $id = 0; @tokens; $id++) {
212						# read type
213	15				14	$type[$id] = shift @tokens;
214
215						# read regexp
216	15	100			95	my $re = shift @tokens or croak "regexp expected";
217
218	14	100			23	if (ref $re eq 'Regexp') {
		50
219	11				11	$start_re[$id] = $re;
220						}
221						elsif (ref $re eq 'ARRAY') {
222	3	100			6	@$re == 1 and push @$re, $re->[0];
223	3	50			4	@$re == 2 or croak "invalid regexp list";
224	3				5	($start_re[$id], $end_re[$id]) = @$re;
225						}
226						else {
227	0				0	croak "invalid regexp";
228						}
229
230						# read transform, define discard
231	14	100			18	if (@tokens) {
232	13	100			27	if (! defined($tokens[0])) {
		100
233	1				1	$discard[$id] = 1;
234	1				2	shift @tokens;
235						}
236						elsif (ref($tokens[0]) eq 'CODE') {
237	8				8	$transform[$id] = shift @tokens;
238						}
239						}
240
241						# comment
242	14	100			17	$comment[$id] = join(' ', map {defined($_) ? $_ : ''}
	84				106
243						$id,
244						$type[$id],
245						$start_re[$id],
246						$end_re[$id],
247						$transform[$id],
248						$discard[$id]);
249	14				35	$comment[$id] =~ s/\n/\\n/g;
250
251						}
252
253						# LEXER code
254	4				16	my $template_data = {
255						end_re => \@end_re,
256						transform => \@transform,
257						discard => \@discard,
258						comment => \@comment,
259						};
260	4				10	my @template_args = (
261						DELIMITERS => [ '<%', '%>' ],
262						HASH => $template_data,
263						);
264
265	4				14	my $code = fill_in_string(<<'END_CODE', @template_args);
266
267						sub {
268						my($self) = @_;
269
270						for ($self->{_text}) {
271						LINE:
272						while (1) { # read lines
273						while ((pos()\|\|0) >= length()) { # last line consumed
274						$self->_read_line or return undef;
275						}
276
277						TOKEN:
278						while (1) { # read tokens
279						my $token_line = $self->_line; # start of token line
280						my $pos0 = pos()\|\|0; # position before match
281
282						# need to read new line
283						if (/ \G \z /gcx) {
284						next LINE;
285						}
286						END_CODE
287
288	4				1911	for my $id (0 .. $#type) {
289	14				7419	$template_data->{id} = $id;
290	14				23	$template_data->{LINE_BLOCK}
291						= fill_in_string(<<'END_CODE', @template_args);
292
293						BLOCK:
294						while (1) { # read multi-line block
295						<% $discard[$id] ? '' : '$pos0 = pos()\|\|0;' %>
296
297						# need to read new line
298						if (/ \G \z /gcx) {
299						$self->_read_line
300						or $token_line->error(
301						"unbalanced token at: ".$value);
302						}
303						# end
304						elsif (/ \G (?s: .*?) $end_re[<% $id %>] /gcx) {
305						<% $discard[$id] ? '' :
306						'$value .= $self->_capture($pos0);' %>
307						last BLOCK; # collected whole token
308						}
309						# consume all
310						else {
311						pos() = length();
312						<% $discard[$id] ? '' :
313						'$value .= $self->_capture($pos0);' %>
314						}
315						}
316						END_CODE
317
318	14				10240	$template_data->{TRANSFORM}
319						= fill_in_string(<<'END_CODE', @template_args);
320
321						# call transform routine
322						my $ret = $transform[<% $id %>]->($type, $value);
323						next unless $ret; # discard token
324						($type, $value) = @$ret;
325						END_CODE
326	14				4607	$code .= fill_in_string(<<'END_CODE', @template_args);
327
328						# <% $comment[$id] %>
329						elsif (/ \G $start_re[<% $id %>] /gcx) {
330						my($type, $value) = <%
331						'' %> ($type[<% $id %>], $self->_capture($pos0));
332						<% $end_re[$id] ? $LINE_BLOCK : '' %>
333						<% $transform[$id] ? $TRANSFORM : '' %>
334
335						<% $discard[$id] ? 'next;' : '' %>
336
337						return Asm::Preproc::Token->new(
338						$type, $value, $token_line);
339						}
340						END_CODE
341						}
342
343	4				2801	$code .= fill_in_string(<<'END_CODE', @template_args);
344						# no token recognized, consume rest of line and die
345						else {
346						pos() = length();
347						$token_line->error("no token recognized at: ".
348						substr($_, $pos0));
349						}
350						}
351						}
352						}
353						};
354						END_CODE
355
356						#warn $code;
357	4				2632	my $lexer = eval $code;
358	4	50			12	$@ and croak "$code\n$@";
359
360	4				30	$self->_lexer( $lexer );
361						}
362
363						#------------------------------------------------------------------------------
364						# get the next line from _input, save in _line, _rtext
365						sub _read_line {
366	33		33		24	my($self) = @_;
367
368						# get one line
369	33				70	my $line = $self->_input->next;
370	33				495	my $text = ""; # default: no text to parse
371
372	33	100			50	if (defined $line) {
373						# convert to Asm::Preproc::Line if needed
374	23	100			77	ref($line) or $line = Asm::Preproc::Line->new($line);
375	23				32	$text = $line->text;
376	23	50			30	$text = "" unless defined $text; # make sure we have something
377						}
378
379	33				45	$self->_line( $line ); # line to return at each token
380	33				48	$self->{_text} = $text; # text to parse - need to reset pos()
381
382	33				619	return $line;
383						}
384						#------------------------------------------------------------------------------
385						# capture the last match
386						sub _capture {
387	73		73		68	my($self, $pos0) = @_;
388	73				1180	return substr($_, $pos0, pos() - $pos0);
389						}
390						#------------------------------------------------------------------------------
391
392						=head2 from
393
394						Inserts the given input at the head of the input queue to the tokenizer.
395						The input is either a list of L
396						objects, or an interator function that returns a
397						L object on each call.
398
399						The input list and interator can also return plain scalar strings, that
400						are converted to L on the fly, but
401						the information on input file location for error messages will not be available.
402
403						The new inserted input is processed before continuing with whatever was
404						already in the queue.
405
406						=cut
407
408						#------------------------------------------------------------------------------
409						sub from {
410	13		13	1	2610	my($self, @input) = @_;
411	13				40	$self->_input->unget(@input);
412	13		52		119	$self->unget( sub { $self->_lexer->($self) } );
	52				3407
413						}
414						#------------------------------------------------------------------------------
415
416						=head2 peek
417
418						Peek the Nth element from the stream, inherited from
419						L.
420
421						=head2 next
422
423						Retrieve the next token from the input strean as a
424						L object, inherited from
425						L.
426
427						=head1 AUTHOR, BUGS, SUPPORT, LICENSE, COPYRIGHT
428
429						See L.
430
431						=cut
432
433						#------------------------------------------------------------------------------
434
435						1;