File Coverage

blib/lib/PPIx/Regexp/Lexer.pm

Criterion	Covered	Total	%
statement	250	264	94.7
branch	64	80	80.0
condition	26	35	74.2
subroutine	44	45	97.7
pod	5	5	100.0
total	389	429	90.6

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Lexer - Assemble tokenizer output.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Lexer;
8							use PPIx::Regexp::Dumper;
9							my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx');
10							my $dmp = PPIx::Regexp::Dumper->new( $lex );
11							$dmp->print();
12
13							=head1 INHERITANCE
14
15							C<PPIx::Regexp::Lexer> is a
16							L<PPIx::Regexp::Support|PPIx::Regexp::Support>.
17
18							C<PPIx::Regexp::Lexer> has no descendants.
19
20							=head1 DESCRIPTION
21
22							This class takes the token stream generated by
23							L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> and generates the
24							parse tree.
25
26							=head1 METHODS
27
28							This class provides the following public methods. Methods not documented
29							here are private, and unsupported in the sense that the author reserves
30							the right to change or remove them without notice.
31
32							=cut
33
34							package PPIx::Regexp::Lexer;
35
36	9			9		63	use strict;
	9					18
	9					264
37	9			9		45	use warnings;
	9					20
	9					283
38
39	9			9		49	use base qw{ PPIx::Regexp::Support };
	9					17
	9					3973
40
41	9			9		64	use Carp qw{ confess };
	9					17
	9					442
42	9					780	use PPIx::Regexp::Constant qw{
43							ARRAY_REF
44							TOKEN_LITERAL
45							TOKEN_UNKNOWN
46							@CARP_NOT
47	9			9		59	};
	9					19
48	9			9		3933	use PPIx::Regexp::Node::Range ();
	9					25
	9					176
49	9			9		3425	use PPIx::Regexp::Node::Unknown ();
	9					31
	9					195
50	9			9		4150	use PPIx::Regexp::Structure ();
	9					29
	9					192
51	9			9		3933	use PPIx::Regexp::Structure::Assertion ();
	9					23
	9					182
52	9			9		3732	use PPIx::Regexp::Structure::Atomic_Script_Run ();
	9					28
	9					184
53	9			9		3765	use PPIx::Regexp::Structure::BranchReset ();
	9					35
	9					196
54	9			9		3787	use PPIx::Regexp::Structure::Code ();
	9					22
	9					209
55	9			9		3718	use PPIx::Regexp::Structure::Capture ();
	9					137
	9					191
56	9			9		3820	use PPIx::Regexp::Structure::CharClass ();
	9					26
	9					167
57	9			9		3853	use PPIx::Regexp::Structure::Subexpression ();
	9					27
	9					189
58	9			9		3823	use PPIx::Regexp::Structure::Main ();
	9					20
	9					172
59	9			9		3728	use PPIx::Regexp::Structure::Modifier ();
	9					29
	9					176
60	9			9		3802	use PPIx::Regexp::Structure::NamedCapture ();
	9					23
	9					172
61	9			9		3823	use PPIx::Regexp::Structure::Quantifier ();
	9					27
	9					181
62	9			9		3817	use PPIx::Regexp::Structure::Regexp ();
	9					27
	9					168
63	9			9		3718	use PPIx::Regexp::Structure::RegexSet ();
	9					23
	9					175
64	9			9		3776	use PPIx::Regexp::Structure::Replacement ();
	9					22
	9					183
65	9			9		3818	use PPIx::Regexp::Structure::Script_Run ();
	9					24
	9					173
66	9			9		3787	use PPIx::Regexp::Structure::Switch ();
	9					24
	9					179
67	9			9		3636	use PPIx::Regexp::Structure::Unknown ();
	9					26
	9					170
68	9			9		3786	use PPIx::Regexp::Token::Unmatched ();
	9					22
	9					187
69	9			9		5310	use PPIx::Regexp::Tokenizer ();
	9					44
	9					325
70	9			9		67	use PPIx::Regexp::Util qw{ __choose_tokenizer_class __instance };
	9					23
	9					19324
71
72							our $VERSION = '0.088';
73
74							=head2 new
75
76							This method instantiates the lexer. It takes as its argument either a
77							L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> or the text to be
78							parsed. In the latter case the tokenizer is instantiated from the text.
79
80							Any optional name/value pairs after the first argument are passed to the
81							tokenizer, which interprets them or not as the case may be.
82
83							=cut
84
85							{
86
87							my $errstr;
88
89							sub new {
90	332			332	1	1269	my ( $class, $tokenizer, %args ) = @_;
91	332	50				1094	ref $class and $class = ref $class;
92
93	332	50				1196	unless ( __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) ) {
94							my $tokenizer_class = __choose_tokenizer_class(
95							$tokenizer, \%args )
96	0	0				0	or do {
97	0					0	$errstr = 'Data not supported';
98	0					0	return;
99							};
100							$tokenizer = $tokenizer_class->new( $tokenizer, %args )
101	0	0				0	or do {
102	0					0	$errstr = $tokenizer_class->errstr();
103	0					0	return;
104							};
105							}
106
107							my $self = {
108							deferred => [], # Deferred tokens
109							failures => 0,
110							strict => $args{strict},
111	332					2010	tokenizer => $tokenizer,
112							};
113
114	332					868	bless $self, $class;
115	332					931	return $self;
116							}
117
118							sub errstr {
119	0			0	1	0	return $errstr;
120							}
121
122							}
123
124							=head2 errstr
125
126							This method returns the error string from the last attempt to
127							instantiate a C<PPIx::Regexp::Lexer>. If the last attempt succeeded, the
128							error will be C<undef>.
129
130							=cut
131
132							# Defined above
133
134							=head2 failures
135
136							print $lexer->failures(), " parse failures\n";
137
138							This method returns the number of parse failures encountered. A
139							parse failure is either a tokenization failure (see
140							L<< PPIx::Regexp::Tokenizer->failures()|PPIx::Regexp::Tokenizer/failures >>)
141							or a structural error.
142
143							=cut
144
145							sub failures {
146	332			332	1	844	my ( $self ) = @_;
147	332					983	return $self->{failures};
148							}
149
150							=head2 lex
151
152							This method lexes the tokens in the text, and returns the lexed list of
153							elements.
154
155							=cut
156
157							sub lex {
158	332			332	1	813	my ( $self ) = @_;
159
160	332					722	my @content;
161	332					882	$self->{failures} = 0;
162
163							# Accept everything up to the first delimiter.
164	332					686	my $kind; # Initial PPIx::Regexp::Token::Structure
165							{
166	332	100				656	my $token = $self->_get_token()
	668					2009
167							or return $self->_finalize( @content );
168	660	100				2969	$token->isa( 'PPIx::Regexp::Token::Delimiter' ) or do {
169	336	100	100			2333	not $kind
170							and $token->isa( 'PPIx::Regexp::Token::Structure' )
171							and $kind = $token;
172	336					910	push @content, $token;
173	336					583	redo;
174							};
175	324					1533	$self->_unget_token( $token );
176							}
177
178							my ( $part_0_class, $part_1_class ) =
179	324					1488	$self->{tokenizer}->__part_classes();
180
181							# Accept the first delimited structure.
182	324					1341	push @content, ( my $part_0 = $self->_get_delimited(
183							$part_0_class ) );
184
185							# If we are a substitution ...
186	324	100				1119	if ( defined $part_1_class ) {
187
188							# Accept any insignificant stuff.
189	24					104	while ( my $token = $self->_get_token() ) {
190	28	100				93	if ( $token->significant() ) {
191	24					109	$self->_unget_token( $token );
192	24					65	last;
193							} else {
194	4					15	push @content, $token;
195							}
196							}
197
198							# Figure out if we should expect an opening bracket.
199	24		100			150	my $expect_open_bracket = $self->close_bracket(
200							$part_0->start( 0 ) ) || 0;
201
202							# Accept the next delimited structure.
203	24					87	push @content, $self->_get_delimited(
204							$part_1_class,
205							$expect_open_bracket,
206							);
207							}
208
209							# Accept the modifiers (we hope!) plus any trailing white space.
210	324					1067	while ( my $token = $self->_get_token() ) {
211	326					891	push @content, $token;
212							}
213
214							# Let all the elements finalize themselves, recording any additional
215							# errors as they do so.
216	324					1942	$self->_finalize( @content );
217
218							# If we found a regular expression (and we should have done so) ...
219	324	50	33			2452	if ( $part_0 && $part_0->can( 'max_capture_number' ) ) {
220							# TODO the above line is really ugly. I'm wondering about
221							# string implementations like:
222							# * return a $part_0_class of undef (but that complicates the
223							# lexing of the structure itself);
224							# * hang this logic on the tokenizer somehow (where it seems out
225							# of place)
226							# * hang this logic on PPIx::Regexp::Structure::Regexp and
227							# ::Replacement.
228							# I also need to figure out how to make \n backreferences come
229							# out as literals. Maybe that is a job best done by the
230							# tokenizer.
231
232							# Retrieve the maximum capture group.
233	324					1142	my $max_capture = $part_0->max_capture_number();
234
235							# Hashify the known capture names
236							my $capture_name = {
237	324					1216	map { $_ => 1 } $part_0->capture_names(),
	20					101
238							};
239
240							# For all the backreferences found
241	324	100				722	foreach my $elem ( @{ $part_0->find(
	324					971
242							'PPIx::Regexp::Token::Backreference' ) || [] } ) {
243							# Rebless them as needed, recording any errors found.
244							$self->{failures} +=
245	25					109	$elem->__PPIX_LEXER__rebless(
246							capture_name => $capture_name,
247							max_capture => $max_capture,
248							);
249							}
250							}
251
252	324					1445	return @content;
253
254							}
255
256							=head2 strict
257
258							This method returns true or false based on the value of the C<'strict'>
259							argument to C<new()>.
260
261							=cut
262
263							sub strict {
264	13			13	1	28	my ( $self ) = @_;
265	13					63	return $self->{strict};
266							}
267
268							# Finalize the content array, updating the parse failures count as we
269							# go.
270							sub _finalize {
271	332			332		1071	my ( $self, @content ) = @_;
272	332					749	foreach my $elem ( @content ) {
273	1014					3778	$self->{failures} += $elem->__PPIX_LEXER__finalize( $self );
274							}
275	332	100				1048	defined wantarray and return @content;
276	324					621	return;
277							}
278
279							{
280
281							my %bracket = (
282							'{' => '}',
283							'(' => ')',
284							'[' => ']',
285							'(?[' => '])',
286							## '<' => '>',
287							);
288
289							my %unclosed = (
290							'{' => '_recover_curly',
291							);
292
293							sub _get_delimited {
294	348			348		977	my ( $self, $class, $expect_open_bracket ) = @_;
295	348	100				1166	defined $expect_open_bracket or $expect_open_bracket = 1;
296
297	348					653	my @rslt;
298	348					1001	$self->{_rslt} = \@rslt;
299
300	348	100				920	if ( $expect_open_bracket ) {
301	329	50				814	if ( my $token = $self->_get_token() ) {
302	329					889	push @rslt, [];
303	329	50				1187	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
304	329					671	push @{ $rslt[-1] }, '', $token;
	329					1388
305							} else {
306	0					0	push @{ $rslt[-1] }, '', undef;
	0					0
307	0					0	$self->_unget_token( $token );
308							}
309							} else {
310	0					0	return;
311							}
312							} else {
313	19					72	push @rslt, [ '', undef ];
314							}
315
316	348					999	while ( my $token = $self->_get_token() ) {
317	2296	100				8144	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
318	348					1263	$self->_unget_token( $token );
319	348					643	last;
320							}
321	1948	100				6101	if ( $token->isa( 'PPIx::Regexp::Token::Structure' ) ) {
322	555					1413	my $content = $token->content();
323
324	555	100	66			2528	if ( my $finish = $bracket{$content} ) {
		100	66
		100
		100
325							# Open bracket
326	276					801	push @rslt, [ $finish, $token ];
327
328							} elsif ( $content eq $rslt[-1][0] ) {
329
330							# Matched close bracket
331	269					1114	$self->_make_node( $token );
332
333							} elsif ( $content ne ')' ) {
334
335							# If the close bracket is not a parenthesis, it becomes
336							# a literal.
337	4					26	TOKEN_LITERAL->__PPIX_ELEM__rebless( $token );
338	4					9	push @{ $rslt[-1] }, $token;
	4					16
339
340							} elsif ( $content eq ')'
341							and @rslt > 1 # Ignore enclosing delimiter
342							and my $recover = $unclosed{$rslt[-1][1]->content()} ) {
343							# If the close bracket is a parenthesis and there is a
344							# recovery procedure, we use it.
345	1					7	$self->$recover( $token );
346
347							} else {
348
349							# Unmatched close with no recovery.
350	5					15	$self->{failures}++;
351	5					65	PPIx::Regexp::Token::Unmatched->
352							__PPIX_ELEM__rebless( $token );
353	5					8	push @{ $rslt[-1] }, $token;
	5					13
354							}
355
356							} else {
357	1393					2130	push @{ $rslt[-1] }, $token;
	1393					3346
358							}
359
360							# We have to hand-roll the Range object.
361	1948	100	100			5797	if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' )
			100
362							&& $rslt[-1][-2]->content() eq '-'
363							&& $rslt[-1][0] eq ']' # It's a character class
364							) {
365	13					32	my @tokens = splice @{ $rslt[-1] }, -3;
	13					57
366	13					36	push @{ $rslt[-1] },
	13					119
367							PPIx::Regexp::Node::Range->__new( @tokens );
368							}
369							}
370
371	348					1317	while ( @rslt > 1 ) {
372	6	100				34	if ( my $recover = $unclosed{$rslt[-1][1]->content()} ) {
373	5					29	$self->$recover();
374							} else {
375	1					4	$self->{failures}++;
376	1					4	$self->_make_node( undef );
377							}
378							}
379
380	348	50				998	if ( @rslt == 1 ) {
381	348					650	my @last = @{ pop @rslt };
	348					995
382	348					741	shift @last;
383	348					895	push @last, $self->_get_token();
384	348					2053	return $class->__new( @last );
385							} else {
386	0					0	confess "Missing data";
387							}
388
389							}
390
391							}
392
393							# $token = $self->_get_token();
394							#
395							# This method returns the next token from the tokenizer.
396
397							sub _get_token {
398	4319			4319		7839	my ( $self ) = @_;
399
400	4319	100				6032	if ( @{ $self->{deferred} } ) {
	4319					9749
401	697					1049	return shift @{ $self->{deferred} };
	697					2253
402							}
403
404	3622	100				10041	my $token = $self->{tokenizer}->next_token() or return;
405
406	3290					9891	return $token;
407							}
408
409							{
410
411							my %handler = (
412							'(' => '_round',
413							'[' => '_square',
414							'{' => '_curly',
415							'(?[' => '_regex_set',
416							);
417
418							sub _make_node {
419	270			270		688	my ( $self, $token ) = @_;
420	270					527	my @args = @{ pop @{ $self->{_rslt} } };
	270					453
	270					875
421	270					613	shift @args;
422	270					578	push @args, $token;
423	270					435	my @node;
424	270	50				792	if ( my $method = $handler{ $args[0]->content() } ) {
425	270					1319	@node = $self->$method( \@args );
426							}
427	270	50				906	@node or @node = PPIx::Regexp::Structure->__new( @args );
428	270					501	push @{ $self->{_rslt}[-1] }, @node;
	270					777
429	270					1084	return;
430							}
431
432							}
433
434							# Called as $self->$method( ... ) in _make_node(), above
435							sub _curly { ## no critic (ProhibitUnusedPrivateSubroutines)
436	35			35		117	my ( $self, $args ) = @_;
437
438	35	100	66			224	if ( $args->[-1] && $args->[-1]->is_quantifier() ) {
		50
439
440							# If the tokenizer has marked the right curly as a quantifier,
441							# make the whole thing a quantifier structure.
442	29					84	return PPIx::Regexp::Structure::Quantifier->__new( @{ $args } );
	29					211
443
444							} elsif ( $args->[-1] ) {
445
446							# If there is a right curly but it is not a quantifier,
447							# make both curlys into literals.
448	6					38	foreach my $inx ( 0, -1 ) {
449	12					83	TOKEN_LITERAL->__PPIX_ELEM__rebless( $args->[$inx] );
450							}
451
452							# Try to recover possible quantifiers not recognized because we
453							# thought this was a structure.
454	6					50	$self->_recover_curly_quantifiers( $args );
455
456	6					16	return @{ $args };
	6					24
457
458							} else {
459
460							# If there is no right curly, just make a generic structure
461							# TODO maybe this should be something else?
462	0					0	return PPIx::Regexp::Structure->__new( @{ $args } );
	0					0
463							}
464							}
465
466							# Recover from an unclosed left curly.
467							# Called as $self->$recover( ... ) in _get_delimited, above
468							sub _recover_curly { ## no critic (ProhibitUnusedPrivateSubroutines)
469	6			6		25	my ( $self, $token ) = @_;
470
471							# Get all the stuff we have accumulated for this curly.
472	6					14	my @content = @{ pop @{ $self->{_rslt} } };
	6					10
	6					25
473
474							# Lose the right bracket, which we have already failed to match.
475	6					17	shift @content;
476
477							# Rebless the left curly appropriately
478	6	100	66			64	if ( $self->{_rslt}[0][-1]->isa( 'PPIx::Regexp::Token::Assertion' )
479							&& q<\b> eq $self->{_rslt}[0][-1]->content() ) {
480							# If following \b, it becomes an unknown.
481	1					7	TOKEN_UNKNOWN->__PPIX_ELEM__rebless( $content[0],
482							error => 'Unterminated bound type',
483							);
484							} else {
485							# Rebless the left curly to a literal.
486	5					43	TOKEN_LITERAL->__PPIX_ELEM__rebless( $content[0] );
487							}
488
489							# Try to recover possible quantifiers not recognized because we
490							# thought this was a structure.
491	6					33	$self->_recover_curly_quantifiers( \@content );
492
493							# Shove the curly and its putative contents into whatever structure
494							# we have going.
495							# The checks are to try to trap things like RT 56864, though on
496							# further reflection it turned out that you could get here with an
497							# empty $self->{_rslt} on things like 'm{)}'. This one did not get
498							# made into an RT ticket, but was fixed by not calling the recovery
499							# code if $self->{_rslt} contained only the enclosing delimiters.
500							ARRAY_REF eq ref $self->{_rslt}
501							or confess 'Programming error - $self->{_rslt} not array ref, ',
502	6	50				35	"parsing '", $self->{tokenizer}->content(), "' at ",
503							$token->content();
504	6					24	@{ $self->{_rslt} }
505							or confess 'Programming error - $self->{_rslt} empty, ',
506	6	50				15	"parsing '", $self->{tokenizer}->content(), "' at ",
507							$token->content();
508	6					13	push @{ $self->{_rslt}[-1] }, @content;
	6					44
509
510							# Shove the mismatched delimiter back into the input so we can have
511							# another crack at it.
512	6	100				21	$token and $self->_unget_token( $token );
513
514							# We gone.
515	6					21	return;
516							}
517
518							sub _recover_curly_quantifiers {
519	12			12		36	my ( undef, $args ) = @_; # Invocant unused
520
521	12	100	100			49	if ( __instance( $args->[0], TOKEN_LITERAL )
			66
522							&& __instance( $args->[1], TOKEN_UNKNOWN )
523							&& PPIx::Regexp::Token::Quantifier->could_be_quantifier(
524							$args->[1]->content() )
525							) {
526	2					24	PPIx::Regexp::Token::Quantifier->
527							__PPIX_ELEM__rebless( $args->[1] );
528
529	2	50	33			8	if ( __instance( $args->[2], TOKEN_UNKNOWN )
530							&& PPIx::Regexp::Token::Greediness->could_be_greediness(
531							$args->[2]->content() )
532							) {
533	2					17	PPIx::Regexp::Token::Greediness
534							->__PPIX_ELEM__rebless( $args->[2] );
535							}
536
537							}
538
539	12					34	return;
540							}
541
542							sub _in_regex_set {
543	193			193		392	my ( $self ) = @_;
544	193					348	foreach my $stack_entry ( reverse @{ $self->{_rslt} } ) {
	193					485
545	302	100				790	$stack_entry->[0] eq '])'
546							and return 1;
547							}
548	189					538	return 0;
549							}
550
551							# Called as $self->$method( ... ) in _make_node(), above
552							sub _round { ## no critic (ProhibitUnusedPrivateSubroutines)
553	193			193		488	my ( $self, $args ) = @_;
554
555							# If we're inside a regex set, parens do not capture.
556							$self->_in_regex_set()
557	193	100				606	and return PPIx::Regexp::Structure->__new( @{ $args } );
	4					24
558
559							# If /n is asserted, parens do not capture.
560							$self->{tokenizer}->modifier( 'n' )
561	189	100				682	and return PPIx::Regexp::Structure->__new( @{ $args } );
	7					58
562
563							# The instantiator will rebless based on the first token if need be.
564	182					460	return PPIx::Regexp::Structure::Capture->__new( @{ $args } );
	182					1062
565							}
566
567							# Called as $self->$method( ... ) in _make_node(), above
568							sub _square { ## no critic (ProhibitUnusedPrivateSubroutines)
569	36			36		117	my ( undef, $args ) = @_; # Invocant unused
570	36					66	return PPIx::Regexp::Structure::CharClass->__new( @{ $args } );
	36					307
571							}
572
573							# Called as $self->$method( ... ) in _make_node(), above
574							sub _regex_set { ## no critic (ProhibitUnusedPrivateSubroutines)
575	6			6		46	my ( undef, $args ) = @_; # Invocant unused
576	6					19	return PPIx::Regexp::Structure::RegexSet->__new( @{ $args } );
	6					83
577							}
578
579							# $self->_unget_token( $token );
580							#
581							# This method caches its argument so that it will be returned by
582							# the next call to C<_get_token()>. If more than one argument is
583							# passed, they will be returned in the order given; that is,
584							# _unget_token/_get_token work like unshift/shift.
585
586							sub _unget_token {
587	697			697		1844	my ( $self, @args ) = @_;
588	697					1094	unshift @{ $self->{deferred} }, @args;
	697					1633
589	697					1289	return $self;
590							}
591
592							1;
593
594							__END__