File Coverage

blib/lib/PPIx/Regexp/Lexer.pm

Criterion	Covered	Total	%
statement	250	264	94.7
branch	64	80	80.0
condition	26	35	74.2
subroutine	44	45	97.7
pod	5	5	100.0
total	389	429	90.6

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Lexer - Assemble tokenizer output.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Lexer;
8							use PPIx::Regexp::Dumper;
9							my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx');
10							my $dmp = PPIx::Regexp::Dumper->new( $lex );
11							$dmp->print();
12
13							=head1 INHERITANCE
14
15							C<PPIx::Regexp::Lexer> is a
16							L<PPIx::Regexp::Support\|PPIx::Regexp::Support>.
17
18							C<PPIx::Regexp::Lexer> has no descendants.
19
20							=head1 DESCRIPTION
21
22							This class takes the token stream generated by
23							L<PPIx::Regexp::Tokenizer\|PPIx::Regexp::Tokenizer> and generates the
24							parse tree.
25
26							=head1 METHODS
27
28							This class provides the following public methods. Methods not documented
29							here are private, and unsupported in the sense that the author reserves
30							the right to change or remove them without notice.
31
32							=cut
33
34							package PPIx::Regexp::Lexer;
35
36	9			9		47	use strict;
	9					11
	9					272
37	9			9		30	use warnings;
	9					11
	9					388
38
39	9			9		31	use base qw{ PPIx::Regexp::Support };
	9					11
	9					3358
40
41	9			9		45	use Carp qw{ confess };
	9					16
	9					425
42	9					700	use PPIx::Regexp::Constant qw{
43							ARRAY_REF
44							TOKEN_LITERAL
45							TOKEN_UNKNOWN
46							@CARP_NOT
47	9			9		37	};
	9					11
48	9			9		3518	use PPIx::Regexp::Node::Range ();
	9					31
	9					171
49	9			9		2998	use PPIx::Regexp::Node::Unknown ();
	9					20
	9					197
50	9			9		3426	use PPIx::Regexp::Structure ();
	9					20
	9					186
51	9			9		3557	use PPIx::Regexp::Structure::Assertion ();
	9					24
	9					159
52	9			9		3548	use PPIx::Regexp::Structure::Atomic_Script_Run ();
	9					26
	9					162
53	9			9		3280	use PPIx::Regexp::Structure::BranchReset ();
	9					22
	9					150
54	9			9		3205	use PPIx::Regexp::Structure::Code ();
	9					21
	9					172
55	9			9		3176	use PPIx::Regexp::Structure::Capture ();
	9					20
	9					163
56	9			9		3312	use PPIx::Regexp::Structure::CharClass ();
	9					23
	9					182
57	9			9		3314	use PPIx::Regexp::Structure::Subexpression ();
	9					20
	9					151
58	9			9		3380	use PPIx::Regexp::Structure::Main ();
	9					25
	9					153
59	9			9		3514	use PPIx::Regexp::Structure::Modifier ();
	9					25
	9					160
60	9			9		3418	use PPIx::Regexp::Structure::NamedCapture ();
	9					24
	9					142
61	9			9		3412	use PPIx::Regexp::Structure::Quantifier ();
	9					42
	9					235
62	9			9		3280	use PPIx::Regexp::Structure::Regexp ();
	9					26
	9					160
63	9			9		3366	use PPIx::Regexp::Structure::RegexSet ();
	9					38
	9					147
64	9			9		3352	use PPIx::Regexp::Structure::Replacement ();
	9					20
	9					142
65	9			9		3255	use PPIx::Regexp::Structure::Script_Run ();
	9					24
	9					159
66	9			9		3496	use PPIx::Regexp::Structure::Switch ();
	9					22
	9					165
67	9			9		3617	use PPIx::Regexp::Structure::Unknown ();
	9					113
	9					163
68	9			9		3683	use PPIx::Regexp::Token::Unmatched ();
	9					21
	9					163
69	9			9		6574	use PPIx::Regexp::Tokenizer ();
	9					35
	9					288
70	9			9		55	use PPIx::Regexp::Util qw{ __choose_tokenizer_class __instance };
	9					13
	9					16849
71
72							our $VERSION = '0.091';
73
74							=head2 new
75
76							This method instantiates the lexer. It takes as its argument either a
77							L<PPIx::Regexp::Tokenizer\|PPIx::Regexp::Tokenizer> or the text to be
78							parsed. In the latter case the tokenizer is instantiated from the text.
79
80							Any optional name/value pairs after the first argument are passed to the
81							tokenizer, which interprets them or not as the case may be.
82
83							=cut
84
85							{
86
87							my $errstr;
88
89							sub new {
90	334			334	1	835	my ( $class, $tokenizer, %args ) = @_;
91	334	50				968	ref $class and $class = ref $class;
92
93	334	50				780	unless ( __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) ) {
94							my $tokenizer_class = __choose_tokenizer_class(
95							$tokenizer, \%args )
96	0	0				0	or do {
97	0					0	$errstr = 'Data not supported';
98	0					0	return;
99							};
100							$tokenizer = $tokenizer_class->new( $tokenizer, %args )
101	0	0				0	or do {
102	0					0	$errstr = $tokenizer_class->errstr();
103	0					0	return;
104							};
105							}
106
107							my $self = {
108							deferred => [], # Deferred tokens
109							failures => 0,
110							strict => $args{strict},
111	334					1514	tokenizer => $tokenizer,
112							};
113
114	334					645	bless $self, $class;
115	334					695	return $self;
116							}
117
118							sub errstr {
119	0			0	1	0	return $errstr;
120							}
121
122							}
123
124							=head2 errstr
125
126							This method returns the error string from the last attempt to
127							instantiate a C<PPIx::Regexp::Lexer>. If the last attempt succeeded, the
128							error will be C<undef>.
129
130							=cut
131
132							# Defined above
133
134							=head2 failures
135
136							print $lexer->failures(), " parse failures\n";
137
138							This method returns the number of parse failures encountered. A
139							parse failure is either a tokenization failure (see
140							L<< PPIx::Regexp::Tokenizer->failures()\|PPIx::Regexp::Tokenizer/failures >>)
141							or a structural error.
142
143							=cut
144
145							sub failures {
146	334			334	1	623	my ( $self ) = @_;
147	334					970	return $self->{failures};
148							}
149
150							=head2 lex
151
152							This method lexes the tokens in the text, and returns the lexed list of
153							elements.
154
155							=cut
156
157							sub lex {
158	334			334	1	613	my ( $self ) = @_;
159
160	334					436	my @content;
161	334					686	$self->{failures} = 0;
162
163							# Accept everything up to the first delimiter.
164	334					494	my $kind; # Initial PPIx::Regexp::Token::Structure
165							{
166	334	100				537	my $token = $self->_get_token()
	672					1626
167							or return $self->_finalize( @content );
168	664	100				1943	$token->isa( 'PPIx::Regexp::Token::Delimiter' ) or do {
169	338	100	100			1556	not $kind
170							and $token->isa( 'PPIx::Regexp::Token::Structure' )
171							and $kind = $token;
172	338					662	push @content, $token;
173	338					441	redo;
174							};
175	326					1006	$self->_unget_token( $token );
176							}
177
178							my ( $part_0_class, $part_1_class ) =
179	326					1025	$self->{tokenizer}->__part_classes();
180
181							# Accept the first delimited structure.
182	326					1058	push @content, ( my $part_0 = $self->_get_delimited(
183							$part_0_class ) );
184
185							# If we are a substitution ...
186	326	100				715	if ( defined $part_1_class ) {
187
188							# Accept any insignificant stuff.
189	26					83	while ( my $token = $self->_get_token() ) {
190	30	100				67	if ( $token->significant() ) {
191	26					91	$self->_unget_token( $token );
192	26					42	last;
193							} else {
194	4					8	push @content, $token;
195							}
196							}
197
198							# Figure out if we should expect an opening bracket.
199	26		100			106	my $expect_open_bracket = $self->close_bracket(
200							$part_0->start( 0 ) ) \|\| 0;
201
202							# Accept the next delimited structure.
203	26					71	push @content, $self->_get_delimited(
204							$part_1_class,
205							$expect_open_bracket,
206							);
207							}
208
209							# Accept the modifiers (we hope!) plus any trailing white space.
210	326					784	while ( my $token = $self->_get_token() ) {
211	328					569	push @content, $token;
212							}
213
214							# Let all the elements finalize themselves, recording any additional
215							# errors as they do so.
216	326					1104	$self->_finalize( @content );
217
218							# If we found a regular expression (and we should have done so) ...
219	326	50	33			1726	if ( $part_0 && $part_0->can( 'max_capture_number' ) ) {
220							# TODO the above line is really ugly. I'm wondering about
221							# string implementations like:
222							# * return a $part_0_class of undef (but that complicates the
223							# lexing of the structure itself);
224							# * hang this logic on the tokenizer somehow (where it seems out
225							# of place)
226							# * hang this logic on PPIx::Regexp::Structure::Regexp and
227							# ::Replacement.
228							# I also need to figure out how to make \n backreferences come
229							# out as literals. Maybe that is a job best done by the
230							# tokenizer.
231
232							# Retrieve the maximum capture group.
233	326					804	my $max_capture = $part_0->max_capture_number();
234
235							# Hashify the known capture names
236							my $capture_name = {
237	326					890	map { $_ => 1 } $part_0->capture_names(),
	20					67
238							};
239
240							# For all the backreferences found
241	326	100				481	foreach my $elem ( @{ $part_0->find(
	326					645
242							'PPIx::Regexp::Token::Backreference' ) \|\| [] } ) {
243							# Rebless them as needed, recording any errors found.
244							$self->{failures} +=
245	25					97	$elem->__PPIX_LEXER__rebless(
246							capture_name => $capture_name,
247							max_capture => $max_capture,
248							);
249							}
250							}
251
252	326					1245	return @content;
253
254							}
255
256							=head2 strict
257
258							This method returns true or false based on the value of the C<'strict'>
259							argument to C<new()>.
260
261							=cut
262
263							sub strict {
264	13			13	1	24	my ( $self ) = @_;
265	13					38	return $self->{strict};
266							}
267
268							# Finalize the content array, updating the parse failures count as we
269							# go.
270							sub _finalize {
271	334			334		711	my ( $self, @content ) = @_;
272	334					561	foreach my $elem ( @content ) {
273	1022					3133	$self->{failures} += $elem->__PPIX_LEXER__finalize( $self );
274							}
275	334	100				706	defined wantarray and return @content;
276	326					595	return;
277							}
278
279							{
280
281							my %bracket = (
282							'{' => '}',
283							'(' => ')',
284							'[' => ']',
285							'(?[' => '])',
286							## '<' => '>',
287							);
288
289							my %unclosed = (
290							'{' => '_recover_curly',
291							);
292
293							sub _get_delimited {
294	352			352		680	my ( $self, $class, $expect_open_bracket ) = @_;
295	352	100				674	defined $expect_open_bracket or $expect_open_bracket = 1;
296
297	352					483	my @rslt;
298	352					759	$self->{_rslt} = \@rslt;
299
300	352	100				647	if ( $expect_open_bracket ) {
301	331	50				526	if ( my $token = $self->_get_token() ) {
302	331					630	push @rslt, [];
303	331	50				849	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
304	331					412	push @{ $rslt[-1] }, '', $token;
	331					869
305							} else {
306	0					0	push @{ $rslt[-1] }, '', undef;
	0					0
307	0					0	$self->_unget_token( $token );
308							}
309							} else {
310	0					0	return;
311							}
312							} else {
313	21					52	push @rslt, [ '', undef ];
314							}
315
316	352					678	while ( my $token = $self->_get_token() ) {
317	2308	100				6192	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
318	352					946	$self->_unget_token( $token );
319	352					524	last;
320							}
321	1956	100				4513	if ( $token->isa( 'PPIx::Regexp::Token::Structure' ) ) {
322	555					1109	my $content = $token->content();
323
324	555	100	66			1686	if ( my $finish = $bracket{$content} ) {
		100	66
		100
		100
325							# Open bracket
326	276					629	push @rslt, [ $finish, $token ];
327
328							} elsif ( $content eq $rslt[-1][0] ) {
329
330							# Matched close bracket
331	269					716	$self->_make_node( $token );
332
333							} elsif ( $content ne ')' ) {
334
335							# If the close bracket is not a parenthesis, it becomes
336							# a literal.
337	4					19	TOKEN_LITERAL->__PPIX_ELEM__rebless( $token );
338	4					5	push @{ $rslt[-1] }, $token;
	4					11
339
340							} elsif ( $content eq ')'
341							and @rslt > 1 # Ignore enclosing delimiter
342							and my $recover = $unclosed{$rslt[-1][1]->content()} ) {
343							# If the close bracket is a parenthesis and there is a
344							# recovery procedure, we use it.
345	1					6	$self->$recover( $token );
346
347							} else {
348
349							# Unmatched close with no recovery.
350	5					13	$self->{failures}++;
351	5					54	PPIx::Regexp::Token::Unmatched->
352							__PPIX_ELEM__rebless( $token );
353	5					8	push @{ $rslt[-1] }, $token;
	5					27
354							}
355
356							} else {
357	1401					1665	push @{ $rslt[-1] }, $token;
	1401					2647
358							}
359
360							# We have to hand-roll the Range object.
361	1956	100	100			4818	if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' )
			100
362							&& $rslt[-1][-2]->content() eq '-'
363							&& $rslt[-1][0] eq ']' # It's a character class
364							) {
365	13					22	my @tokens = splice @{ $rslt[-1] }, -3;
	13					34
366	13					22	push @{ $rslt[-1] },
	13					94
367							PPIx::Regexp::Node::Range->__new( @tokens );
368							}
369							}
370
371	352					953	while ( @rslt > 1 ) {
372	6	100				19	if ( my $recover = $unclosed{$rslt[-1][1]->content()} ) {
373	5					42	$self->$recover();
374							} else {
375	1					2	$self->{failures}++;
376	1					5	$self->_make_node( undef );
377							}
378							}
379
380	352	50				728	if ( @rslt == 1 ) {
381	352					413	my @last = @{ pop @rslt };
	352					884
382	352					635	shift @last;
383	352					704	push @last, $self->_get_token();
384	352					1462	return $class->__new( @last );
385							} else {
386	0					0	confess "Missing data";
387							}
388
389							}
390
391							}
392
393							# $token = $self->_get_token();
394							#
395							# This method returns the next token from the tokenizer.
396
397							sub _get_token {
398	4347			4347		5719	my ( $self ) = @_;
399
400	4347	100				4212	if ( @{ $self->{deferred} } ) {
	4347					7088
401	705					859	return shift @{ $self->{deferred} };
	705					1517
402							}
403
404	3642	100				7216	my $token = $self->{tokenizer}->next_token() or return;
405
406	3308					6892	return $token;
407							}
408
409							{
410
411							my %handler = (
412							'(' => '_round',
413							'[' => '_square',
414							'{' => '_curly',
415							'(?[' => '_regex_set',
416							);
417
418							sub _make_node {
419	270			270		524	my ( $self, $token ) = @_;
420	270					302	my @args = @{ pop @{ $self->{_rslt} } };
	270					335
	270					741
421	270					485	shift @args;
422	270					389	push @args, $token;
423	270					354	my @node;
424	270	50				569	if ( my $method = $handler{ $args[0]->content() } ) {
425	270					962	@node = $self->$method( \@args );
426							}
427	270	50				613	@node or @node = PPIx::Regexp::Structure->__new( @args );
428	270					331	push @{ $self->{_rslt}[-1] }, @node;
	270					584
429	270					600	return;
430							}
431
432							}
433
434							# Called as $self->$method( ... ) in _make_node(), above
435							sub _curly { ## no critic (ProhibitUnusedPrivateSubroutines)
436	35			35		68	my ( $self, $args ) = @_;
437
438	35	100	66			160	if ( $args->[-1] && $args->[-1]->is_quantifier() ) {
		50
439
440							# If the tokenizer has marked the right curly as a quantifier,
441							# make the whole thing a quantifier structure.
442	29					59	return PPIx::Regexp::Structure::Quantifier->__new( @{ $args } );
	29					178
443
444							} elsif ( $args->[-1] ) {
445
446							# If there is a right curly but it is not a quantifier,
447							# make both curlys into literals.
448	6					14	foreach my $inx ( 0, -1 ) {
449	12					46	TOKEN_LITERAL->__PPIX_ELEM__rebless( $args->[$inx] );
450							}
451
452							# Try to recover possible quantifiers not recognized because we
453							# thought this was a structure.
454	6					24	$self->_recover_curly_quantifiers( $args );
455
456	6					11	return @{ $args };
	6					16
457
458							} else {
459
460							# If there is no right curly, just make a generic structure
461							# TODO maybe this should be something else?
462	0					0	return PPIx::Regexp::Structure->__new( @{ $args } );
	0					0
463							}
464							}
465
466							# Recover from an unclosed left curly.
467							# Called as $self->$revover( ... ) in _get_delimited, above
468							sub _recover_curly { ## no critic (ProhibitUnusedPrivateSubroutines)
469	6			6		19	my ( $self, $token ) = @_;
470
471							# Get all the stuff we have accumulated for this curly.
472	6					10	my @content = @{ pop @{ $self->{_rslt} } };
	6					9
	6					20
473
474							# Lose the right bracket, which we have already failed to match.
475	6					12	shift @content;
476
477							# Rebless the left curly appropriately
478	6	100	66			38	if ( $self->{_rslt}[0][-1]->isa( 'PPIx::Regexp::Token::Assertion' )
479							&& q<\b> eq $self->{_rslt}[0][-1]->content() ) {
480							# If following \b, it becomes an unknown.
481	1					7	TOKEN_UNKNOWN->__PPIX_ELEM__rebless( $content[0],
482							error => 'Unterminated bound type',
483							);
484							} else {
485							# Rebless the left curly to a literal.
486	5					28	TOKEN_LITERAL->__PPIX_ELEM__rebless( $content[0] );
487							}
488
489							# Try to recover possible quantifiers not recognized because we
490							# thought this was a structure.
491	6					22	$self->_recover_curly_quantifiers( \@content );
492
493							# Shove the curly and its putative contents into whatever structure
494							# we have going.
495							# The checks are to try to trap things like RT 56864, though on
496							# further reflection it turned out that you could get here with an
497							# empty $self->{_rslt} on things like 'm{)}'. This one did not get
498							# made into an RT ticket, but was fixed by not calling the recovery
499							# code if $self->{_rslt} contained only the enclosing delimiters.
500							ARRAY_REF eq ref $self->{_rslt}
501							or confess 'Programming error - $self->{_rslt} not array ref, ',
502	6	50				18	"parsing '", $self->{tokenizer}->content(), "' at ",
503							$token->content();
504	6					13	@{ $self->{_rslt} }
505							or confess 'Programming error - $self->{_rslt} empty, ',
506	6	50				10	"parsing '", $self->{tokenizer}->content(), "' at ",
507							$token->content();
508	6					9	push @{ $self->{_rslt}[-1] }, @content;
	6					16
509
510							# Shove the mismatched delimiter back into the input so we can have
511							# another crack at it.
512	6	100				15	$token and $self->_unget_token( $token );
513
514							# We gone.
515	6					19	return;
516							}
517
518							sub _recover_curly_quantifiers {
519	12			12		22	my ( undef, $args ) = @_; # Invocant unused
520
521	12	100	100			52	if ( __instance( $args->[0], TOKEN_LITERAL )
			66
522							&& __instance( $args->[1], TOKEN_UNKNOWN )
523							&& PPIx::Regexp::Token::Quantifier->could_be_quantifier(
524							$args->[1]->content() )
525							) {
526	2					14	PPIx::Regexp::Token::Quantifier->
527							__PPIX_ELEM__rebless( $args->[1] );
528
529	2	50	33			5	if ( __instance( $args->[2], TOKEN_UNKNOWN )
530							&& PPIx::Regexp::Token::Greediness->could_be_greediness(
531							$args->[2]->content() )
532							) {
533	2					11	PPIx::Regexp::Token::Greediness
534							->__PPIX_ELEM__rebless( $args->[2] );
535							}
536
537							}
538
539	12					36	return;
540							}
541
542							sub _in_regex_set {
543	193			193		297	my ( $self ) = @_;
544	193					229	foreach my $stack_entry ( reverse @{ $self->{_rslt} } ) {
	193					406
545	302	100				618	$stack_entry->[0] eq '])'
546							and return 1;
547							}
548	189					437	return 0;
549							}
550
551							# Called as $self->$method( ... ) in _make_node(), above
552							sub _round { ## no critic (ProhibitUnusedPrivateSubroutines)
553	193			193		332	my ( $self, $args ) = @_;
554
555							# If we're inside a regex set, parens do not capture.
556							$self->_in_regex_set()
557	193	100				510	and return PPIx::Regexp::Structure->__new( @{ $args } );
	4					19
558
559							# If /n is asserted, parens do not capture.
560							$self->{tokenizer}->modifier( 'n' )
561	189	100				457	and return PPIx::Regexp::Structure->__new( @{ $args } );
	7					32
562
563							# The instantiator will rebless based on the first token if need be.
564	182					251	return PPIx::Regexp::Structure::Capture->__new( @{ $args } );
	182					892
565							}
566
567							# Called as $self->$method( ... ) in _make_node(), above
568							sub _square { ## no critic (ProhibitUnusedPrivateSubroutines)
569	36			36		68	my ( undef, $args ) = @_; # Invocant unused
570	36					51	return PPIx::Regexp::Structure::CharClass->__new( @{ $args } );
	36					239
571							}
572
573							# Called as $self->$method( ... ) in _make_node(), above
574							sub _regex_set { ## no critic (ProhibitUnusedPrivateSubroutines)
575	6			6		14	my ( undef, $args ) = @_; # Invocant unused
576	6					11	return PPIx::Regexp::Structure::RegexSet->__new( @{ $args } );
	6					53
577							}
578
579							# $self->_unget_token( $token );
580							#
581							# This method caches its argument so that it will be returned by
582							# the next call to C<_get_token()>. If more than one argument is
583							# passed, they will be returned in the order given; that is,
584							# _unget_token/_get_token work like unshift/shift.
585
586							sub _unget_token {
587	705			705		1179	my ( $self, @args ) = @_;
588	705					812	unshift @{ $self->{deferred} }, @args;
	705					1437
589	705					1196	return $self;
590							}
591
592							1;
593
594							__END__
595
596							=head1 SUPPORT
597
598							Support is by the author. Please file bug reports at
599							L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
600							L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
601							electronic mail to the author.
602
603							=head1 AUTHOR
604
605							Thomas R. Wyant, III F<wyant at cpan dot org>
606
607							=head1 COPYRIGHT AND LICENSE
608
609							Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
610
611							This program is free software; you can redistribute it and/or modify it
612							under the same terms as Perl 5.10.0. For more details, see the full text
613							of the licenses in the directory LICENSES.
614
615							This program is distributed in the hope that it will be useful, but
616							without any warranty; without even the implied warranty of
617							merchantability or fitness for a particular purpose.
618
619							=cut
620
621							# ex: set textwidth=72 :