File Coverage

blib/lib/PPIx/Regexp/Lexer.pm

Criterion	Covered	Total	%
statement	250	264	94.7
branch	64	80	80.0
condition	26	35	74.2
subroutine	44	45	97.7
pod	5	5	100.0
total	389	429	90.6

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Lexer - Assemble tokenizer output.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Lexer;
8							use PPIx::Regexp::Dumper;
9							my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx');
10							my $dmp = PPIx::Regexp::Dumper->new( $lex );
11							$dmp->print();
12
13							=head1 INHERITANCE
14
15							C is a
16							L.
17
18							C has no descendants.
19
20							=head1 DESCRIPTION
21
22							This class takes the token stream generated by
23							L and generates the
24							parse tree.
25
26							=head1 METHODS
27
28							This class provides the following public methods. Methods not documented
29							here are private, and unsupported in the sense that the author reserves
30							the right to change or remove them without notice.
31
32							=cut
33
34							package PPIx::Regexp::Lexer;
35
36	9			9		59	use strict;
	9					13
	9					250
37	9			9		29	use warnings;
	9					11
	9					331
38
39	9			9		29	use base qw{ PPIx::Regexp::Support };
	9					11
	9					3467
40
41	9			9		84	use Carp qw{ confess };
	9					15
	9					485
42	9					671	use PPIx::Regexp::Constant qw{
43							ARRAY_REF
44							TOKEN_LITERAL
45							TOKEN_UNKNOWN
46							@CARP_NOT
47	9			9		41	};
	9					31
48	9			9		3649	use PPIx::Regexp::Node::Range ();
	9					19
	9					173
49	9			9		3247	use PPIx::Regexp::Node::Unknown ();
	9					20
	9					145
50	9			9		3366	use PPIx::Regexp::Structure ();
	9					19
	9					190
51	9			9		3550	use PPIx::Regexp::Structure::Assertion ();
	9					24
	9					199
52	9			9		3525	use PPIx::Regexp::Structure::Atomic_Script_Run ();
	9					22
	9					160
53	9			9		3483	use PPIx::Regexp::Structure::BranchReset ();
	9					22
	9					149
54	9			9		3393	use PPIx::Regexp::Structure::Code ();
	9					19
	9					174
55	9			9		3334	use PPIx::Regexp::Structure::Capture ();
	9					40
	9					153
56	9			9		3332	use PPIx::Regexp::Structure::CharClass ();
	9					24
	9					156
57	9			9		3379	use PPIx::Regexp::Structure::Subexpression ();
	9					24
	9					150
58	9			9		3430	use PPIx::Regexp::Structure::Main ();
	9					42
	9					182
59	9			9		3216	use PPIx::Regexp::Structure::Modifier ();
	9					21
	9					146
60	9			9		3363	use PPIx::Regexp::Structure::NamedCapture ();
	9					25
	9					158
61	9			9		5034	use PPIx::Regexp::Structure::Quantifier ();
	9					21
	9					165
62	9			9		3449	use PPIx::Regexp::Structure::Regexp ();
	9					24
	9					202
63	9			9		3333	use PPIx::Regexp::Structure::RegexSet ();
	9					25
	9					139
64	9			9		3409	use PPIx::Regexp::Structure::Replacement ();
	9					38
	9					165
65	9			9		3397	use PPIx::Regexp::Structure::Script_Run ();
	9					22
	9					158
66	9			9		3756	use PPIx::Regexp::Structure::Switch ();
	9					27
	9					163
67	9			9		3418	use PPIx::Regexp::Structure::Unknown ();
	9					186
	9					278
68	9			9		3616	use PPIx::Regexp::Token::Unmatched ();
	9					22
	9					142
69	9			9		6251	use PPIx::Regexp::Tokenizer ();
	9					33
	9					268
70	9			9		49	use PPIx::Regexp::Util qw{ __choose_tokenizer_class __instance };
	9					13
	9					15268
71
72							our $VERSION = '0.091_01';
73
74							=head2 new
75
76							This method instantiates the lexer. It takes as its argument either a
77							L or the text to be
78							parsed. In the latter case the tokenizer is instantiated from the text.
79
80							Any optional name/value pairs after the first argument are passed to the
81							tokenizer, which interprets them or not as the case may be.
82
83							=cut
84
85							{
86
87							my $errstr;
88
89							sub new {
90	334			334	1	877	my ( $class, $tokenizer, %args ) = @_;
91	334	50				894	ref $class and $class = ref $class;
92
93	334	50				769	unless ( __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) ) {
94							my $tokenizer_class = __choose_tokenizer_class(
95							$tokenizer, \%args )
96	0	0				0	or do {
97	0					0	$errstr = 'Data not supported';
98	0					0	return;
99							};
100							$tokenizer = $tokenizer_class->new( $tokenizer, %args )
101	0	0				0	or do {
102	0					0	$errstr = $tokenizer_class->errstr();
103	0					0	return;
104							};
105							}
106
107							my $self = {
108							deferred => [], # Deferred tokens
109							failures => 0,
110							strict => $args{strict},
111	334					1609	tokenizer => $tokenizer,
112							};
113
114	334					626	bless $self, $class;
115	334					724	return $self;
116							}
117
118							sub errstr {
119	0			0	1	0	return $errstr;
120							}
121
122							}
123
124							=head2 errstr
125
126							This method returns the error string from the last attempt to
127							instantiate a C. If the last attempt succeeded, the
128							error will be C.
129
130							=cut
131
132							# Defined above
133
134							=head2 failures
135
136							print $lexer->failures(), " parse failures\n";
137
138							This method returns the number of parse failures encountered. A
139							parse failure is either a tokenization failure (see
140							L<< PPIx::Regexp::Tokenizer->failures()\|PPIx::Regexp::Tokenizer/failures >>)
141							or a structural error.
142
143							=cut
144
145							sub failures {
146	334			334	1	564	my ( $self ) = @_;
147	334					781	return $self->{failures};
148							}
149
150							=head2 lex
151
152							This method lexes the tokens in the text, and returns the lexed list of
153							elements.
154
155							=cut
156
157							sub lex {
158	334			334	1	703	my ( $self ) = @_;
159
160	334					449	my @content;
161	334					725	$self->{failures} = 0;
162
163							# Accept everything up to the first delimiter.
164	334					499	my $kind; # Initial PPIx::Regexp::Token::Structure
165							{
166	334	100				454	my $token = $self->_get_token()
	672					1698
167							or return $self->_finalize( @content );
168	664	100				2189	$token->isa( 'PPIx::Regexp::Token::Delimiter' ) or do {
169	338	100	100			1599	not $kind
170							and $token->isa( 'PPIx::Regexp::Token::Structure' )
171							and $kind = $token;
172	338					509	push @content, $token;
173	338					511	redo;
174							};
175	326					918	$self->_unget_token( $token );
176							}
177
178							my ( $part_0_class, $part_1_class ) =
179	326					1138	$self->{tokenizer}->__part_classes();
180
181							# Accept the first delimited structure.
182	326					901	push @content, ( my $part_0 = $self->_get_delimited(
183							$part_0_class ) );
184
185							# If we are a substitution ...
186	326	100				671	if ( defined $part_1_class ) {
187
188							# Accept any insignificant stuff.
189	26					69	while ( my $token = $self->_get_token() ) {
190	30	100				61	if ( $token->significant() ) {
191	26					67	$self->_unget_token( $token );
192	26					39	last;
193							} else {
194	4					9	push @content, $token;
195							}
196							}
197
198							# Figure out if we should expect an opening bracket.
199	26		100			125	my $expect_open_bracket = $self->close_bracket(
200							$part_0->start( 0 ) ) \|\| 0;
201
202							# Accept the next delimited structure.
203	26					65	push @content, $self->_get_delimited(
204							$part_1_class,
205							$expect_open_bracket,
206							);
207							}
208
209							# Accept the modifiers (we hope!) plus any trailing white space.
210	326					737	while ( my $token = $self->_get_token() ) {
211	328					610	push @content, $token;
212							}
213
214							# Let all the elements finalize themselves, recording any additional
215							# errors as they do so.
216	326					1074	$self->_finalize( @content );
217
218							# If we found a regular expression (and we should have done so) ...
219	326	50	33			1558	if ( $part_0 && $part_0->can( 'max_capture_number' ) ) {
220							# TODO the above line is really ugly. I'm wondering about
221							# string implementations like:
222							# * return a $part_0_class of undef (but that complicates the
223							# lexing of the structure itself);
224							# * hang this logic on the tokenizer somehow (where it seems out
225							# of place)
226							# * hang this logic on PPIx::Regexp::Structure::Regexp and
227							# ::Replacement.
228							# I also need to figure out how to make \n backreferences come
229							# out as literals. Maybe that is a job best done by the
230							# tokenizer.
231
232							# Retrieve the maximum capture group.
233	326					766	my $max_capture = $part_0->max_capture_number();
234
235							# Hashify the known capture names
236							my $capture_name = {
237	326					833	map { $_ => 1 } $part_0->capture_names(),
	20					62
238							};
239
240							# For all the backreferences found
241	326	100				535	foreach my $elem ( @{ $part_0->find(
	326					584
242							'PPIx::Regexp::Token::Backreference' ) \|\| [] } ) {
243							# Rebless them as needed, recording any errors found.
244							$self->{failures} +=
245	25					114	$elem->__PPIX_LEXER__rebless(
246							capture_name => $capture_name,
247							max_capture => $max_capture,
248							);
249							}
250							}
251
252	326					1223	return @content;
253
254							}
255
256							=head2 strict
257
258							This method returns true or false based on the value of the C<'strict'>
259							argument to C.
260
261							=cut
262
263							sub strict {
264	13			13	1	24	my ( $self ) = @_;
265	13					54	return $self->{strict};
266							}
267
268							# Finalize the content array, updating the parse failures count as we
269							# go.
270							sub _finalize {
271	334			334		625	my ( $self, @content ) = @_;
272	334					573	foreach my $elem ( @content ) {
273	1022					3182	$self->{failures} += $elem->__PPIX_LEXER__finalize( $self );
274							}
275	334	100				632	defined wantarray and return @content;
276	326					536	return;
277							}
278
279							{
280
281							my %bracket = (
282							'{' => '}',
283							'(' => ')',
284							'[' => ']',
285							'(?[' => '])',
286							## '<' => '>',
287							);
288
289							my %unclosed = (
290							'{' => '_recover_curly',
291							);
292
293							sub _get_delimited {
294	352			352		714	my ( $self, $class, $expect_open_bracket ) = @_;
295	352	100				717	defined $expect_open_bracket or $expect_open_bracket = 1;
296
297	352					485	my @rslt;
298	352					789	$self->{_rslt} = \@rslt;
299
300	352	100				698	if ( $expect_open_bracket ) {
301	331	50				539	if ( my $token = $self->_get_token() ) {
302	331					780	push @rslt, [];
303	331	50				936	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
304	331					397	push @{ $rslt[-1] }, '', $token;
	331					877
305							} else {
306	0					0	push @{ $rslt[-1] }, '', undef;
	0					0
307	0					0	$self->_unget_token( $token );
308							}
309							} else {
310	0					0	return;
311							}
312							} else {
313	21					71	push @rslt, [ '', undef ];
314							}
315
316	352					695	while ( my $token = $self->_get_token() ) {
317	2308	100				6299	if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
318	352					843	$self->_unget_token( $token );
319	352					573	last;
320							}
321	1956	100				4568	if ( $token->isa( 'PPIx::Regexp::Token::Structure' ) ) {
322	555					1199	my $content = $token->content();
323
324	555	100	66			1876	if ( my $finish = $bracket{$content} ) {
		100	66
		100
		100
325							# Open bracket
326	276					601	push @rslt, [ $finish, $token ];
327
328							} elsif ( $content eq $rslt[-1][0] ) {
329
330							# Matched close bracket
331	269					773	$self->_make_node( $token );
332
333							} elsif ( $content ne ')' ) {
334
335							# If the close bracket is not a parenthesis, it becomes
336							# a literal.
337	4					15	TOKEN_LITERAL->__PPIX_ELEM__rebless( $token );
338	4					6	push @{ $rslt[-1] }, $token;
	4					8
339
340							} elsif ( $content eq ')'
341							and @rslt > 1 # Ignore enclosing delimiter
342							and my $recover = $unclosed{$rslt[-1][1]->content()} ) {
343							# If the close bracket is a parenthesis and there is a
344							# recovery procedure, we use it.
345	1					4	$self->$recover( $token );
346
347							} else {
348
349							# Unmatched close with no recovery.
350	5					12	$self->{failures}++;
351	5					53	PPIx::Regexp::Token::Unmatched->
352							__PPIX_ELEM__rebless( $token );
353	5					8	push @{ $rslt[-1] }, $token;
	5					11
354							}
355
356							} else {
357	1401					1708	push @{ $rslt[-1] }, $token;
	1401					2668
358							}
359
360							# We have to hand-roll the Range object.
361	1956	100	100			4728	if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' )
			100
362							&& $rslt[-1][-2]->content() eq '-'
363							&& $rslt[-1][0] eq ']' # It's a character class
364							) {
365	13					20	my @tokens = splice @{ $rslt[-1] }, -3;
	13					38
366	13					21	push @{ $rslt[-1] },
	13					80
367							PPIx::Regexp::Node::Range->__new( @tokens );
368							}
369							}
370
371	352					869	while ( @rslt > 1 ) {
372	6	100				23	if ( my $recover = $unclosed{$rslt[-1][1]->content()} ) {
373	5					25	$self->$recover();
374							} else {
375	1					2	$self->{failures}++;
376	1					4	$self->_make_node( undef );
377							}
378							}
379
380	352	50				784	if ( @rslt == 1 ) {
381	352					429	my @last = @{ pop @rslt };
	352					824
382	352					604	shift @last;
383	352					778	push @last, $self->_get_token();
384	352					1444	return $class->__new( @last );
385							} else {
386	0					0	confess "Missing data";
387							}
388
389							}
390
391							}
392
393							# $token = $self->_get_token();
394							#
395							# This method returns the next token from the tokenizer.
396
397							sub _get_token {
398	4347			4347		5524	my ( $self ) = @_;
399
400	4347	100				4267	if ( @{ $self->{deferred} } ) {
	4347					7157
401	705					796	return shift @{ $self->{deferred} };
	705					1490
402							}
403
404	3642	100				8256	my $token = $self->{tokenizer}->next_token() or return;
405
406	3308					7042	return $token;
407							}
408
409							{
410
411							my %handler = (
412							'(' => '_round',
413							'[' => '_square',
414							'{' => '_curly',
415							'(?[' => '_regex_set',
416							);
417
418							sub _make_node {
419	270			270		478	my ( $self, $token ) = @_;
420	270					325	my @args = @{ pop @{ $self->{_rslt} } };
	270					325
	270					675
421	270					515	shift @args;
422	270					383	push @args, $token;
423	270					372	my @node;
424	270	50				549	if ( my $method = $handler{ $args[0]->content() } ) {
425	270					914	@node = $self->$method( \@args );
426							}
427	270	50				571	@node or @node = PPIx::Regexp::Structure->__new( @args );
428	270					331	push @{ $self->{_rslt}[-1] }, @node;
	270					587
429	270					547	return;
430							}
431
432							}
433
434							# Called as $self->$method( ... ) in _make_node(), above
435							sub _curly { ## no critic (ProhibitUnusedPrivateSubroutines)
436	35			35		405	my ( $self, $args ) = @_;
437
438	35	100	66			172	if ( $args->[-1] && $args->[-1]->is_quantifier() ) {
		50
439
440							# If the tokenizer has marked the right curly as a quantifier,
441							# make the whole thing a quantifier structure.
442	29					46	return PPIx::Regexp::Structure::Quantifier->__new( @{ $args } );
	29					195
443
444							} elsif ( $args->[-1] ) {
445
446							# If there is a right curly but it is not a quantifier,
447							# make both curlys into literals.
448	6					13	foreach my $inx ( 0, -1 ) {
449	12					38	TOKEN_LITERAL->__PPIX_ELEM__rebless( $args->[$inx] );
450							}
451
452							# Try to recover possible quantifiers not recognized because we
453							# thought this was a structure.
454	6					21	$self->_recover_curly_quantifiers( $args );
455
456	6					6	return @{ $args };
	6					16
457
458							} else {
459
460							# If there is no right curly, just make a generic structure
461							# TODO maybe this should be something else?
462	0					0	return PPIx::Regexp::Structure->__new( @{ $args } );
	0					0
463							}
464							}
465
466							# Recover from an unclosed left curly.
467							# Called as $self->$revover( ... ) in _get_delimited, above
468							sub _recover_curly { ## no critic (ProhibitUnusedPrivateSubroutines)
469	6			6		15	my ( $self, $token ) = @_;
470
471							# Get all the stuff we have accumulated for this curly.
472	6					13	my @content = @{ pop @{ $self->{_rslt} } };
	6					7
	6					22
473
474							# Lose the right bracket, which we have already failed to match.
475	6					14	shift @content;
476
477							# Rebless the left curly appropriately
478	6	100	66			38	if ( $self->{_rslt}[0][-1]->isa( 'PPIx::Regexp::Token::Assertion' )
479							&& q<\b> eq $self->{_rslt}[0][-1]->content() ) {
480							# If following \b, it becomes an unknown.
481	1					5	TOKEN_UNKNOWN->__PPIX_ELEM__rebless( $content[0],
482							error => 'Unterminated bound type',
483							);
484							} else {
485							# Rebless the left curly to a literal.
486	5					25	TOKEN_LITERAL->__PPIX_ELEM__rebless( $content[0] );
487							}
488
489							# Try to recover possible quantifiers not recognized because we
490							# thought this was a structure.
491	6					23	$self->_recover_curly_quantifiers( \@content );
492
493							# Shove the curly and its putative contents into whatever structure
494							# we have going.
495							# The checks are to try to trap things like RT 56864, though on
496							# further reflection it turned out that you could get here with an
497							# empty $self->{_rslt} on things like 'm{)}'. This one did not get
498							# made into an RT ticket, but was fixed by not calling the recovery
499							# code if $self->{_rslt} contained only the enclosing delimiters.
500							ARRAY_REF eq ref $self->{_rslt}
501							or confess 'Programming error - $self->{_rslt} not array ref, ',
502	6	50				39	"parsing '", $self->{tokenizer}->content(), "' at ",
503							$token->content();
504	6					16	@{ $self->{_rslt} }
505							or confess 'Programming error - $self->{_rslt} empty, ',
506	6	50				9	"parsing '", $self->{tokenizer}->content(), "' at ",
507							$token->content();
508	6					9	push @{ $self->{_rslt}[-1] }, @content;
	6					17
509
510							# Shove the mismatched delimiter back into the input so we can have
511							# another crack at it.
512	6	100				15	$token and $self->_unget_token( $token );
513
514							# We gone.
515	6					17	return;
516							}
517
518							sub _recover_curly_quantifiers {
519	12			12		26	my ( undef, $args ) = @_; # Invocant unused
520
521	12	100	100			34	if ( __instance( $args->[0], TOKEN_LITERAL )
			66
522							&& __instance( $args->[1], TOKEN_UNKNOWN )
523							&& PPIx::Regexp::Token::Quantifier->could_be_quantifier(
524							$args->[1]->content() )
525							) {
526	2					43	PPIx::Regexp::Token::Quantifier->
527							__PPIX_ELEM__rebless( $args->[1] );
528
529	2	50	33			5	if ( __instance( $args->[2], TOKEN_UNKNOWN )
530							&& PPIx::Regexp::Token::Greediness->could_be_greediness(
531							$args->[2]->content() )
532							) {
533	2					10	PPIx::Regexp::Token::Greediness
534							->__PPIX_ELEM__rebless( $args->[2] );
535							}
536
537							}
538
539	12					23	return;
540							}
541
542							sub _in_regex_set {
543	193			193		324	my ( $self ) = @_;
544	193					235	foreach my $stack_entry ( reverse @{ $self->{_rslt} } ) {
	193					370
545	302	100				664	$stack_entry->[0] eq '])'
546							and return 1;
547							}
548	189					341	return 0;
549							}
550
551							# Called as $self->$method( ... ) in _make_node(), above
552							sub _round { ## no critic (ProhibitUnusedPrivateSubroutines)
553	193			193		315	my ( $self, $args ) = @_;
554
555							# If we're inside a regex set, parens do not capture.
556							$self->_in_regex_set()
557	193	100				394	and return PPIx::Regexp::Structure->__new( @{ $args } );
	4					18
558
559							# If /n is asserted, parens do not capture.
560							$self->{tokenizer}->modifier( 'n' )
561	189	100				429	and return PPIx::Regexp::Structure->__new( @{ $args } );
	7					30
562
563							# The instantiator will rebless based on the first token if need be.
564	182					268	return PPIx::Regexp::Structure::Capture->__new( @{ $args } );
	182					845
565							}
566
567							# Called as $self->$method( ... ) in _make_node(), above
568							sub _square { ## no critic (ProhibitUnusedPrivateSubroutines)
569	36			36		70	my ( undef, $args ) = @_; # Invocant unused
570	36					55	return PPIx::Regexp::Structure::CharClass->__new( @{ $args } );
	36					209
571							}
572
573							# Called as $self->$method( ... ) in _make_node(), above
574							sub _regex_set { ## no critic (ProhibitUnusedPrivateSubroutines)
575	6			6		26	my ( undef, $args ) = @_; # Invocant unused
576	6					16	return PPIx::Regexp::Structure::RegexSet->__new( @{ $args } );
	6					57
577							}
578
579							# $self->_unget_token( $token );
580							#
581							# This method caches its argument so that it will be returned by
582							# the next call to C<_get_token()>. If more than one argument is
583							# passed, they will be returned in the order given; that is,
584							# _unget_token/_get_token work like unshift/shift.
585
586							sub _unget_token {
587	705			705		1202	my ( $self, @args ) = @_;
588	705					857	unshift @{ $self->{deferred} }, @args;
	705					1145
589	705					1177	return $self;
590							}
591
592							1;
593
594							__END__