File Coverage

blib/lib/PPIx/Regexp/Token/Interpolation.pm

Criterion	Covered	Total	%
statement	105	111	94.5
branch	62	90	68.8
condition	13	24	54.1
subroutine	17	18	94.4
pod	2	2	100.0
total	199	245	81.2

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Interpolation - Represent an interpolation in the PPIx::Regexp package.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new('qr{$foo}smx')->print();
9
10							=head1 INHERITANCE
11
12							C<PPIx::Regexp::Token::Interpolation> is a
13							L<PPIx::Regexp::Token::Code\|PPIx::Regexp::Token::Code>.
14
15							C<PPIx::Regexp::Token::Interpolation> has no descendants.
16
17							=head1 DESCRIPTION
18
19							This class represents a variable interpolation into a regular
20							expression. In the L</SYNOPSIS> the C<$foo> would be represented by an
21							object of this class.
22
23							=head2 Incompatible Change: Parse of '@{[ ... ]}'
24
25							In versions 0.083 and earlier, C<'@{[ ... ]}'> parsed as a member of
26							this class. In 0.084 and later, it parses as a member of
27							C<PPIx::Regexp::Token::Code>. This more accurately reflects the actual
28							contents of the token, and enables the recognition of the full range of
29							postfix dereference operators for versioning purposes, not just those
30							valid in interpolations.
31
32							=head1 METHODS
33
34							This class provides the following public methods beyond those provided
35							by its superclass.
36
37							=cut
38
39							package PPIx::Regexp::Token::Interpolation;
40
41	9			9		43	use strict;
	9					11
	9					252
42	9			9		26	use warnings;
	9					13
	9					332
43
44	9			9		31	use base qw{ PPIx::Regexp::Token::Code };
	9					20
	9					700
45
46	9			9		36	use Carp qw{ confess };
	9					12
	9					456
47	9					954	use PPIx::Regexp::Constant qw{
48							COOKIE_CLASS
49							COOKIE_REGEX_SET
50							MINIMUM_PERL
51							TOKEN_LITERAL
52							@CARP_NOT
53	9			9		36	};
	9					10
54
55							our $VERSION = '0.091';
56
57	9			9		40	use constant VERSION_WHEN_IN_REGEX_SET => '5.017009';
	9					19
	9					9725
58
59							sub __new {
60	105			105		8625	my ( $class, $content, %arg ) = @_;
61
62							defined $arg{perl_version_introduced}
63	105	50				389	or $arg{perl_version_introduced} = MINIMUM_PERL;
64
65	105					470	my $self = $class->SUPER::__new( $content, %arg );
66
67	105					351	return $self;
68							}
69
70							=head2 is_matcher
71
72							This method returns C<undef> because a static analysis can not in
73							general tell whether a piece of code matches anything.
74
75							=cut
76
77	0			0	1	0	sub is_matcher { return undef; } ## no critic (ProhibitExplicitReturnUndef)
78
79							# Return true if the token can be quantified, and false otherwise
80							# This can be quantified because it might interpolate a quantifiable
81							# token. Of course, it might not, but we need to be permissive here.
82							# sub can_be_quantified { return };
83
84							# We overrode this in PPIx::Regexp::Token::Code, since (?{...}) did not
85							# appear until Perl 5.5. But interpolation has been there since the
86							# beginning, so we have to override again. This turns out to be OK,
87							# though, because while Regex Sets were introduced in 5.17.8,
88							# interpolation inside them was not introduced until 5.17.9.
89							sub perl_version_introduced {
90	17			17	1	1373	my ( $self ) = @_;
91	17					38	return $self->{perl_version_introduced};
92							}
93
94							# Normalize the content of an interpolation object before making it into
95							# a PPI document. The issue here is that things like ${x} are at least
96							# warnings outside strings, but are normal inside them.
97							sub __ppi_normalize_content {
98	115			115		178	my ( $self ) = @_;
99	115					143	my $content;
100	115	50				222	defined( $content = $self->content() )
101							or return $content;
102							# NOTE: perldata gives a regexp for this, but it requires Perl 5.10.
103							# I believe the following caputures the intent, except possibly for
104							# various weird combinations of '::' and "'".
105	115					413	$content =~
106							s/ \A
107							( \$ \# \$* \| [\@\$] \$* ) # Sigil and possible casts
108							[{] \s* (?: :: )* '? # per perldata
109							( ^? (?: \w+ (?: (?: :: \| ' ) \w+ )* (?: :: )? \| [[:punct:]] ) )
110							\s* [}] \z
111							/$1$2/smx;
112	115					329	return $content;
113							}
114
115							# Match the beginning of an interpolation.
116
117							my $interp_re =
118							qr{ \A (?= [\@\$]? \$ [-\w&`'+^./\\";%=~:?!\@\$<>\[\]\{\},#] \|
119							\@ [\w\{] )
120							}smx;
121
122							# Match bracketed interpolation
123
124							my $brkt_interp_re =
125							qr{ \A (?: [\@\$] \$* [#]? \$* [\{] (?: [][\-&`'+,^./\\";%=:?\@\$<>,#] \|
126							\^? \w+ (?: :: \w+ )* ) [\}] \|
127							\@ [\{] \w+ (?: :: \w+ )* [\}] )
128							}smx;
129
130							# We pull out the logic of finding and dealing with the interpolation
131							# into a separate subroutine because if we fail to find an interpolation
132							# we want to do something with the sigils.
133
134							my %allow_subscript_based_on_cast_symbol = (
135							q<$#> => 0,
136							q<$> => 1,
137							q<@> => 1,
138							);
139
140							sub _interpolation {
141	110			110		232	my ( $class, $tokenizer, undef, $in_regexp ) = @_; # $character unused
142
143							# If the regexp does not interpolate, bail now.
144	110	100				260	$tokenizer->interpolates() or return;
145
146							# If we're a bracketed interpolation, just accept it
147	108	100				284	if ( my $len = $tokenizer->find_regexp( $brkt_interp_re ) ) {
148	8					34	return $len;
149							}
150
151							# Make sure we start off plausibly
152	100	100				198	defined $tokenizer->find_regexp( $interp_re )
153							or return;
154
155							# See if PPI can figure out what we have
156	83	50				337	my $doc = $tokenizer->ppi_document()
157							or return;
158
159							# Get the first statement to work on.
160	83	50				124086	my $stmt = $doc->find_first( 'PPI::Statement' )
161							or return;
162
163	83					14402	my @accum; # The elements of the interpolation
164							my $allow_subscript; # Assume no subscripts allowed
165	83					144	my $want_class = __PACKAGE__; # Assume we want an interpolation.
166
167							# Find the beginning of the interpolation
168	83	50				226	my $next = $stmt->schild( 0 ) or return;
169
170							# The interpolation should start with
171	83	100				1175	if ( $next->isa( 'PPI::Token::Symbol' ) ) {
		100
		50
172
173							# A symbol
174	73					135	push @accum, $next;
175	73					129	$allow_subscript = 1; # Subscripts are allowed
176
177							} elsif ( $next->isa( 'PPI::Token::Cast' ) ) {
178
179							# Or a cast followed by a block
180	8					18	push @accum, $next;
181	8	50				33	$next = $next->next_sibling() or return;
182	8	100				173	if ( $next->isa( 'PPI::Token::Symbol' ) ) {
		50
183							defined (
184							$allow_subscript =
185							$allow_subscript_based_on_cast_symbol{
186	6	50				18	$accum[-1]->content()
187							}
188							) or return;
189	6					33	push @accum, $next;
190							} elsif ( $next->isa( 'PPI::Structure::Block' ) ) {
191							# We want @{[ ... ]} to parse as a PPIx::Regexp::Token::Code.
192							# PPI parses this as a cast followed by a block. The block
193							# contains a single statement, which contains a single
194							# constructor. So:
195	2					29	my @kids = $next->schildren();
196	2	50	33			43	if ( @kids == 1 && $kids[0]->isa( 'PPI::Statement' ) ) {
197	2					7	@kids = $kids[0]->schildren();
198	2	50	33			21	if ( @kids == 1 &&
			33
199							$kids[0]->isa( 'PPI::Structure::Constructor' ) &&
200							$kids[0]->start() eq '[' ) {
201	2					36	$want_class = 'PPIx::Regexp::Token::Code';
202							}
203							}
204	2					6	push @accum, $next;
205							} else {
206	0					0	return;
207							}
208
209							} elsif ( $next->isa( 'PPI::Token::ArrayIndex' ) ) {
210
211							# Or an array index
212	2					4	push @accum, $next;
213
214							} else {
215
216							# None others need apply.
217	0					0	return;
218
219							}
220
221							# The interpolation _may_ be subscripted. If so ...
222							{
223
224							# Only accept a subscript if wanted and available
225	83	100	100			106	$allow_subscript and $next = $next->snext_sibling() or last;
	106					429
226
227							# Accept an optional dereference operator.
228	63					1503	my @subscr;
229	63	100				232	if ( $next->isa( 'PPI::Token::Operator' ) ) {
230	18	100				53	$next->content() eq '->' or last;
231	4					18	push @subscr, $next;
232	4	50				13	$next = $next->next_sibling() or last;
233
234							# postderef was introduced in 5.19.5, per perl5195delta.
235	4	50				89	if ( my $deref = $tokenizer->__recognize_postderef(
236							__PACKAGE__, $next ) ) {
237	4					37	push @accum, @subscr, $deref;
238	4					7	last;
239							}
240							}
241
242							# Accept only a subscript
243	45	100				154	$next->isa( 'PPI::Structure::Subscript' ) or last;
244
245							# The subscript must have a closing delimiter.
246	25	50				67	$next->finish() or last;
247
248							# If we are in a regular expression rather than a replacement
249							# string, screen the subscript for content, since [] could be a
250							# character class, and {} could be a quantifier. The perlop docs
251							# say that Perl applies undocumented heuristics subject to
252							# change without notice to figure this out. So we do our poor
253							# best to be heuristical and undocumented.
254	25	100	100			191	not $in_regexp or $class->_subscript( $next ) or last;
255
256							# If we got this far, accept the subscript and try for another
257							# one.
258	23					55	push @accum, @subscr, $next;
259	23					31	redo;
260							}
261
262							# Compute the length of all the PPI elements accumulated, and return
263							# it.
264	83					1046	my $length = 0;
265	83					158	foreach ( @accum ) {
266	122	100				520	$length += ref $_ ? length $_->content() : $_;
267							}
268	83					1014	return ( $length, $want_class );
269							}
270
271							{
272	9			9		64	no warnings qw{ qw }; ## no critic (ProhibitNoWarnings)
	9					13
	9					4289
273
274							my %accept = map { $_ => 1 } qw{ $ $# @ };
275
276							sub __postderef_accept_cast {
277	108			108		205	return \%accept;
278							}
279							}
280
281							{
282
283							my %allowed = (
284							'[' => '_square',
285							'{' => '_curly',
286							);
287
288							sub _subscript {
289	21			21		36	my ( $class, $struct ) = @_;
290
291							# We expect to have a left delimiter, which is either a '[' or a
292							# '{'.
293	21	50				40	my $left = $struct->start() or return;
294	21					93	my $lc = $left->content();
295	21	50				123	my $handler = $allowed{$lc} or return;
296
297							# We expect a single child, which is a PPI::Statement
298	21	50				58	( my @kids = $struct->schildren() ) == 1 or return;
299	21	50				221	$kids[0]->isa( 'PPI::Statement' ) or return;
300
301							# We expect the statement to have at least one child.
302	21	50				44	( @kids = $kids[0]->schildren() ) or return;
303
304	21					165	return $class->$handler( @kids );
305
306							}
307
308							}
309
310							# Return true if we think a curly-bracketed subscript is really a
311							# subscript, rather than a quantifier.
312							# Called as $class->$handler( ... ) above
313							sub _curly { ## no critic (ProhibitUnusedPrivateSubroutines)
314	17			17		35	my ( undef, @kids ) = @_; # Invocant unused
315
316							# If the first child is a word, and either it is an only child or
317							# the next child is the fat comma operator, we accept it as a
318							# subscript.
319	17	100				53	if ( $kids[0]->isa( 'PPI::Token::Word' ) ) {
320	14	100				49	@kids == 1 and return 1;
321	4	50	33			14	$kids[1]->isa( 'PPI::Token::Operator' )
322							and $kids[1]->content() eq '=>'
323							and return 1;
324							}
325
326							# If the first child is a symbol,
327	3	100	66			17	if ( @kids && $kids[0]->isa( 'PPI::Token::Symbol' ) ) {
328							# Accept it if it is the only child
329	1	50				3	@kids == 1
330							and return 1;
331							# Accept it if there are exactly two children and the second is
332							# a subscript.
333	1	50	33			15	@kids == 2
334							and $kids[1]->isa( 'PPI::Structure::Subscript' )
335							and return 1;
336							}
337
338							# We reject anything else.
339	2					9	return;
340							}
341
342							# Return true if we think a square-bracketed subscript is really a
343							# subscript, rather than a character class.
344							# Called as $class->$handler( ... ) above
345							sub _square { ## no critic (ProhibitUnusedPrivateSubroutines)
346	4			4		11	my ( undef, @kids ) = @_; # Invocant unused
347
348							# We expect to have either a number or a symbol as the first
349							# element.
350	4	50				25	$kids[0]->isa( 'PPI::Token::Number' ) and return 1;
351	0	0				0	$kids[0]->isa( 'PPI::Token::Symbol' ) and return 1;
352
353							# Anything else is rejected.
354	0					0	return;
355							}
356
357							# Alternate classes for the sigils, depending on whether we are in a
358							# character class (index 1) or not (index 0).
359							my %sigil_alternate = (
360							'$' => [ 'PPIx::Regexp::Token::Assertion', TOKEN_LITERAL ],
361							'@' => [ TOKEN_LITERAL, TOKEN_LITERAL ],
362							);
363
364							sub __PPIX_TOKENIZER__regexp {
365	1434			1434		2416	my ( $class, $tokenizer, $character ) = @_;
366
367	1434	100				3155	exists $sigil_alternate{$character} or return;
368
369	105	100				349	if ( my ( $accept, $want_class ) =
370							$class->_interpolation( $tokenizer, $character, 1 )
371							) {
372	86					6089	return $tokenizer->make_token( $accept, $want_class );
373							}
374
375	19	50				59	my $alternate = $sigil_alternate{$character} or return;
376	19	50				52	return $tokenizer->make_token(
377							1, $alternate->[$tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0 ] );
378
379							}
380
381							sub __PPIX_TOKENIZER__repl {
382	18			18		54	my ( $class, $tokenizer, $character ) = @_;
383
384	18	100				46	exists $sigil_alternate{$character} or return;
385
386	5	50				16	if ( my ( $accept, $want_class ) =
387							$class->_interpolation( $tokenizer, $character, 0 ) ) {
388	5					314	return $tokenizer->make_token( $accept, $want_class );
389							}
390
391	0						return $tokenizer->make_token( 1, TOKEN_LITERAL );
392
393							}
394
395							1;
396
397							__END__
398
399							=begin comment
400
401							Interpolation notes:
402
403							$ perl -E '$foo = "\\w"; $bar = 3; say qr{$foo{$bar}}'
404							(?-xism:)
405							white2:~/Code/perl/PPIx-Regexp.new tom 22:50:33
406							$ perl -E '$foo = "\\w"; $bar = 3; say qr{foo{$bar}}'
407							(?-xism:foo{3})
408							white2:~/Code/perl/PPIx-Regexp.new tom 22:50:59
409							$ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{$foo{$bar}}'
410							(?-xism:)
411							white2:~/Code/perl/PPIx-Regexp.new tom 22:51:38
412							$ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{$foo}'
413							(?-xism:\w)
414							white2:~/Code/perl/PPIx-Regexp.new tom 22:51:50
415							$ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{$foo{baz}}'
416							(?-xism:)
417							white2:~/Code/perl/PPIx-Regexp.new tom 22:52:49
418							$ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{${foo}{baz}}'
419							(?-xism:\w{baz})
420							white2:~/Code/perl/PPIx-Regexp.new tom 22:54:07
421							$ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{${foo}{$bar}}'
422							(?-xism:\w{3})
423
424							The above makes me think that Perl is extremely reluctant to understand
425							an interpolation followed by curlys as a hash dereference. In fact, only
426							when the interpolation was what PPI calls a block was it understood at
427							all.
428
429							$ perl -E '$foo = { bar => 42 }; say qr{$foo->{bar}};'
430							(?-xism:42)
431							$ perl -E '$foo = { bar => 42 }; say qr{$foo->{baz}};'
432							(?-xism:)
433
434							On the other hand, Perl seems to be less reluctant to accept an explicit
435							dereference as a hash dereference.
436
437							$ perl -E '$foo = "\\w"; $bar = 3; @foo = (42); say qr{$foo}'
438							(?-xism:\w)
439							white2:~/Code/perl/PPIx-Regexp.new tom 22:58:20
440							$ perl -E '$foo = "\\w"; $bar = 3; @foo = (42); say qr{$foo[0]}'
441							(?-xism:42)
442							white2:~/Code/perl/PPIx-Regexp.new tom 22:58:28
443							$ perl -E '$foo = "\\w"; $bar = 3; @foo = (42); say qr{$foo[$bar]}'
444							(?-xism:)
445							white2:~/Code/perl/PPIx-Regexp.new tom 22:58:43
446							$ perl -E '$foo = "\\w"; $bar = 0; @foo = (42); say qr{$foo[$bar]}'
447							(?-xism:42)
448
449							The above makes it somewhat easier to get $foo[$bar] interpreted as an
450							array dereference, but it appears to make use of information that is not
451							available to a static analysis, such as whether $foo[$bar] exists.
452
453							Actually, the above suggests a strategy: a subscript of any kind is to
454							be accepted as a subscript if it looks like \[\d+\], \[\$foo\], \{\w+\},
455							or \{\$foo\}. Otherwise, accept it as a character class or a quantifier
456							depending on the delimiter. Obviously when I bring PPI to bear I will
457							have to keep track of '->' operators before subscripts, and shed them
458							from the interpolation as well if the purported subscript does not pass
459							muster.
460
461							=end comment
462
463							=head1 SUPPORT
464
465							Support is by the author. Please file bug reports at
466							L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
467							L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
468							electronic mail to the author.
469
470							=head1 AUTHOR
471
472							Thomas R. Wyant, III F<wyant at cpan dot org>
473
474							=head1 COPYRIGHT AND LICENSE
475
476							Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
477
478							This program is free software; you can redistribute it and/or modify it
479							under the same terms as Perl 5.10.0. For more details, see the full text
480							of the licenses in the directory LICENSES.
481
482							This program is distributed in the hope that it will be useful, but
483							without any warranty; without even the implied warranty of
484							merchantability or fitness for a particular purpose.
485
486							=cut
487
488							# ex: set textwidth=72 :