File Coverage

blib/lib/PPIx/Regexp/Token/Interpolation.pm
Criterion Covered Total %
statement 105 111 94.5
branch 62 90 68.8
condition 13 24 54.1
subroutine 17 18 94.4
pod 2 2 100.0
total 199 245 81.2


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Interpolation - Represent an interpolation in the PPIx::Regexp package.
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new('qr{$foo}smx')->print();
9              
10             =head1 INHERITANCE
11              
12             C<PPIx::Regexp::Token::Interpolation> is a
13             L<PPIx::Regexp::Token::Code|PPIx::Regexp::Token::Code>.
14              
15             C<PPIx::Regexp::Token::Interpolation> has no descendants.
16              
17             =head1 DESCRIPTION
18              
19             This class represents a variable interpolation into a regular
20             expression. In the L</SYNOPSIS> the C<$foo> would be represented by an
21             object of this class.
22              
23             =head2 Incompatible Change: Parse of '@{[ ... ]}'
24              
25             In versions 0.083 and earlier, C<'@{[ ... ]}'> parsed as a member of
26             this class. In 0.084 and later, it parses as a member of
27             C<PPIx::Regexp::Token::Code>. This more accurately reflects the actual
28             contents of the token, and enables the recognition of the full range of
29             postfix dereference operators for versioning purposes, not just those
30             valid in interpolations.
31              
32             =head1 METHODS
33              
34             This class provides the following public methods beyond those provided
35             by its superclass.
36              
37             =cut
38              
39             package PPIx::Regexp::Token::Interpolation;
40              
41 9     9   43 use strict;
  9         11  
  9         252  
42 9     9   26 use warnings;
  9         13  
  9         332  
43              
44 9     9   31 use base qw{ PPIx::Regexp::Token::Code };
  9         20  
  9         700  
45              
46 9     9   36 use Carp qw{ confess };
  9         12  
  9         456  
47 9         954 use PPIx::Regexp::Constant qw{
48             COOKIE_CLASS
49             COOKIE_REGEX_SET
50             MINIMUM_PERL
51             TOKEN_LITERAL
52             @CARP_NOT
53 9     9   36 };
  9         10  
54              
55             our $VERSION = '0.091';
56              
57 9     9   40 use constant VERSION_WHEN_IN_REGEX_SET => '5.017009';
  9         19  
  9         9725  
58              
59             sub __new {
60 105     105   8625 my ( $class, $content, %arg ) = @_;
61              
62             defined $arg{perl_version_introduced}
63 105 50       389 or $arg{perl_version_introduced} = MINIMUM_PERL;
64              
65 105         470 my $self = $class->SUPER::__new( $content, %arg );
66              
67 105         351 return $self;
68             }
69              
70             =head2 is_matcher
71              
72             This method returns C<undef> because a static analysis can not in
73             general tell whether a piece of code matches anything.
74              
75             =cut
76              
77 0     0 1 0 sub is_matcher { return undef; } ## no critic (ProhibitExplicitReturnUndef)
78              
79             # Return true if the token can be quantified, and false otherwise
80             # This can be quantified because it might interpolate a quantifiable
81             # token. Of course, it might not, but we need to be permissive here.
82             # sub can_be_quantified { return };
83              
84             # We overrode this in PPIx::Regexp::Token::Code, since (?{...}) did not
85             # appear until Perl 5.5. But interpolation has been there since the
86             # beginning, so we have to override again. This turns out to be OK,
87             # though, because while Regex Sets were introduced in 5.17.8,
88             # interpolation inside them was not introduced until 5.17.9.
89             sub perl_version_introduced {
90 17     17 1 1373 my ( $self ) = @_;
91 17         38 return $self->{perl_version_introduced};
92             }
93              
94             # Normalize the content of an interpolation object before making it into
95             # a PPI document. The issue here is that things like ${x} are at least
96             # warnings outside strings, but are normal inside them.
97             sub __ppi_normalize_content {
98 115     115   178 my ( $self ) = @_;
99 115         143 my $content;
100 115 50       222 defined( $content = $self->content() )
101             or return $content;
102             # NOTE: perldata gives a regexp for this, but it requires Perl 5.10.
103             # I believe the following caputures the intent, except possibly for
104             # various weird combinations of '::' and "'".
105 115         413 $content =~
106             s/ \A
107             ( \$ \# \$* | [\@\$] \$* ) # Sigil and possible casts
108             [{] \s* (?: :: )* '? # per perldata
109             ( ^? (?: \w+ (?: (?: :: | ' ) \w+ )* (?: :: )? | [[:punct:]] ) )
110             \s* [}] \z
111             /$1$2/smx;
112 115         329 return $content;
113             }
114              
115             # Match the beginning of an interpolation.
116              
117             my $interp_re =
118             qr{ \A (?= [\@\$]? \$ [-\w&`'+^./\\";%=~:?!\@\$<>\[\]\{\},#] |
119             \@ [\w\{] )
120             }smx;
121              
122             # Match bracketed interpolation
123              
124             my $brkt_interp_re =
125             qr{ \A (?: [\@\$] \$* [#]? \$* [\{] (?: [][\-&`'+,^./\\";%=:?\@\$<>,#] |
126             \^? \w+ (?: :: \w+ )* ) [\}] |
127             \@ [\{] \w+ (?: :: \w+ )* [\}] )
128             }smx;
129              
130             # We pull out the logic of finding and dealing with the interpolation
131             # into a separate subroutine because if we fail to find an interpolation
132             # we want to do something with the sigils.
133              
134             my %allow_subscript_based_on_cast_symbol = (
135             q<$#> => 0,
136             q<$> => 1,
137             q<@> => 1,
138             );
139              
140             sub _interpolation {
141 110     110   232 my ( $class, $tokenizer, undef, $in_regexp ) = @_; # $character unused
142              
143             # If the regexp does not interpolate, bail now.
144 110 100       260 $tokenizer->interpolates() or return;
145              
146             # If we're a bracketed interpolation, just accept it
147 108 100       284 if ( my $len = $tokenizer->find_regexp( $brkt_interp_re ) ) {
148 8         34 return $len;
149             }
150              
151             # Make sure we start off plausibly
152 100 100       198 defined $tokenizer->find_regexp( $interp_re )
153             or return;
154              
155             # See if PPI can figure out what we have
156 83 50       337 my $doc = $tokenizer->ppi_document()
157             or return;
158              
159             # Get the first statement to work on.
160 83 50       124086 my $stmt = $doc->find_first( 'PPI::Statement' )
161             or return;
162              
163 83         14402 my @accum; # The elements of the interpolation
164             my $allow_subscript; # Assume no subscripts allowed
165 83         144 my $want_class = __PACKAGE__; # Assume we want an interpolation.
166              
167             # Find the beginning of the interpolation
168 83 50       226 my $next = $stmt->schild( 0 ) or return;
169              
170             # The interpolation should start with
171 83 100       1175 if ( $next->isa( 'PPI::Token::Symbol' ) ) {
    100          
    50          
172              
173             # A symbol
174 73         135 push @accum, $next;
175 73         129 $allow_subscript = 1; # Subscripts are allowed
176              
177             } elsif ( $next->isa( 'PPI::Token::Cast' ) ) {
178              
179             # Or a cast followed by a block
180 8         18 push @accum, $next;
181 8 50       33 $next = $next->next_sibling() or return;
182 8 100       173 if ( $next->isa( 'PPI::Token::Symbol' ) ) {
    50          
183             defined (
184             $allow_subscript =
185             $allow_subscript_based_on_cast_symbol{
186 6 50       18 $accum[-1]->content()
187             }
188             ) or return;
189 6         33 push @accum, $next;
190             } elsif ( $next->isa( 'PPI::Structure::Block' ) ) {
191             # We want @{[ ... ]} to parse as a PPIx::Regexp::Token::Code.
192             # PPI parses this as a cast followed by a block. The block
193             # contains a single statement, which contains a single
194             # constructor. So:
195 2         29 my @kids = $next->schildren();
196 2 50 33     43 if ( @kids == 1 && $kids[0]->isa( 'PPI::Statement' ) ) {
197 2         7 @kids = $kids[0]->schildren();
198 2 50 33     21 if ( @kids == 1 &&
      33        
199             $kids[0]->isa( 'PPI::Structure::Constructor' ) &&
200             $kids[0]->start() eq '[' ) {
201 2         36 $want_class = 'PPIx::Regexp::Token::Code';
202             }
203             }
204 2         6 push @accum, $next;
205             } else {
206 0         0 return;
207             }
208              
209             } elsif ( $next->isa( 'PPI::Token::ArrayIndex' ) ) {
210              
211             # Or an array index
212 2         4 push @accum, $next;
213              
214             } else {
215              
216             # None others need apply.
217 0         0 return;
218              
219             }
220              
221             # The interpolation _may_ be subscripted. If so ...
222             {
223              
224             # Only accept a subscript if wanted and available
225 83 100 100     106 $allow_subscript and $next = $next->snext_sibling() or last;
  106         429  
226              
227             # Accept an optional dereference operator.
228 63         1503 my @subscr;
229 63 100       232 if ( $next->isa( 'PPI::Token::Operator' ) ) {
230 18 100       53 $next->content() eq '->' or last;
231 4         18 push @subscr, $next;
232 4 50       13 $next = $next->next_sibling() or last;
233              
234             # postderef was introduced in 5.19.5, per perl5195delta.
235 4 50       89 if ( my $deref = $tokenizer->__recognize_postderef(
236             __PACKAGE__, $next ) ) {
237 4         37 push @accum, @subscr, $deref;
238 4         7 last;
239             }
240             }
241              
242             # Accept only a subscript
243 45 100       154 $next->isa( 'PPI::Structure::Subscript' ) or last;
244              
245             # The subscript must have a closing delimiter.
246 25 50       67 $next->finish() or last;
247              
248             # If we are in a regular expression rather than a replacement
249             # string, screen the subscript for content, since [] could be a
250             # character class, and {} could be a quantifier. The perlop docs
251             # say that Perl applies undocumented heuristics subject to
252             # change without notice to figure this out. So we do our poor
253             # best to be heuristical and undocumented.
254 25 100 100     191 not $in_regexp or $class->_subscript( $next ) or last;
255              
256             # If we got this far, accept the subscript and try for another
257             # one.
258 23         55 push @accum, @subscr, $next;
259 23         31 redo;
260             }
261              
262             # Compute the length of all the PPI elements accumulated, and return
263             # it.
264 83         1046 my $length = 0;
265 83         158 foreach ( @accum ) {
266 122 100       520 $length += ref $_ ? length $_->content() : $_;
267             }
268 83         1014 return ( $length, $want_class );
269             }
270              
271             {
272 9     9   64 no warnings qw{ qw }; ## no critic (ProhibitNoWarnings)
  9         13  
  9         4289  
273              
274             my %accept = map { $_ => 1 } qw{ $ $# @ };
275              
276             sub __postderef_accept_cast {
277 108     108   205 return \%accept;
278             }
279             }
280              
281             {
282              
283             my %allowed = (
284             '[' => '_square',
285             '{' => '_curly',
286             );
287              
288             sub _subscript {
289 21     21   36 my ( $class, $struct ) = @_;
290              
291             # We expect to have a left delimiter, which is either a '[' or a
292             # '{'.
293 21 50       40 my $left = $struct->start() or return;
294 21         93 my $lc = $left->content();
295 21 50       123 my $handler = $allowed{$lc} or return;
296              
297             # We expect a single child, which is a PPI::Statement
298 21 50       58 ( my @kids = $struct->schildren() ) == 1 or return;
299 21 50       221 $kids[0]->isa( 'PPI::Statement' ) or return;
300              
301             # We expect the statement to have at least one child.
302 21 50       44 ( @kids = $kids[0]->schildren() ) or return;
303              
304 21         165 return $class->$handler( @kids );
305              
306             }
307              
308             }
309              
310             # Return true if we think a curly-bracketed subscript is really a
311             # subscript, rather than a quantifier.
312             # Called as $class->$handler( ... ) above
313             sub _curly { ## no critic (ProhibitUnusedPrivateSubroutines)
314 17     17   35 my ( undef, @kids ) = @_; # Invocant unused
315              
316             # If the first child is a word, and either it is an only child or
317             # the next child is the fat comma operator, we accept it as a
318             # subscript.
319 17 100       53 if ( $kids[0]->isa( 'PPI::Token::Word' ) ) {
320 14 100       49 @kids == 1 and return 1;
321 4 50 33     14 $kids[1]->isa( 'PPI::Token::Operator' )
322             and $kids[1]->content() eq '=>'
323             and return 1;
324             }
325              
326             # If the first child is a symbol,
327 3 100 66     17 if ( @kids && $kids[0]->isa( 'PPI::Token::Symbol' ) ) {
328             # Accept it if it is the only child
329 1 50       3 @kids == 1
330             and return 1;
331             # Accept it if there are exactly two children and the second is
332             # a subscript.
333 1 50 33     15 @kids == 2
334             and $kids[1]->isa( 'PPI::Structure::Subscript' )
335             and return 1;
336             }
337              
338             # We reject anything else.
339 2         9 return;
340             }
341              
342             # Return true if we think a square-bracketed subscript is really a
343             # subscript, rather than a character class.
344             # Called as $class->$handler( ... ) above
345             sub _square { ## no critic (ProhibitUnusedPrivateSubroutines)
346 4     4   11 my ( undef, @kids ) = @_; # Invocant unused
347              
348             # We expect to have either a number or a symbol as the first
349             # element.
350 4 50       25 $kids[0]->isa( 'PPI::Token::Number' ) and return 1;
351 0 0       0 $kids[0]->isa( 'PPI::Token::Symbol' ) and return 1;
352              
353             # Anything else is rejected.
354 0         0 return;
355             }
356              
357             # Alternate classes for the sigils, depending on whether we are in a
358             # character class (index 1) or not (index 0).
359             my %sigil_alternate = (
360             '$' => [ 'PPIx::Regexp::Token::Assertion', TOKEN_LITERAL ],
361             '@' => [ TOKEN_LITERAL, TOKEN_LITERAL ],
362             );
363              
364             sub __PPIX_TOKENIZER__regexp {
365 1434     1434   2416 my ( $class, $tokenizer, $character ) = @_;
366              
367 1434 100       3155 exists $sigil_alternate{$character} or return;
368              
369 105 100       349 if ( my ( $accept, $want_class ) =
370             $class->_interpolation( $tokenizer, $character, 1 )
371             ) {
372 86         6089 return $tokenizer->make_token( $accept, $want_class );
373             }
374              
375 19 50       59 my $alternate = $sigil_alternate{$character} or return;
376 19 50       52 return $tokenizer->make_token(
377             1, $alternate->[$tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0 ] );
378              
379             }
380              
381             sub __PPIX_TOKENIZER__repl {
382 18     18   54 my ( $class, $tokenizer, $character ) = @_;
383              
384 18 100       46 exists $sigil_alternate{$character} or return;
385              
386 5 50       16 if ( my ( $accept, $want_class ) =
387             $class->_interpolation( $tokenizer, $character, 0 ) ) {
388 5         314 return $tokenizer->make_token( $accept, $want_class );
389             }
390              
391 0           return $tokenizer->make_token( 1, TOKEN_LITERAL );
392              
393             }
394              
395             1;
396              
397             __END__
398              
399             =begin comment
400              
401             Interpolation notes:
402              
403             $ perl -E '$foo = "\\w"; $bar = 3; say qr{$foo{$bar}}'
404             (?-xism:)
405             white2:~/Code/perl/PPIx-Regexp.new tom 22:50:33
406             $ perl -E '$foo = "\\w"; $bar = 3; say qr{foo{$bar}}'
407             (?-xism:foo{3})
408             white2:~/Code/perl/PPIx-Regexp.new tom 22:50:59
409             $ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{$foo{$bar}}'
410             (?-xism:)
411             white2:~/Code/perl/PPIx-Regexp.new tom 22:51:38
412             $ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{$foo}'
413             (?-xism:\w)
414             white2:~/Code/perl/PPIx-Regexp.new tom 22:51:50
415             $ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{$foo{baz}}'
416             (?-xism:)
417             white2:~/Code/perl/PPIx-Regexp.new tom 22:52:49
418             $ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{${foo}{baz}}'
419             (?-xism:\w{baz})
420             white2:~/Code/perl/PPIx-Regexp.new tom 22:54:07
421             $ perl -E '$foo = "\\w"; $bar = 3; %foo = {baz => 42}; say qr{${foo}{$bar}}'
422             (?-xism:\w{3})
423              
424             The above makes me think that Perl is extremely reluctant to understand
425             an interpolation followed by curlys as a hash dereference. In fact, only
426             when the interpolation was what PPI calls a block was it understood at
427             all.
428              
429             $ perl -E '$foo = { bar => 42 }; say qr{$foo->{bar}};'
430             (?-xism:42)
431             $ perl -E '$foo = { bar => 42 }; say qr{$foo->{baz}};'
432             (?-xism:)
433              
434             On the other hand, Perl seems to be less reluctant to accept an explicit
435             dereference as a hash dereference.
436              
437             $ perl -E '$foo = "\\w"; $bar = 3; @foo = (42); say qr{$foo}'
438             (?-xism:\w)
439             white2:~/Code/perl/PPIx-Regexp.new tom 22:58:20
440             $ perl -E '$foo = "\\w"; $bar = 3; @foo = (42); say qr{$foo[0]}'
441             (?-xism:42)
442             white2:~/Code/perl/PPIx-Regexp.new tom 22:58:28
443             $ perl -E '$foo = "\\w"; $bar = 3; @foo = (42); say qr{$foo[$bar]}'
444             (?-xism:)
445             white2:~/Code/perl/PPIx-Regexp.new tom 22:58:43
446             $ perl -E '$foo = "\\w"; $bar = 0; @foo = (42); say qr{$foo[$bar]}'
447             (?-xism:42)
448              
449             The above makes it somewhat easier to get $foo[$bar] interpreted as an
450             array dereference, but it appears to make use of information that is not
451             available to a static analysis, such as whether $foo[$bar] exists.
452              
453             Actually, the above suggests a strategy: a subscript of any kind is to
454             be accepted as a subscript if it looks like \[\d+\], \[\$foo\], \{\w+\},
455             or \{\$foo\}. Otherwise, accept it as a character class or a quantifier
456             depending on the delimiter. Obviously when I bring PPI to bear I will
457             have to keep track of '->' operators before subscripts, and shed them
458             from the interpolation as well if the purported subscript does not pass
459             muster.
460              
461             =end comment
462              
463             =head1 SUPPORT
464              
465             Support is by the author. Please file bug reports at
466             L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
467             L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
468             electronic mail to the author.
469              
470             =head1 AUTHOR
471              
472             Thomas R. Wyant, III F<wyant at cpan dot org>
473              
474             =head1 COPYRIGHT AND LICENSE
475              
476             Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
477              
478             This program is free software; you can redistribute it and/or modify it
479             under the same terms as Perl 5.10.0. For more details, see the full text
480             of the licenses in the directory LICENSES.
481              
482             This program is distributed in the hope that it will be useful, but
483             without any warranty; without even the implied warranty of
484             merchantability or fitness for a particular purpose.
485              
486             =cut
487              
488             # ex: set textwidth=72 :