File Coverage

blib/lib/PPIx/Regexp/Tokenizer.pm
Criterion Covered Total %
statement 497 525 94.6
branch 177 238 74.3
condition 44 64 68.7
subroutine 86 90 95.5
pod 29 29 100.0
total 833 946 88.0


line stmt bran cond sub pod time code
1             package PPIx::Regexp::Tokenizer;
2              
3 9     9   47 use strict;
  9         12  
  9         282  
4 9     9   30 use warnings;
  9         10  
  9         345  
5              
6 9     9   33 use base qw{ PPIx::Regexp::Support };
  9         12  
  9         2105  
7              
8 9     9   50 use Carp qw{ carp croak confess };
  9         12  
  9         485  
9 9     9   2079 use PPI::Document;
  9         789231  
  9         399  
10 9         1224 use PPIx::Regexp::Constant qw{
11             ARRAY_REF
12             CODE_REF
13             HASH_REF
14             LOCATION_LINE
15             LOCATION_CHARACTER
16             LOCATION_COLUMN
17             LOCATION_LOGICAL_LINE
18             MINIMUM_PERL
19             REGEXP_REF
20             TOKEN_LITERAL
21             TOKEN_UNKNOWN
22             @CARP_NOT
23 9     9   82 };
  9         12  
24 9     9   4160 use PPIx::Regexp::Token::Assertion ();
  9         27  
  9         207  
25 9     9   3849 use PPIx::Regexp::Token::Backreference ();
  9         35  
  9         265  
26 9     9   3636 use PPIx::Regexp::Token::Backtrack ();
  9         25  
  9         188  
27 9     9   3486 use PPIx::Regexp::Token::CharClass::POSIX ();
  9         23  
  9         175  
28 9     9   3564 use PPIx::Regexp::Token::CharClass::POSIX::Unknown ();
  9         26  
  9         186  
29 9     9   3827 use PPIx::Regexp::Token::CharClass::Simple ();
  9         26  
  9         185  
30 9     9   3537 use PPIx::Regexp::Token::Code ();
  9         26  
  9         165  
31 9     9   3766 use PPIx::Regexp::Token::Comment ();
  9         28  
  9         171  
32 9     9   3898 use PPIx::Regexp::Token::Condition ();
  9         25  
  9         260  
33 9     9   3489 use PPIx::Regexp::Token::Control ();
  9         24  
  9         204  
34 9     9   3403 use PPIx::Regexp::Token::Delimiter ();
  9         26  
  9         196  
35 9     9   3168 use PPIx::Regexp::Token::Greediness ();
  9         24  
  9         161  
36 9     9   3427 use PPIx::Regexp::Token::GroupType::Assertion ();
  9         27  
  9         225  
37 9     9   3569 use PPIx::Regexp::Token::GroupType::Atomic_Script_Run ();
  9         24  
  9         193  
38 9     9   3421 use PPIx::Regexp::Token::GroupType::BranchReset ();
  9         23  
  9         190  
39 9     9   3587 use PPIx::Regexp::Token::GroupType::Code ();
  9         22  
  9         191  
40 9     9   3626 use PPIx::Regexp::Token::GroupType::Modifier ();
  9         22  
  9         182  
41 9     9   3290 use PPIx::Regexp::Token::GroupType::NamedCapture ();
  9         25  
  9         167  
42 9     9   3169 use PPIx::Regexp::Token::GroupType::Script_Run ();
  9         21  
  9         182  
43 9     9   3493 use PPIx::Regexp::Token::GroupType::Subexpression ();
  9         20  
  9         179  
44 9     9   3507 use PPIx::Regexp::Token::GroupType::Switch ();
  9         25  
  9         270  
45 9     9   3817 use PPIx::Regexp::Token::Interpolation ();
  9         26  
  9         215  
46 9     9   4159 use PPIx::Regexp::Token::Literal ();
  9         25  
  9         238  
47 9     9   54 use PPIx::Regexp::Token::Modifier ();
  9         14  
  9         102  
48 9     9   3765 use PPIx::Regexp::Token::Operator ();
  9         20  
  9         176  
49 9     9   3524 use PPIx::Regexp::Token::Quantifier ();
  9         24  
  9         183  
50 9     9   44 use PPIx::Regexp::Token::Recursion ();
  9         13  
  9         94  
51 9     9   25 use PPIx::Regexp::Token::Structure ();
  9         11  
  9         85  
52 9     9   3550 use PPIx::Regexp::Token::Unknown ();
  9         25  
  9         153  
53 9     9   3390 use PPIx::Regexp::Token::Whitespace ();
  9         27  
  9         208  
54 9         438 use PPIx::Regexp::Util qw{
55             is_ppi_regexp_element
56             __instance
57 9     9   40 };
  9         15  
58              
59 9     9   38 use Scalar::Util qw{ looks_like_number };
  9         13  
  9         48528  
60              
61             our $VERSION = '0.091';
62              
63             our $DEFAULT_POSTDEREF;
64             defined $DEFAULT_POSTDEREF
65             or $DEFAULT_POSTDEREF = 1;
66              
67             {
68             # Names of classes containing tokenization machinery. There are few
69             # known ordering requirements, since each class recognizes its own,
70             # and I have tried to prevent overlap. Absent such constraints, the
71             # order is in perceived frequency of acceptance, to keep the search
72             # as short as possible. If I were conscientious I would gather
73             # statistics on this.
74             my @classes = ( # TODO make readonly when acceptable way appears
75             'PPIx::Regexp::Token::Literal',
76             'PPIx::Regexp::Token::Interpolation',
77             'PPIx::Regexp::Token::Control', # Note 1
78             'PPIx::Regexp::Token::CharClass::Simple', # Note 2
79             'PPIx::Regexp::Token::Quantifier',
80             'PPIx::Regexp::Token::Greediness',
81             'PPIx::Regexp::Token::CharClass::POSIX', # Note 3
82             'PPIx::Regexp::Token::Structure',
83             'PPIx::Regexp::Token::Assertion',
84             'PPIx::Regexp::Token::Backreference',
85             'PPIx::Regexp::Token::Operator', # Note 4
86             );
87              
88             # Note 1: If we are in quote mode ( \Q ... \E ), Control makes a
89             # literal out of anything it sees other than \E. So it
90             # needs to come before almost all other tokenizers. Not
91             # Literal, which already makes literals, and not
92             # Interpolation, which is legal in quote mode, but
93             # everything else.
94              
95             # Note 2: CharClass::Simple must come after Literal, because it
96             # relies on Literal to recognize a Unicode named character
97             # ( \N{something} ), so any \N that comes through to it
98             # must be the \N simple character class (which represents
99             # anything but a newline, and was introduced in Perl
100             # 5.11.0.
101              
102             # Note 3: CharClass::POSIX has to come before Structure, since both
103             # look for square brackets, and CharClass::POSIX is the
104             # more particular.
105              
106             # Note 4: Operator relies on Literal making the characters literal
107             # if they appear in a context where they can not be
108             # operators, and Control making them literals if quoting,
109             # so it must come after both.
110              
111             # Return the declared tokenizer classes.
112             sub __tokenizer_classes {
113 543     543   2217 return @classes;
114             }
115              
116             }
117              
118             {
119             my $errstr;
120              
121             sub new {
122 742     742 1 429994 my ( $class, $re, %args ) = @_;
123 742 50       2056 ref $class and $class = ref $class;
124              
125 742         1281 $errstr = undef;
126              
127             exists $args{default_modifiers}
128             and ARRAY_REF ne ref $args{default_modifiers}
129 742 50 66     2230 and do {
130 0         0 $errstr = 'default_modifiers must be an array reference';
131 0         0 return;
132             };
133              
134             my $self = {
135             index_locations => $args{index_locations}, # Index locations
136             capture => undef, # Captures from find_regexp.
137             content => undef, # The string we are tokenizing.
138             cookie => {}, # Cookies
139             cursor_curr => 0, # The current position in the string.
140             cursor_limit => undef, # The end of the portion of the
141             # string being tokenized.
142             cursor_orig => undef, # Position of cursor when tokenizer
143             # called. Used by get_token to prevent
144             # recursion.
145             cursor_modifiers => undef, # Position of modifiers.
146             default_modifiers => $args{default_modifiers} || [],
147             delimiter_finish => undef, # Finishing delimiter of regexp.
148             delimiter_start => undef, # Starting delimiter of regexp.
149             encoding => $args{encoding}, # Character encoding.
150             expect => undef, # Extra classes to expect.
151             expect_next => undef, # Extra classes as of next parse cycle
152             failures => 0, # Number of parse failures.
153             find => undef, # String for find_regexp
154             known => {}, # Known tokenizers, by mode.
155             location => $args{location},
156             match => undef, # Match from find_regexp.
157             mode => 'init', # Initialize
158             modifiers => [{}], # Modifier hash.
159             pending => [], # Tokens made but not returned.
160             prior => TOKEN_UNKNOWN, # Prior significant token.
161             source => $re, # The object we were initialized with.
162             strict => $args{strict}, # like "use re 'strict';".
163             trace => __PACKAGE__->__defined_or(
164 742   100     9082 $args{trace}, $ENV{PPIX_REGEXP_TOKENIZER_TRACE}, 0 ),
165             };
166              
167 742 100       2696 if ( __instance( $re, 'PPI::Element' ) ) {
    100          
168 11 50       34 is_ppi_regexp_element( $re )
169             or return __set_errstr( ref $re, 'not supported by', $class );
170             # TODO conditionalizstion on PPI class does not really
171             # belong here, but at the moment I have no other idea of
172             # where to put it.
173 11 50       82 $self->{content} = $re->isa( 'PPI::Token::HereDoc' ) ?
174             join( '', $re->content(), "\n", $re->heredoc(),
175             $re->terminator(), "\n" ) :
176             $re->content();
177             } elsif ( ref $re ) {
178 2         5 return __set_errstr( ref $re, 'not supported' );
179             } else {
180 729         1245 $self->{content} = $re;
181             }
182              
183 740         1549 bless $self, $class;
184              
185 740         2271 $self->{content} = $self->decode( $self->{content} );
186              
187 740         1556 $self->{cursor_limit} = length $self->{content};
188              
189             $self->{trace}
190 740 50       1758 and warn "\ntokenizing '$self->{content}'\n";
191              
192 740         2132 return $self;
193             }
194              
195             sub __set_errstr {
196 2     2   6 $errstr = join ' ', @_;
197 2         12 return;
198             }
199              
200             sub errstr {
201 2     2 1 5 return $errstr;
202             }
203              
204             }
205              
206             sub capture {
207 715     715 1 1181 my ( $self ) = @_;
208 715 100       1433 $self->{capture} or return;
209 694 50       1360 defined wantarray or return;
210 694 50       1200 return wantarray ? @{ $self->{capture} } : $self->{capture};
  694         2681  
211             }
212              
213             sub content {
214 1     1 1 2 my ( $self ) = @_;
215 1         4 return $self->{content};
216             }
217              
218             sub cookie {
219 10182     10182 1 13695 my ( $self, $name, @args ) = @_;
220 10182 50       13627 defined $name
221             or confess "Programming error - undefined cookie name";
222 10182 50       13755 if ( $self->{trace} ) {
223 0         0 local $" = ', ';
224 0         0 warn "cookie( '$name', @args )\n";
225             }
226 10182 100       25865 @args or return $self->{cookie}{$name};
227 721         1207 my $cookie = shift @args;
228 721 100       1764 if ( CODE_REF eq ref $cookie ) {
    50          
229 593         2246 return ( $self->{cookie}{$name} = $cookie );
230             } elsif ( defined $cookie ) {
231 0         0 confess "Programming error - cookie must be CODE ref or undef";
232             } else {
233 128         410 return delete $self->{cookie}{$name};
234             }
235             }
236              
237             # NOTE: Currently this is called only against
238             # COOKIE_LOOKAROUND_ASSERTION, once in PPIx::Token::GroupType::Assertion
239             # to prevent the cookie from being remade if it already exists, and once
240             # in PPIx::Regexp::Token::Assertion to determine if \K is inside a
241             # lookaround assertion. If it gets used other places, or if there is
242             # call for it, I should consider removing the underscores and
243             # documenting it as public.
244             sub __cookie_exists {
245 57     57   109 my ( $self, $name ) = @_;
246 57 50       121 defined $name
247             or confess "Programming error - undefined cookie name";
248 57         189 return $self->{cookie}{$name};
249             }
250              
251             sub default_modifiers {
252 0     0 1 0 my ( $self ) = @_;
253 0         0 return [ @{ $self->{default_modifiers} } ];
  0         0  
254             }
255              
256             sub __effective_modifiers {
257 334     334   597 my ( $self ) = @_;
258             HASH_REF eq ref $self->{effective_modifiers}
259 334 100       966 or return {};
260 326         508 return { %{ $self->{effective_modifiers} } };
  326         1101  
261             }
262              
263             sub encoding {
264 0     0 1 0 my ( $self ) = @_;
265 0         0 return $self->{encoding};
266             }
267              
268             sub expect {
269 330     330 1 1024 my ( $self, @args ) = @_;
270              
271             @args
272 330 50       624 or return;
273              
274             $self->{expect_next} = [
275 330 50       603 map { m/ \A PPIx::Regexp:: /smx ? $_ : 'PPIx::Regexp::' . $_ }
  2602         5136  
276             @args
277             ];
278 330         680 $self->{expect} = undef;
279 330         618 return;
280             }
281              
282             sub failures {
283 8     8 1 16 my ( $self ) = @_;
284 8         16 return $self->{failures};
285             }
286              
287             sub find_matching_delimiter {
288 589     589 1 937 my ( $self ) = @_;
289 589   100     1832 $self->{cursor_curr} ||= 0;
290             my $start = substr
291             $self->{content},
292             $self->{cursor_curr},
293 589         1276 1;
294              
295 589         858 my $inx = $self->{cursor_curr};
296 589   66     1623 my $finish = (
297             my $bracketed = $self->close_bracket( $start ) ) || $start;
298              
299             =begin comment
300              
301             $self->{trace}
302             and warn "Find matching delimiter: Start with '$start' at $self->{cursor_curr}, end with '$finish' at or before $self->{cursor_limit}\n";
303              
304             =end comment
305              
306             =cut
307              
308 589         900 my $nest = 0;
309              
310 589         1599 while ( ++$inx < $self->{cursor_limit} ) {
311 6144         7257 my $char = substr $self->{content}, $inx, 1;
312              
313             =begin comment
314              
315             $self->{trace}
316             and warn " looking at '$char' at $inx, nest level $nest\n";
317              
318             =end comment
319              
320             =cut
321              
322 6144 100 100     16155 if ( $char eq '\\' && $finish ne '\\' ) {
    100 100        
    100          
323 317         522 ++$inx;
324             } elsif ( $bracketed && $char eq $start ) {
325 1         2 ++$nest;
326             } elsif ( $char eq $finish ) {
327             --$nest < 0
328 588 100       2294 and return $inx - $self->{cursor_curr};
329             }
330             }
331              
332 2         6 return;
333             }
334              
335             sub find_regexp {
336 16627     16627 1 20345 my ( $self, $regexp ) = @_;
337              
338 16627 50 0     25574 REGEXP_REF eq ref $regexp
339             or confess
340             'Argument is a ', ( ref $regexp || 'scalar' ), ' not a Regexp';
341              
342 16627 100       27118 defined $self->{find} or $self->_remainder();
343              
344 16627 100       66695 $self->{find} =~ $regexp
345             or return;
346              
347 1848         2332 my @capture;
348 1848         5190 foreach my $inx ( 0 .. $#+ ) {
349 4267 100 66     13690 if ( defined $-[$inx] && defined $+[$inx] ) {
350             push @capture, $self->{capture} = substr
351             $self->{find},
352 3778         15986 $-[$inx],
353             $+[$inx] - $-[$inx];
354             } else {
355 489         783 push @capture, undef;
356             }
357             }
358 1848         3405 $self->{match} = shift @capture;
359 1848         2871 $self->{capture} = \@capture;
360              
361             # The following circumlocution seems to be needed under Perl 5.13.0
362             # for reasons I do not fathom -- at least in the case where
363             # wantarray is false. RT 56864 details the symptoms, which I was
364             # never able to reproduce outside Perl::Critic. But returning $+[0]
365             # directly, the value could transmogrify between here and the
366             # calling module.
367             ## my @data = ( $-[0], $+[0] );
368             ## return wantarray ? @data : $data[1];
369 1848 50       6379 return wantarray ? ( $-[0] + 0, $+[0] + 0 ) : $+[0] + 0;
370             }
371              
372             sub get_mode {
373 2598     2598 1 3260 my ( $self ) = @_;
374 2598         5077 return $self->{mode};
375             }
376              
377             sub get_start_delimiter {
378 1794     1794 1 2036 my ( $self ) = @_;
379 1794         4983 return $self->{delimiter_start};
380             }
381              
382             sub get_token {
383 4133     4133 1 5170 my ( $self ) = @_;
384              
385             caller eq __PACKAGE__ or $self->{cursor_curr} > $self->{cursor_orig}
386 4133 50 66     10247 or confess 'Programming error - get_token() called without ',
387             'first calling make_token()';
388              
389 4133         6316 my $handler = '__PPIX_TOKENIZER__' . $self->{mode};
390              
391             my $code = $self->can( $handler )
392             or confess 'Programming error - ',
393             "Getting token in mode '$self->{mode}'. ",
394             "cursor_curr = $self->{cursor_curr}; ",
395             "cursor_limit = $self->{cursor_limit}; ",
396             "length( content ) = ", length $self->{content},
397 4133 50       11779 "; content = '$self->{content}'";
398              
399             my $character = substr(
400             $self->{content},
401             $self->{cursor_curr},
402 4133         7640 1
403             );
404              
405             $self->{trace}
406 4133 50       7265 and warn "get_token() got '$character' from $self->{cursor_curr}\n";
407              
408 4133         7034 return ( $code->( $self, $character ) );
409             }
410              
411             sub interpolates {
412 141     141 1 230 my ( $self ) = @_;
413 141         474 return $self->{delimiter_start} ne q{'};
414             }
415              
416             sub make_token {
417 5243     5243 1 8672 my ( $self, $length, $class, $arg ) = @_;
418 5243 100       7935 defined $class or $class = caller;
419              
420 5243 50       9691 if ( $length + $self->{cursor_curr} > $self->{cursor_limit} ) {
421             $length = $self->{cursor_limit} - $self->{cursor_curr}
422 0 0       0 or return;
423             }
424              
425 5243 50       12819 $class =~ m/ \A PPIx::Regexp:: /smx
426             or $class = 'PPIx::Regexp::' . $class;
427             my $content = substr
428             $self->{content},
429             $self->{cursor_curr},
430 5243         8622 $length;
431              
432             $self->{trace}
433 5243 50       7863 and warn "make_token( $length, '$class' ) => '$content'\n";
434 5243 50       7894 $self->{trace} > 1
435             and warn " make_token: cursor_curr = $self->{cursor_curr}; ",
436             "cursor_limit = $self->{cursor_limit}\n";
437             my $token = $class->__new( $content,
438             tokenizer => $self,
439 5243 100       6327 %{ $arg || {} } )
  5243 50       23979  
440             or return;
441              
442             $self->{index_locations}
443 5243 100       11023 and $self->_update_location( $token );
444              
445             $token->significant()
446 5243 100       11087 and $self->{expect} = undef;
447              
448 5243 100       18333 $token->isa( TOKEN_UNKNOWN ) and $self->{failures}++;
449              
450 5243         6786 $self->{cursor_curr} += $length;
451 5243         6281 $self->{find} = undef;
452 5243         5979 $self->{match} = undef;
453 5243         6285 $self->{capture} = undef;
454              
455 5243         6260 foreach my $name ( keys %{ $self->{cookie} } ) {
  5243         9742  
456 3615         4418 my $cookie = $self->{cookie}{$name};
457             $cookie->( $self, $token )
458 3615 100       6492 or delete $self->{cookie}{$name};
459             }
460              
461             # Record this token as the prior token if it is significant. We must
462             # do this after processing cookies, so that the cookies have access
463             # to the old token if they want.
464             $token->significant()
465 5243 100       8390 and $self->{prior_significant_token} = $token;
466              
467 5243         15786 return $token;
468             }
469              
470             sub match {
471 86     86 1 150 my ( $self ) = @_;
472 86         175 return $self->{match};
473             }
474              
475             sub modifier {
476 4932     4932 1 6663 my ( $self, $modifier ) = @_;
477             return PPIx::Regexp::Token::Modifier::__asserts(
478 4932         9618 $self->{modifiers}[-1], $modifier );
479             }
480              
481             sub modifier_duplicate {
482 292     292 1 442 my ( $self ) = @_;
483 292         573 push @{ $self->{modifiers} },
484 292         397 { %{ $self->{modifiers}[-1] } };
  292         855  
485 292         477 return;
486             }
487              
488             sub modifier_modify {
489 595     595 1 1322 my ( $self, %args ) = @_;
490              
491             # Modifier code is centralized in PPIx::Regexp::Token::Modifier
492             $self->{modifiers}[-1] =
493             PPIx::Regexp::Token::Modifier::__PPIX_TOKENIZER__modifier_modify(
494 595         1915 $self->{modifiers}[-1], \%args );
495              
496 595         1015 return;
497              
498             }
499              
500             sub modifier_pop {
501 288     288 1 492 my ( $self ) = @_;
502 288         747 @{ $self->{modifiers} } > 1
503 288 100       328 and pop @{ $self->{modifiers} };
  282         603  
504 288         666 return;
505             }
506              
507             sub modifier_seen {
508 8     8 1 25 my ( $self, $modifier ) = @_;
509 8         15 foreach my $mod ( reverse @{ $self->{modifiers} } ) {
  8         24  
510 10 100       35 exists $mod->{$modifier}
511             and return 1;
512             }
513 5         23 return;
514             }
515              
516             sub next_token {
517 5780     5780 1 7011 my ( $self ) = @_;
518              
519             {
520              
521 5780 100       6357 if ( @{ $self->{pending} } ) {
  9896         9693  
  9896         15389  
522 5241         5305 return shift @{ $self->{pending} };
  5241         12984  
523             }
524              
525 4655 100       8421 if ( $self->{cursor_curr} >= $self->{cursor_limit} ) {
526             $self->{cursor_limit} >= length $self->{content}
527 1099 100       3059 and return;
528 560 50       1310 $self->{mode} eq 'finish' and return;
529 560         1442 $self->_set_mode( 'finish' );
530 560         899 $self->{cursor_limit} += length $self->{delimiter_finish};
531             }
532              
533 4116 50       7397 if ( my @tokens = $self->get_token() ) {
534 4116         4544 push @{ $self->{pending} }, @tokens;
  4116         6564  
535 4116         6007 redo;
536              
537             }
538              
539             }
540              
541 0         0 return;
542              
543             }
544              
545             sub peek {
546 379     379 1 606 my ( $self, $offset ) = @_;
547 379 100       712 defined $offset or $offset = 0;
548 379 50       684 $offset < 0 and return;
549 379         544 $offset += $self->{cursor_curr};
550 379 50       644 $offset >= $self->{cursor_limit} and return;
551 379         1184 return substr $self->{content}, $offset, 1;
552             }
553              
554             sub ppi_document {
555 83     83 1 156 my ( $self ) = @_;
556              
557 83 50       156 defined $self->{find} or $self->_remainder();
558              
559 83         467 return PPI::Document->new( \"$self->{find}" );
560             }
561              
562             sub prior_significant_token {
563 2413     2413 1 3302 my ( $self, $method, @args ) = @_;
564 2413 100       3260 defined $method or return $self->{prior_significant_token};
565             $self->{prior_significant_token}->can( $method )
566             or confess 'Programming error - ',
567             ( ref $self->{prior_significant_token} ||
568 2394 50 0     6943 $self->{prior_significant_token} ),
569             ' does not support method ', $method;
570 2394         5897 return $self->{prior_significant_token}->$method( @args );
571             }
572              
573             # my $length = $token->__recognize_postderef( $tokenizer, $iterator ).
574             #
575             # This method is private to the PPIx-Regexp package, and may be changed
576             # or retracted without warning. What it does is to recognize postfix
577             # dereferences. It returns the length in characters of the first postfix
578             # dereference found, or a false value if none is found.
579             #
580             # The optional $iterator argument can be one of the following:
581             # - A code reference, which will be called to provide PPI::Element
582             # objects to be checked to see if they represent a postfix
583             # dereference.
584             # - A PPI::Element, which is checked to see if it is a postfix
585             # dereference.
586             # - Undef, or omitted, in which case ppi() is called on the invocant,
587             # and everything that follows the '->' operator is checked to see if
588             # it is a postfix dereference.
589             # - Anything else results in an exception and stack trace.
590              
591             {
592             sub __recognize_postderef {
593 149     149   251 my ( $self, $token, $iterator ) = @_;
594              
595             # Note that if ppi() gets called I have to hold a reference to
596             # the returned object until I am done with all its children.
597 149         191 my $ppi;
598 149 100       305 if ( ! defined $iterator ) {
    50          
    0          
599              
600             # This MUST be done before ppi() is called.
601             $self->{index_locations}
602 145 100       297 and $self->_update_location( $token );
603              
604 145         410 $ppi = $token->ppi();
605 29         6362 my @ops = grep { '->' eq $_->content() } @{
606 145 100       256 $ppi->find( 'PPI::Token::Operator' ) || [] };
  145         458  
607             $iterator = sub {
608 151 100   151   625 my $op = shift @ops
609             or return;
610 15         55 return $op->snext_sibling();
611 145         31538 };
612             } elsif ( $iterator->isa( 'PPI::Element' ) ) {
613 4         9 my @eles = ( $iterator );
614             $iterator = sub {
615 4     4   10 return shift @eles;
616 4         13 };
617             } elsif ( CODE_REF ne ref $iterator ) {
618 0         0 confess 'Programming error - Iterator not understood';
619             }
620              
621 149         675 my $accept = $token->__postderef_accept_cast();
622              
623 149         296 while ( my $elem = $iterator->() ) {
624              
625 19         407 my $content = $elem->content();
626              
627             # As of PPI 1.238, all postfix dereferences are parsed as
628             # casts. So if we find a cast of the correct content we have
629             # a postfix deref.
630 19 100       168 $elem->isa( 'PPI::Token::Cast' )
631             or next;
632              
633 15 100       85 if ( $content =~ m/ ( .* ) \* \z /smx ) {
    50          
634             # If we're an acceptable cast ending in a glob, accept
635             # it.
636 10 100       90 $accept->{$1}
637             and return length $content;
638             } elsif ( $accept->{$content} ) {
639             # If we're an acceptable cast followed by a subscript,
640             # we're a slice -- accept both cast and subscript.
641 5 50       16 my $next = $elem->snext_sibling()
642             or next;
643 5 50       84 $next->isa( 'PPI::Structure::Subscript' )
644             or next;
645 5         17 return length( $content ) + length( $next->content() );
646             }
647              
648             # Otherwise, we're not a postfix dereference; try the next
649             # iteration.
650             }
651              
652             # No postfix dereference found.
653 136         681 return;
654             }
655             }
656              
657             sub significant {
658 0     0 1 0 return 1;
659             }
660              
661             sub strict {
662 4     4 1 7 my ( $self ) = @_;
663 4         20 return $self->{strict};
664             }
665              
666             sub _known_tokenizers {
667 3036     3036   3743 my ( $self ) = @_;
668              
669 3036         3546 my $mode = $self->{mode};
670              
671 3036         3394 my @expect;
672 3036 100       5051 if ( $self->{expect_next} ) {
673 328         537 $self->{expect} = $self->{expect_next};
674 328         534 $self->{expect_next} = undef;
675             }
676 3036 100       4903 if ( $self->{expect} ) {
677             @expect = $self->_known_tokenizer_check(
678 334         470 @{ $self->{expect} } );
  334         809  
679             }
680              
681             exists $self->{known}{$mode} and return (
682 3036 100       6371 @expect, @{ $self->{known}{$mode} } );
  2493         7574  
683              
684 543         1346 my @found = $self->_known_tokenizer_check(
685             $self->__tokenizer_classes() );
686              
687 543         1809 $self->{known}{$mode} = \@found;
688 543         1660 return (@expect, @found);
689             }
690              
691             sub _known_tokenizer_check {
692 877     877   2566 my ( $self, @args ) = @_;
693              
694 877         1190 my $handler = '__PPIX_TOKENIZER__' . $self->{mode};
695 877         1085 my @found;
696              
697 877         1221 foreach my $class ( @args ) {
698              
699 8611 100       33493 $class->can( $handler ) or next;
700 8408         10970 push @found, $class;
701              
702             }
703              
704 877         3385 return @found;
705             }
706              
707             sub tokens {
708 205     205 1 450 my ( $self ) = @_;
709              
710 205         300 my @rslt;
711 205         738 while ( my $token = $self->next_token() ) {
712 1933         3631 push @rslt, $token;
713             }
714              
715 205         1206 return @rslt;
716             }
717              
718             # $self->_deprecation_notice( $type, $name );
719             #
720             # This method centralizes deprecation. Type is 'attribute' or
721             # 'method'. Deprecation is driven of the %deprecate hash. Values
722             # are:
723             # false - no warning
724             # 1 - warn on first use
725             # 2 - warn on each use
726             # 3 - die on each use.
727             #
728             # $self->_deprecation_in_progress( $type, $name )
729             #
730             # This method returns true if the deprecation is in progress. In
731             # fact it returns the deprecation level.
732              
733             =begin comment
734              
735             {
736              
737             my %deprecate = (
738             attribute => {
739             postderef => 3,
740             },
741             );
742              
743             sub _deprecation_notice {
744             my ( undef, $type, $name, $repl ) = @_; # Invocant unused
745             $deprecate{$type} or return;
746             $deprecate{$type}{$name} or return;
747             my $msg = sprintf 'The %s %s is %s', $name, $type,
748             $deprecate{$type}{$name} > 2 ? 'removed' : 'deprecated';
749             defined $repl
750             and $msg .= "; use $repl instead";
751             $deprecate{$type}{$name} >= 3
752             and croak $msg;
753             warnings::enabled( 'deprecated' )
754             and carp $msg;
755             $deprecate{$type}{$name} == 1
756             and $deprecate{$type}{$name} = 0;
757             return;
758             }
759              
760             sub _deprecation_in_progress {
761             my ( $self, $type, $name ) = @_;
762             $deprecate{$type} or return;
763             return $deprecate{$type}{$name};
764             }
765              
766             }
767              
768             =end comment
769              
770             =cut
771              
772             sub _remainder {
773 3623     3623   4530 my ( $self ) = @_;
774              
775             $self->{cursor_curr} > $self->{cursor_limit}
776 3623 50       6404 and confess "Programming error - Trying to find past end of string";
777             $self->{find} = substr(
778             $self->{content},
779             $self->{cursor_curr},
780             $self->{cursor_limit} - $self->{cursor_curr}
781 3623         7473 );
782              
783 3623         4716 return;
784             }
785              
786             sub _make_final_token {
787 10     10   22 my ( $self, $len, $class, $arg ) = @_;
788 10         26 my $token = $self->make_token( $len, $class, $arg );
789 10         45 $self->_set_mode( 'kaput' );
790 10         57 return $token;
791             }
792              
793             sub _set_mode {
794 1657     1657   2699 my ( $self, $mode ) = @_;
795             $self->{trace}
796 1657 50       2946 and warn "Tokenizer going from mode $self->{mode} to $mode\n";
797 1657         2316 $self->{mode} = $mode;
798 1657 100       2889 if ( 'kaput' eq $mode ) {
799             $self->{cursor_curr} = $self->{cursor_limit} =
800 537         1061 length $self->{content};
801             }
802 1657         2259 return;
803             }
804              
805             sub __init_error {
806 10     10   16 my ( $self , $err ) = @_;
807 10 100       26 defined $err
808             or $err = 'Tokenizer found illegal first characters';
809             return $self->_make_final_token(
810 10         55 length $self->{content}, TOKEN_UNKNOWN, {
811             error => $err,
812             },
813             );
814             }
815              
816             sub _update_location {
817 107     107   156 my ( $self, $token ) = @_;
818             $token->{location} # Idempotent
819 107 100       187 and return;
820 105   66     213 my $loc = $self->{_location} ||= do {
821             my %loc = (
822             location => $self->{location},
823 12         34 );
824 12 100       40 if ( __instance( $self->{source}, 'PPI::Element' ) ) {
825 11   33     77 $loc{location} ||= $self->{source}->location();
826 11 50       1395 if ( my $doc = $self->{source}->document() ) {
827 11         250 $loc{tab_width} = $doc->tab_width();
828             }
829             }
830 12   100     70 $loc{tab_width} ||= 1;
831 12         30 \%loc;
832             };
833             $loc->{location}
834 105 50       197 or return;
835 105         104 $token->{location} = [ @{ $loc->{location} } ];
  105         220  
836 105 50       208 if ( defined( my $content = $token->content() ) ) {
837              
838 105         124 my $lines;
839 105         214 pos( $content ) = 0;
840 105         238 $lines++ while $content =~ m/ \n /smxgc;
841 105 100       160 if ( pos $content ) {
842 2         3 $loc->{location}[LOCATION_LINE] += $lines;
843 2         4 $loc->{location}[LOCATION_LOGICAL_LINE] += $lines;
844             $loc->{location}[LOCATION_CHARACTER] =
845 2         3 $loc->{location}[LOCATION_COLUMN] = 1;
846             }
847              
848 105 100       205 if ( my $chars = length( $content ) - pos( $content ) ) {
849 102         120 $loc->{location}[LOCATION_CHARACTER] += $chars;
850 102 100 100     197 if ( $loc->{tab_width} > 1 && $content =~ m/ \t /smx ) {
851 5         8 my $pos = $loc->{location}[LOCATION_COLUMN];
852 5         6 my $tab_width = $loc->{tab_width};
853             # Stolen shamelessly from PPI::Document::_visual_length
854 5         6 my ( $vis_inc );
855 5         14 foreach my $part ( split /(\t)/, $content ) {
856 10 100       14 if ($part eq "\t") {
857 5         6 $vis_inc = $tab_width - ($pos-1) % $tab_width;
858             } else {
859 5         6 $vis_inc = length $part;
860             }
861 10         10 $pos += $vis_inc;
862             }
863 5         10 $loc->{location}[LOCATION_COLUMN] = $pos;
864             } else {
865 97         133 $loc->{location}[LOCATION_COLUMN] += $chars;
866             }
867             }
868              
869             }
870 105         175 return;
871             }
872              
873             sub __PPIX_TOKENIZER__init {
874 537     537   1062 my ( $self ) = @_;
875              
876 537 50       2879 $self->find_regexp(
877             qr{ \A ( \s* ) ( qr | m | s )? ( \s* ) ( . ) }smx )
878             or return $self->__init_error();
879              
880 537         2114 my ( $leading_white, $type, $next_white, $delim_start ) = $self->capture();
881              
882 537 100       1577 defined $type
883             or $type = '';
884              
885 537 100 100     2443 $type
886             or $delim_start =~ m< \A [/?] \z >smx
887             or return $self->__init_error();
888 531 100 100     2135 $type
      100        
889             and not $next_white
890             and $delim_start =~ m< \A \w \z >smx
891             and return $self->__init_error();
892              
893 529         1274 $self->{type} = $type;
894              
895 529         830 my @tokens;
896              
897 529 100       1568 '' ne $leading_white
898             and push @tokens, $self->make_token( length $leading_white,
899             'PPIx::Regexp::Token::Whitespace' );
900 529         1830 push @tokens, $self->make_token( length $type,
901             'PPIx::Regexp::Token::Structure' );
902 529 100       1207 '' ne $next_white
903             and push @tokens, $self->make_token( length $next_white,
904             'PPIx::Regexp::Token::Whitespace' );
905              
906 529         1144 $self->{delimiter_start} = $delim_start;
907              
908             $self->{trace}
909 529 50       1357 and warn "Tokenizer found regexp start delimiter '$delim_start' at $self->{cursor_curr}\n";
910              
911 529 50       1366 if ( my $offset = $self->find_matching_delimiter() ) {
912 529         1047 my $cursor_limit = $self->{cursor_curr} + $offset;
913             $self->{trace}
914 529 50       1127 and warn "Tokenizer found regexp end delimiter at $cursor_limit\n";
915 529 100       1562 if ( $self->__number_of_extra_parts() ) {
916             ### my $found_embedded_comments;
917 46 100       152 if ( $self->close_bracket(
918             $self->{delimiter_start} ) ) {
919             pos $self->{content} = $self->{cursor_curr} +
920 7         39 $offset + 1;
921             # If we're bracketed, there may be Perl comments between
922             # the regex and the replacement. PPI gets the parse
923             # wrong as of 1.220, but if we get the handling of the
924             # underlying string right, we will Just Work when PPI
925             # gets it right.
926 7         54 while ( $self->{content} =~
927             m/ \G \s* \n \s* \# [^\n]* /smxgc ) {
928             ## $found_embedded_comments = 1;
929             }
930 7         27 $self->{content} =~ m/ \s* /smxgc;
931             } else {
932             pos $self->{content} = $self->{cursor_curr} +
933 39         199 $offset;
934             }
935             # Localizing cursor_curr and delimiter_start would be
936             # cleaner, but I don't want the old values restored if a
937             # parse error occurs.
938 46         109 my $cursor_curr = $self->{cursor_curr};
939 46         132 my $delimiter_start = $self->{delimiter_start};
940 46         114 $self->{cursor_curr} = pos $self->{content};
941             $self->{delimiter_start} = substr
942             $self->{content},
943             $self->{cursor_curr},
944 46         112 1;
945             $self->{trace}
946 46 50       126 and warn "Tokenizer found replacement start delimiter '$self->{delimiter_start}' at $self->{cursor_curr}\n";
947 46 100       101 if ( my $s_off = $self->find_matching_delimiter() ) {
948             $self->{cursor_modifiers} =
949 44         112 $self->{cursor_curr} + $s_off + 1;
950             $self->{trace}
951 44 50       143 and warn "Tokenizer found replacement end delimiter at @{[
952 0         0 $self->{cursor_curr} + $s_off ]}\n";
953 44         78 $self->{cursor_curr} = $cursor_curr;
954 44         93 $self->{delimiter_start} = $delimiter_start;
955             } else {
956             $self->{trace}
957 2 50       6 and warn 'Tokenizer failed to find replacement',
958             "end delimiter starting at $self->{cursor_curr}\n";
959 2         3 $self->{cursor_curr} = 0;
960             # TODO If I were smart enough here I could check for
961             # PPI mis-parses like s{foo}
962             # #{bar}
963             # {baz}
964             # here, doing so if $found_embedded_comments (commented
965             # out above) is true. The problem is that there seem to
966             # as many mis-parses as there are possible delimiters.
967 2         6 return $self->__init_error(
968             'Tokenizer found mismatched replacement delimiters',
969             );
970             }
971             } else {
972 483         888 $self->{cursor_modifiers} = $cursor_limit + 1;
973             }
974 527         1079 $self->{cursor_limit} = $cursor_limit;
975             } else {
976 0         0 $self->{cursor_curr} = 0;
977             return $self->_make_final_token(
978 0         0 length( $self->{content} ), TOKEN_UNKNOWN, {
979             error => 'Tokenizer found mismatched regexp delimiters',
980             },
981             );
982             }
983              
984             {
985             # We have to instantiate the trailing tokens now so we can
986             # figure out what modifiers are in effect. But we can't
987             # index their locations (if desired) because they are being
988             # instantiated out of order
989              
990 527         697 local $self->{index_locations} = 0;
  527         1136  
991              
992 527         723 my @mods = @{ $self->{default_modifiers} };
  527         1050  
993 527         1751 pos $self->{content} = $self->{cursor_modifiers};
994 527         1149 local $self->{cursor_curr} = $self->{cursor_modifiers};
995 527         1261 local $self->{cursor_limit} = length $self->{content};
996 527         827 my @trailing;
997             {
998 527         655 my $len = $self->find_regexp( qr{ \A [[:lower:]]* }smx );
  527         1881  
999 527         1469 push @trailing, $self->make_token( $len,
1000             'PPIx::Regexp::Token::Modifier' );
1001             }
1002 527 100       1597 if ( my $len = $self->find_regexp( qr{ \A \s+ }smx ) ) {
1003 1         4 push @trailing, $self->make_token( $len,
1004             'PPIx::Regexp::Token::Whitespace' );
1005             }
1006 527 100       1741 if ( my $len = $self->find_regexp( qr{ \A .+ }smx ) ) {
1007 1         5 push @trailing, $self->make_token( $len, TOKEN_UNKNOWN, {
1008             error => 'Trailing characters after expression',
1009             } );
1010             }
1011 527         1508 $self->{trailing_tokens} = \@trailing;
1012 527         1359 push @mods, $trailing[0]->content();
1013             $self->{effective_modifiers} =
1014 527         1235 PPIx::Regexp::Token::Modifier::__aggregate_modifiers (
1015             @mods );
1016             $self->{modifiers} = [
1017 527         1042 { %{ $self->{effective_modifiers} } },
  527         2481  
1018             ];
1019             }
1020              
1021             $self->{delimiter_finish} = substr
1022             $self->{content},
1023             $self->{cursor_limit},
1024 527         1478 1;
1025              
1026 527         1193 push @tokens, $self->make_token( 1,
1027             'PPIx::Regexp::Token::Delimiter' );
1028              
1029 527         1738 $self->_set_mode( 'regexp' );
1030              
1031 527         803 $self->{find} = undef;
1032              
1033 527         2072 return @tokens;
1034             }
1035              
1036             # Match the initial part of the regexp including any leading white
1037             # space. The initial delimiter is the first thing not consumed, though
1038             # we check it for sanity.
1039             sub __initial_match {
1040 0     0   0 my ( $self ) = @_;
1041              
1042 0 0       0 $self->find_regexp(
1043             qr{ \A ( \s* ) ( qr | m | s )? ( \s* ) (?: [^\w\s] ) }smx )
1044             or return;
1045              
1046 0         0 my ( $leading_white, $type, $next_white ) = $self->capture();
1047              
1048 0 0       0 defined $type
1049             or $type = '';
1050              
1051 0         0 $self->{type} = $type;
1052              
1053 0         0 my @tokens;
1054              
1055 0 0       0 '' ne $leading_white
1056             and push @tokens, $self->make_token( length $leading_white,
1057             'PPIx::Regexp::Token::Whitespace' );
1058 0         0 push @tokens, $self->make_token( length $type,
1059             'PPIx::Regexp::Token::Structure' );
1060 0 0       0 '' ne $next_white
1061             and push @tokens, $self->make_token( length $next_white,
1062             'PPIx::Regexp::Token::Whitespace' );
1063              
1064 0         0 return @tokens;
1065             }
1066              
1067             {
1068             my %extra_parts = (
1069             s => 1,
1070             );
1071              
1072             # Return the number of extra delimited parts. This will be 0 except
1073             # for s///, which will be 1.
1074             sub __number_of_extra_parts {
1075 855     855   1332 my ( $self ) = @_;
1076 855   100     3328 return $extra_parts{$self->{type}} || 0;
1077             }
1078             }
1079              
1080             {
1081             my @part_class = qw{
1082             PPIx::Regexp::Structure::Regexp
1083             PPIx::Regexp::Structure::Replacement
1084             };
1085              
1086             # Return the classes for the parts of the expression.
1087             sub __part_classes {
1088 326     326   678 my ( $self ) = @_;
1089 326         756 my $max = $self->__number_of_extra_parts();
1090 326         1429 return @part_class[ 0 .. $max ];
1091             }
1092             }
1093              
1094             sub __PPIX_TOKENIZER__regexp {
1095 3036     3036   4516 my ( $self, $character ) = @_;
1096              
1097 3036         4039 my $mode = $self->{mode};
1098 3036         3828 my $handler = '__PPIX_TOKENIZER__' . $mode;
1099              
1100 3036         4244 $self->{cursor_orig} = $self->{cursor_curr};
1101 3036         5333 foreach my $class ( $self->_known_tokenizers() ) {
1102 13478         38113 my @tokens = grep { $_ } $class->$handler( $self, $character );
  3867         7054  
1103             $self->{trace}
1104 13478 50       19899 and warn $class, "->$handler( \$self, '$character' )",
1105             " => (@tokens)\n";
1106             @tokens
1107             and return ( map {
1108 13478 100       19996 ref $_ ? $_ : $self->make_token( $_,
  3033 100       8408  
1109             $class ) } @tokens );
1110             }
1111              
1112             # Find a fallback processor for the character.
1113 27   33     204 my $fallback = __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__' . $mode )
1114             || __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__regexp' )
1115             || confess "Programming error - unable to find fallback for $mode";
1116 27         67 return $fallback->( $self, $character );
1117             }
1118              
1119             *__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
1120              
1121             sub __PPIX_TOKEN_FALLBACK__regexp {
1122 18     18   39 my ( $self, $character ) = @_;
1123              
1124             # As a fallback in regexp mode, any escaped character is a literal.
1125 18 100 66     62 if ( $character eq '\\'
1126             && $self->{cursor_limit} - $self->{cursor_curr} > 1
1127             ) {
1128 2         6 return $self->make_token( 2, TOKEN_LITERAL );
1129             }
1130              
1131             # Any normal character is unknown.
1132 16         78 return $self->make_token( 1, TOKEN_UNKNOWN, {
1133             error => 'Tokenizer found unexpected literal',
1134             },
1135             );
1136             }
1137              
1138             sub __PPIX_TOKEN_FALLBACK__repl {
1139 9     9   15 my ( $self, $character ) = @_;
1140              
1141             # As a fallback in replacement mode, any escaped character is a literal.
1142 9 100 66     28 if ( $character eq '\\'
1143             && defined ( my $next = $self->peek( 1 ) ) ) {
1144              
1145 5 0 33     12 if ( $self->interpolates() || $next eq q<'> || $next eq '\\' ) {
      33        
1146 5         12 return $self->make_token( 2, TOKEN_LITERAL );
1147             }
1148 0         0 return $self->make_token( 1, TOKEN_LITERAL );
1149             }
1150              
1151             # So is any normal character.
1152 4         9 return $self->make_token( 1, TOKEN_LITERAL );
1153             }
1154              
1155             sub __PPIX_TOKENIZER__finish {
1156 560     560   987 my ( $self ) = @_; # $character unused
1157              
1158             $self->{cursor_limit} > length $self->{content}
1159 560 50       1439 and confess "Programming error - ran off string";
1160              
1161             my @tokens = $self->make_token( length $self->{delimiter_finish},
1162 560         1510 'PPIx::Regexp::Token::Delimiter' );
1163              
1164 560 100       1415 if ( $self->{cursor_curr} == $self->{cursor_modifiers} ) {
1165              
1166             # We are out of string. Add the trailing tokens (created when we
1167             # did the initial bracket scan) and close up shop.
1168            
1169 516         1416 push @tokens, $self->_get_trailing_tokens();
1170              
1171 516         996 $self->_set_mode( 'kaput' );
1172              
1173             } else {
1174              
1175             # Clear the cookies, because we are going around again.
1176 44         167 $self->{cookie} = {};
1177              
1178             # Move the cursor limit to just before the modifiers.
1179 44         117 $self->{cursor_limit} = $self->{cursor_modifiers} - 1;
1180              
1181             # If the preceding regular expression was bracketed, we need to
1182             # consume possible whitespace and find another delimiter.
1183              
1184 44 100       172 if ( $self->close_bracket( $self->{delimiter_start} ) ) {
1185 7         11 my $accept;
1186             # If we are bracketed, there can be honest-to-God Perl
1187             # comments between the regexp and the replacement, not just
1188             # regexp comments. As of version 1.220, PPI does not get
1189             # this parse right, but if we can handle this is a string,
1190             # then we will Just Work when PPI gets itself straight.
1191 7         31 while ( $self->find_regexp(
1192             qr{ \A ( \s* \n \s* ) ( \# [^\n]* \n ) }smx ) ) {
1193 2         7 my ( $white_space, $comment ) = $self->capture();
1194 2         6 push @tokens, $self->make_token(
1195             length $white_space,
1196             'PPIx::Regexp::Token::Whitespace',
1197             ), $self->make_token(
1198             length $comment,
1199             'PPIx::Regexp::Token::Comment',
1200             );
1201             }
1202 7 100       26 $accept = $self->find_regexp( qr{ \A \s+ }smx )
1203             and push @tokens, $self->make_token(
1204             $accept, 'PPIx::Regexp::Token::Whitespace' );
1205 7         26 my $character = $self->peek();
1206 7         17 $self->{delimiter_start} = $character;
1207 7         19 push @tokens, $self->make_token(
1208             1, 'PPIx::Regexp::Token::Delimiter' );
1209             $self->{delimiter_finish} = substr
1210             $self->{content},
1211 7         19 $self->{cursor_limit} - 1,
1212             1;
1213             }
1214              
1215 44 100       105 if ( $self->modifier( 'e*' ) ) {
1216             # With /e or /ee, the replacement portion is code. We make
1217             # it all into one big PPIx::Regexp::Token::Code, slap on the
1218             # trailing delimiter and modifiers, and return it all.
1219             push @tokens, $self->make_token(
1220             $self->{cursor_limit} - $self->{cursor_curr},
1221 11         54 'PPIx::Regexp::Token::Code',
1222             { perl_version_introduced => MINIMUM_PERL },
1223             );
1224 11         34 $self->{cursor_limit} = length $self->{content};
1225 11         32 push @tokens, $self->make_token( 1,
1226             'PPIx::Regexp::Token::Delimiter' ),
1227             $self->_get_trailing_tokens();
1228 11         29 $self->_set_mode( 'kaput' );
1229             } else {
1230             # Put our mode to replacement.
1231 33         70 $self->_set_mode( 'repl' );
1232             }
1233              
1234             }
1235              
1236 560         1507 return @tokens;
1237              
1238             }
1239              
1240             # To common processing on trailing tokens.
1241             sub _get_trailing_tokens {
1242 527     527   874 my ( $self ) = @_;
1243 527 100       1074 if ( $self->{index_locations} ) {
1244             # We turned off index_locations when these were created, because
1245             # they were done out of order. Fix that now.
1246 11         16 foreach my $token ( @{ $self->{trailing_tokens} } ) {
  11         21  
1247 11         24 $self->_update_location( $token );
1248             }
1249             }
1250 527         648 return @{ delete $self->{trailing_tokens} };
  527         1341  
1251             }
1252              
1253             1;
1254              
1255             __END__
1256              
1257             =head1 NAME
1258              
1259             PPIx::Regexp::Tokenizer - Tokenize a regular expression
1260              
1261             =head1 SYNOPSIS
1262              
1263             use PPIx::Regexp::Dumper;
1264             PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
1265             ->print();
1266              
1267             =head1 INHERITANCE
1268              
1269             C<PPIx::Regexp::Tokenizer> is a
1270             L<PPIx::Regexp::Support|PPIx::Regexp::Support>.
1271              
1272             C<PPIx::Regexp::Tokenizer> has no descendants.
1273              
1274             =head1 DESCRIPTION
1275              
1276             This class provides tokenization of the regular expression.
1277              
1278             =head1 METHODS
1279              
1280             This class provides the following public methods. Methods not documented
1281             here (or documented below under L</EXTERNAL TOKENIZERS>) are private,
1282             and unsupported in the sense that the author reserves the right to
1283             change or remove them without notice.
1284              
1285             =head2 new
1286              
1287             my $tokenizer = PPIx::Regexp::Tokenizer->new( 'xyzzy' );
1288              
1289             This static method instantiates the tokenizer. You must pass it the
1290             regular expression to be parsed, either as a string or as a
1291             L<PPI::Element|PPI::Element> of some sort. You can also pass optional
1292             name/value pairs of arguments. The option names are specified B<without>
1293             a leading dash. Supported options are:
1294              
1295             =over
1296              
1297             =item default_modifiers array_reference
1298              
1299             This argument specifies default statement modifiers. It is optional, but
1300             if specified must be an array reference. See the
1301             L<PPIx::Regexp|PPIx::Regexp> L<new()|PPIx::Regexp/new> documentation for
1302             the details.
1303              
1304             =item encoding name
1305              
1306             This option specifies the encoding of the string to be tokenized. If
1307             specified, an C<Encode::decode> is done on the string (or the C<content>
1308             of the PPI class) before it is tokenized.
1309              
1310             =item index_locations
1311              
1312             This Boolean option specifies that the locations of the generated tokens
1313             are to be computed.
1314              
1315             =item strict boolean
1316              
1317             This option specifies whether tokenization should assume
1318             C<use re 'strict';> is in effect.
1319              
1320             The C<'strict'> pragma was introduced in Perl 5.22, and its
1321             documentation says that it is experimental, and that there is no
1322             commitment to backward compatibility. The same applies to the
1323             tokenization produced when this option is asserted.
1324              
1325             =item trace number
1326              
1327             Specifying a positive value for this option causes a trace of the
1328             tokenization. This option is unsupported in the sense that the author
1329             reserves the right to alter it without notice.
1330              
1331             If this option is unspecified, the value comes from environment variable
1332             C<PPIX_REGEXP_TOKENIZER_TRACE> (see L</ENVIRONMENT VARIABLES>). If this
1333             environment variable does not exist, the default is 0.
1334              
1335             =back
1336              
1337             Undocumented options are unsupported.
1338              
1339             The returned value is the instantiated tokenizer, or C<undef> if
1340             instantiation failed. In the latter case a call to L</errstr> will
1341             return the reason.
1342              
1343             =head2 content
1344              
1345             print $tokenizer->content();
1346              
1347             This method returns the string being tokenized. This will be the result
1348             of the L<< PPI::Element->content()|PPI::Element/content >> method if the
1349             object was instantiated with a L<PPI::Element|PPI::Element>.
1350              
1351             =head2 default_modifiers
1352              
1353             print join ', ', @{ $tokenizer->default_modifiers() };
1354              
1355             This method returns a reference to a copy of the array passed to the
1356             C<default_modifiers> argument to L<new()|/new>. If this argument was not
1357             used to instantiate the object, the return is a reference to an empty
1358             array.
1359              
1360             =head2 encoding
1361              
1362             This method returns the encoding of the data being parsed, if one was
1363             set when the class was instantiated; otherwise it simply returns undef.
1364              
1365             =head2 errstr
1366              
1367             my $tokenizer = PPIx::Regexp::Tokenizer->new( 'xyzzy' )
1368             or die PPIx::Regexp::Tokenizer->errstr();
1369              
1370             This static method returns an error description if tokenizer
1371             instantiation failed.
1372              
1373             =head2 failures
1374              
1375             print $tokenizer->failures(), " tokenization failures\n";
1376              
1377             This method returns the number of tokenization failures encountered. A
1378             tokenization failure is represented in the output token stream by a
1379             L<PPIx::Regexp::Token::Unknown|PPIx::Regexp::Token::Unknown>.
1380              
1381             =head2 modifier
1382              
1383             $tokenizer->modifier( 'x' )
1384             and print "Tokenizing an extended regular expression\n";
1385              
1386             This method returns true if the given modifier character was found on
1387             the end of the regular expression, and false otherwise.
1388              
1389             Starting with version 0.036_01, if the argument is a
1390             single-character modifier followed by an asterisk (intended as a wild
1391             card character), the return is the number of times that modifier
1392             appears. In this case an exception will be thrown if you specify a
1393             multi-character modifier (e.g. C<'ee*'>), or if you specify one of the
1394             match semantics modifiers (e.g. C<'a*'>).
1395              
1396             If called by an external tokenizer, this method returns true if if the
1397             given modifier was true at the current point in the tokenization.
1398              
1399             =head2 next_token
1400              
1401             my $token = $tokenizer->next_token();
1402              
1403             This method returns the next token in the token stream, or nothing if
1404             there are no more tokens.
1405              
1406             =head2 significant
1407              
1408             This method exists simply for the convenience of
1409             L<PPIx::Regexp::Dumper|PPIx::Regexp::Dumper>. It always returns true.
1410              
1411             =head2 tokens
1412              
1413             my @tokens = $tokenizer->tokens();
1414              
1415             This method returns all remaining tokens in the token stream.
1416              
1417             =head1 EXTERNAL TOKENIZERS
1418              
1419             This class does very little of its own tokenization. Instead the token
1420             classes contain external tokenization routines, whose name is
1421             '__PPIX_TOKENIZER__' concatenated with the current mode of the tokenizer
1422             ('regexp' for regular expressions, 'repl' for the replacement string).
1423              
1424             These external tokenizers are called as static methods, and passed the
1425             C<PPIx::Regexp::Tokenizer> object and the current character in the
1426             character stream.
1427              
1428             If the external tokenizer wants to make one or more tokens, it returns
1429             an array containing either length in characters for tokens of the
1430             tokenizer's own class, or the results of one or more L</make_token>
1431             calls for tokens of an arbitrary class.
1432              
1433             If the external tokenizer is not interested in the characters starting
1434             at the current position it simply returns.
1435              
1436             The following methods are for the use of external tokenizers, and B<are
1437             not part of the public interface to this class.>
1438              
1439             =head2 capture
1440              
1441             if ( $tokenizer->find_regexp( qr{ \A ( foo ) }smx ) ) {
1442             foreach ( $tokenizer->capture() ) {
1443             print "$_\n";
1444             }
1445             }
1446              
1447             This method returns all the contents of any capture buffers from the
1448             previous call to L</find_regexp>. The first element of the array (i.e.
1449             element 0) corresponds to C<$1>, and so on.
1450              
1451             The captures are cleared by L</make_token>, as well as by another call
1452             to L</find_regexp>.
1453              
1454             =head2 cookie
1455              
1456             $tokenizer->cookie( foo => sub { 1 } );
1457             my $cookie = $tokenizer->cookie( 'foo' );
1458             my $old_hint = $tokenizer->cookie( foo => undef );
1459              
1460             This method either creates, deletes, or accesses a cookie.
1461              
1462             A cookie is a code reference which is called whenever the tokenizer makes
1463             a token. If it returns a false value, it is deleted. Explicitly setting
1464             the cookie to C<undef> also deletes it.
1465              
1466             When you call C<< $tokenizer->cookie( 'foo' ) >>, the current cookie is
1467             returned. If you pass a new value of C<undef> to delete the token, the
1468             deleted cookie (if any) is returned.
1469              
1470             When the L</make_token> method calls a cookie, it passes it the tokenizer
1471             and the token just made. If a token calls a cookie, it is recommended that
1472             it merely pass the tokenizer, though of course the token can do whatever
1473             it wants.
1474              
1475             The cookie mechanism seems to be a bit of a crock, but it appeared to be
1476             more work to fix things up in the lexer after the tokenizer got
1477             something wrong.
1478              
1479             The recommended way to write a cookie is to use a closure to store any
1480             necessary data, and have a call to the cookie return the data; otherwise
1481             the ultimate consumer of the cookie has no way to access the data. Of
1482             course, it may be that the presence of the cookie at a certain point in
1483             the parse is all that is required.
1484              
1485             =head2 expect
1486              
1487             $tokenizer->expect( 'PPIx::Regexp::Token::Code' );
1488              
1489             This method inserts a given class at the head of the token scan, for the
1490             next iteration only. More than one class can be specified. Class names
1491             can be abbreviated by removing the leading 'PPIx::Regexp::'.
1492              
1493             If no class is specified, this method does nothing.
1494              
1495             The expectation lasts from the next time L</get_token> is called until
1496             the next time L</make_token> makes a significant token, or until the
1497             next C<expect> call if that is done sooner.
1498              
1499             =head2 find_regexp
1500              
1501             my $end = $tokenizer->find_regexp( qr{ \A \w+ }smx );
1502             my ( $begin, $end ) = $tokenizer->find_regexp(
1503             qr{ \A \w+ }smx );
1504              
1505             This method finds the given regular expression in the content, starting
1506             at the current position. If called in scalar context, the offset from
1507             the current position to the end of the matched string is returned. If
1508             called in list context, the offsets to both the beginning and the end of
1509             the matched string are returned.
1510              
1511             =head2 find_matching_delimiter
1512              
1513             my $offset = $tokenizer->find_matching_delimiter();
1514              
1515             This method is used by tokenizers to find the delimiter matching the
1516             character at the current position in the content string. If the
1517             delimiter is an opening bracket of some sort, bracket nesting will be
1518             taken into account.
1519              
1520             When searching for the matching delimiter, the back slash character is
1521             considered to escape the following character, so back-slashed delimiters
1522             will be ignored. No other quoting mechanisms are recognized, though, so
1523             delimiters inside quotes still count. This is actually the way Perl
1524             works, as
1525              
1526             $ perl -e 'qr<(?{ print "}" })>'
1527              
1528             demonstrates.
1529              
1530             This method returns the offset from the current position in the content
1531             string to the matching delimiter (which will always be positive), or
1532             undef if no match can be found.
1533              
1534             =head2 get_mode
1535              
1536             This method returns the name of the current mode of the tokenizer.
1537              
1538             =head2 get_start_delimiter
1539              
1540             my $start_delimiter = $tokenizer->get_start_delimiter();
1541              
1542             This method is used by tokenizers to access the start delimiter for the
1543             regular expression.
1544              
1545             =head2 get_token
1546              
1547             my $token = $tokenizer->make_token( 3 );
1548             my @tokens = $tokenizer->get_token();
1549              
1550             This method returns the next token that can be made from the input
1551             stream. It is B<not> part of the external interface, but is intended for
1552             the use of an external tokenizer which calls it after making and
1553             retaining its own token to look at the next token ( if any ) in the
1554             input stream.
1555              
1556             If any external tokenizer calls get_token without first calling
1557             make_token, a fatal error occurs; this is better than the infinite
1558             recursion which would occur if the condition were not trapped.
1559              
1560             An external tokenizer B<must> return anything returned by get_token;
1561             otherwise tokens get lost.
1562              
1563             =head2 interpolates
1564              
1565             This method returns true if the top-level structure being tokenized
1566             interpolates; that is, if the delimiter is not a single quote.
1567              
1568             =head2 make_token
1569              
1570             return $tokenizer->make_token( 3, 'PPIx::Regexp::Token::Unknown' );
1571              
1572             This method is used by this class (and possibly by individual
1573             tokenizers) to manufacture a token. Its arguments are the number of
1574             characters to include in the token, and optionally the class of the
1575             token. If no class name is given, the caller's class is used. Class
1576             names may be shortened by removing the initial 'PPIx::Regexp::', which
1577             will be restored by this method.
1578              
1579             The token will be manufactured from the given number of characters
1580             starting at the current cursor position, which will be adjusted.
1581              
1582             If the given length would include characters past the end of the string
1583             being tokenized, the length is reduced appropriately. If this means a
1584             token with no characters, nothing is returned.
1585              
1586             =head2 match
1587              
1588             if ( $tokenizer->find_regexp( qr{ \A \w+ }smx ) ) {
1589             print $tokenizer->match(), "\n";
1590             }
1591              
1592             This method returns the string matched by the previous call to
1593             L</find_regexp>.
1594              
1595             The match is set to C<undef> by L</make_token>, as well as by another
1596             call to L</find_regexp>.
1597              
1598             =head2 modifier_duplicate
1599              
1600             $tokenizer->modifier_duplicate();
1601              
1602             This method duplicates the modifiers on the top of the modifier stack,
1603             with the intent of creating a locally-scoped copy of the modifiers. This
1604             should only be called by an external tokenizer that is actually creating
1605             a modifier scope. In other words, only when creating a
1606             L<PPIx::Regexp::Token::Structure|PPIx::Regexp::Token::Structure> token
1607             whose content is '('.
1608              
1609             =head2 modifier_modify
1610              
1611             $tokenizer->modifier_modify( name => $value ... );
1612              
1613             This method sets new values for the modifiers in the local scope. Only
1614             the modifiers whose names are actually passed have their values changed.
1615              
1616             This method is intended to be called after manufacturing a
1617             L<PPIx::Regexp::Token::Modifier|PPIx::Regexp::Token::Modifier> token,
1618             and passed the results of its C<modifiers> method.
1619              
1620             =head2 modifier_pop
1621              
1622             $tokenizer->modifier_pop();
1623              
1624             This method removes the modifiers on the top of the modifier stack. This
1625             should only be called by an external tokenizer that is ending a modifier
1626             scope. In other words, only when creating a
1627             L<PPIx::Regexp::Token::Structure|PPIx::Regexp::Token::Structure> token
1628             whose content is ')'.
1629              
1630             Note that this method will never pop the last modifier item off the
1631             stack, to guard against unmatched right parentheses.
1632              
1633             =head2 modifier_seen
1634              
1635             $tokenizer->modifier_seen( 'i' )
1636             and print "/i was seen at some point.\n";
1637              
1638             Unlike L<modifier()|/modifier>, this method returns a true value if the
1639             given modifier has been seen in any scope visible from the current
1640             location in the parse. There is no magic for group match semantics (
1641             /a, /aa, /d, /l, /u) or modifiers that can be repeated, like /x and /xx,
1642             or /e and /ee.
1643              
1644             =head2 peek
1645              
1646             my $character = $tokenizer->peek();
1647             my $next_char = $tokenizer->peek( 1 );
1648              
1649             This method returns the character at the given non-negative offset from
1650             the current position. If no offset is given, an offset of 0 is used.
1651              
1652             If you ask for a negative offset or an offset off the end of the sting,
1653             C<undef> is returned.
1654              
1655             =head2 ppi_document
1656              
1657             This method makes a PPI document out of the remainder of the string, and
1658             returns it.
1659              
1660             =head2 prior_significant_token
1661              
1662             $tokenizer->prior_significant_token( 'can_be_quantified' )
1663             and print "The prior token can be quantified.\n";
1664              
1665             This method calls the named method on the most-recently-instantiated
1666             significant token, and returns the result. Any arguments subsequent to
1667             the method name will be passed to the method.
1668              
1669             Because this method is designed to be used within the tokenizing system,
1670             it will die horribly if the named method does not exist.
1671              
1672             If called with no arguments at all the most-recently-instantiated
1673             significant token is returned.
1674              
1675             =head2 strict
1676              
1677             say 'Parse is ', $tokenizer->strict() ? 'strict' : 'lenient';
1678              
1679             This method simply returns true or false, depending on whether the
1680             C<'strict'> option to C<new()> was true or false.
1681              
1682             =head1 ENVIRONMENT VARIABLES
1683              
1684             A tokenizer trace can be requested by setting environment variable
1685             PPIX_REGEXP_TOKENIZER_TRACE to a numeric value other than 0. Use of this
1686             environment variable is unsupported in the same sense that the C<trace>
1687             option of L</new> is unsupported. Explicitly specifying the C<trace>
1688             option to L</new> overrides the environment variable.
1689              
1690             The real reason this is documented is to give the user a way to
1691             troubleshoot funny output from the tokenizer.
1692              
1693             =head1 SUPPORT
1694              
1695             Support is by the author. Please file bug reports at
1696             L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
1697             L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
1698             electronic mail to the author.
1699              
1700             =head1 AUTHOR
1701              
1702             Thomas R. Wyant, III F<wyant at cpan dot org>
1703              
1704             =head1 COPYRIGHT AND LICENSE
1705              
1706             Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
1707              
1708             This program is free software; you can redistribute it and/or modify it
1709             under the same terms as Perl 5.10.0. For more details, see the full text
1710             of the licenses in the directory LICENSES.
1711              
1712             This program is distributed in the hope that it will be useful, but
1713             without any warranty; without even the implied warranty of
1714             merchantability or fitness for a particular purpose.
1715              
1716             =cut
1717              
1718             # ex: set textwidth=72 :