File Coverage

blib/lib/PPIx/Regexp/Tokenizer.pm

Criterion	Covered	Total	%
statement	497	525	94.6
branch	177	238	74.3
condition	44	64	68.7
subroutine	86	90	95.5
pod	29	29	100.0
total	833	946	88.0

line	stmt	bran	cond	sub	pod	time	code
1							package PPIx::Regexp::Tokenizer;
2
3	9			9		47	use strict;
	9					12
	9					282
4	9			9		30	use warnings;
	9					10
	9					345
5
6	9			9		33	use base qw{ PPIx::Regexp::Support };
	9					12
	9					2105
7
8	9			9		50	use Carp qw{ carp croak confess };
	9					12
	9					485
9	9			9		2079	use PPI::Document;
	9					789231
	9					399
10	9					1224	use PPIx::Regexp::Constant qw{
11							ARRAY_REF
12							CODE_REF
13							HASH_REF
14							LOCATION_LINE
15							LOCATION_CHARACTER
16							LOCATION_COLUMN
17							LOCATION_LOGICAL_LINE
18							MINIMUM_PERL
19							REGEXP_REF
20							TOKEN_LITERAL
21							TOKEN_UNKNOWN
22							@CARP_NOT
23	9			9		82	};
	9					12
24	9			9		4160	use PPIx::Regexp::Token::Assertion ();
	9					27
	9					207
25	9			9		3849	use PPIx::Regexp::Token::Backreference ();
	9					35
	9					265
26	9			9		3636	use PPIx::Regexp::Token::Backtrack ();
	9					25
	9					188
27	9			9		3486	use PPIx::Regexp::Token::CharClass::POSIX ();
	9					23
	9					175
28	9			9		3564	use PPIx::Regexp::Token::CharClass::POSIX::Unknown ();
	9					26
	9					186
29	9			9		3827	use PPIx::Regexp::Token::CharClass::Simple ();
	9					26
	9					185
30	9			9		3537	use PPIx::Regexp::Token::Code ();
	9					26
	9					165
31	9			9		3766	use PPIx::Regexp::Token::Comment ();
	9					28
	9					171
32	9			9		3898	use PPIx::Regexp::Token::Condition ();
	9					25
	9					260
33	9			9		3489	use PPIx::Regexp::Token::Control ();
	9					24
	9					204
34	9			9		3403	use PPIx::Regexp::Token::Delimiter ();
	9					26
	9					196
35	9			9		3168	use PPIx::Regexp::Token::Greediness ();
	9					24
	9					161
36	9			9		3427	use PPIx::Regexp::Token::GroupType::Assertion ();
	9					27
	9					225
37	9			9		3569	use PPIx::Regexp::Token::GroupType::Atomic_Script_Run ();
	9					24
	9					193
38	9			9		3421	use PPIx::Regexp::Token::GroupType::BranchReset ();
	9					23
	9					190
39	9			9		3587	use PPIx::Regexp::Token::GroupType::Code ();
	9					22
	9					191
40	9			9		3626	use PPIx::Regexp::Token::GroupType::Modifier ();
	9					22
	9					182
41	9			9		3290	use PPIx::Regexp::Token::GroupType::NamedCapture ();
	9					25
	9					167
42	9			9		3169	use PPIx::Regexp::Token::GroupType::Script_Run ();
	9					21
	9					182
43	9			9		3493	use PPIx::Regexp::Token::GroupType::Subexpression ();
	9					20
	9					179
44	9			9		3507	use PPIx::Regexp::Token::GroupType::Switch ();
	9					25
	9					270
45	9			9		3817	use PPIx::Regexp::Token::Interpolation ();
	9					26
	9					215
46	9			9		4159	use PPIx::Regexp::Token::Literal ();
	9					25
	9					238
47	9			9		54	use PPIx::Regexp::Token::Modifier ();
	9					14
	9					102
48	9			9		3765	use PPIx::Regexp::Token::Operator ();
	9					20
	9					176
49	9			9		3524	use PPIx::Regexp::Token::Quantifier ();
	9					24
	9					183
50	9			9		44	use PPIx::Regexp::Token::Recursion ();
	9					13
	9					94
51	9			9		25	use PPIx::Regexp::Token::Structure ();
	9					11
	9					85
52	9			9		3550	use PPIx::Regexp::Token::Unknown ();
	9					25
	9					153
53	9			9		3390	use PPIx::Regexp::Token::Whitespace ();
	9					27
	9					208
54	9					438	use PPIx::Regexp::Util qw{
55							is_ppi_regexp_element
56							__instance
57	9			9		40	};
	9					15
58
59	9			9		38	use Scalar::Util qw{ looks_like_number };
	9					13
	9					48528
60
61							our $VERSION = '0.091';
62
63							our $DEFAULT_POSTDEREF;
64							defined $DEFAULT_POSTDEREF
65							or $DEFAULT_POSTDEREF = 1;
66
67							{
68							# Names of classes containing tokenization machinery. There are few
69							# known ordering requirements, since each class recognizes its own,
70							# and I have tried to prevent overlap. Absent such constraints, the
71							# order is in perceived frequency of acceptance, to keep the search
72							# as short as possible. If I were conscientious I would gather
73							# statistics on this.
74							my @classes = ( # TODO make readonly when acceptable way appears
75							'PPIx::Regexp::Token::Literal',
76							'PPIx::Regexp::Token::Interpolation',
77							'PPIx::Regexp::Token::Control', # Note 1
78							'PPIx::Regexp::Token::CharClass::Simple', # Note 2
79							'PPIx::Regexp::Token::Quantifier',
80							'PPIx::Regexp::Token::Greediness',
81							'PPIx::Regexp::Token::CharClass::POSIX', # Note 3
82							'PPIx::Regexp::Token::Structure',
83							'PPIx::Regexp::Token::Assertion',
84							'PPIx::Regexp::Token::Backreference',
85							'PPIx::Regexp::Token::Operator', # Note 4
86							);
87
88							# Note 1: If we are in quote mode ( \Q ... \E ), Control makes a
89							# literal out of anything it sees other than \E. So it
90							# needs to come before almost all other tokenizers. Not
91							# Literal, which already makes literals, and not
92							# Interpolation, which is legal in quote mode, but
93							# everything else.
94
95							# Note 2: CharClass::Simple must come after Literal, because it
96							# relies on Literal to recognize a Unicode named character
97							# ( \N{something} ), so any \N that comes through to it
98							# must be the \N simple character class (which represents
99							# anything but a newline, and was introduced in Perl
100							# 5.11.0.
101
102							# Note 3: CharClass::POSIX has to come before Structure, since both
103							# look for square brackets, and CharClass::POSIX is the
104							# more particular.
105
106							# Note 4: Operator relies on Literal making the characters literal
107							# if they appear in a context where they can not be
108							# operators, and Control making them literals if quoting,
109							# so it must come after both.
110
111							# Return the declared tokenizer classes.
112							sub __tokenizer_classes {
113	543			543		2217	return @classes;
114							}
115
116							}
117
118							{
119							my $errstr;
120
121							sub new {
122	742			742	1	429994	my ( $class, $re, %args ) = @_;
123	742	50				2056	ref $class and $class = ref $class;
124
125	742					1281	$errstr = undef;
126
127							exists $args{default_modifiers}
128							and ARRAY_REF ne ref $args{default_modifiers}
129	742	50	66			2230	and do {
130	0					0	$errstr = 'default_modifiers must be an array reference';
131	0					0	return;
132							};
133
134							my $self = {
135							index_locations => $args{index_locations}, # Index locations
136							capture => undef, # Captures from find_regexp.
137							content => undef, # The string we are tokenizing.
138							cookie => {}, # Cookies
139							cursor_curr => 0, # The current position in the string.
140							cursor_limit => undef, # The end of the portion of the
141							# string being tokenized.
142							cursor_orig => undef, # Position of cursor when tokenizer
143							# called. Used by get_token to prevent
144							# recursion.
145							cursor_modifiers => undef, # Position of modifiers.
146							default_modifiers => $args{default_modifiers} \|\| [],
147							delimiter_finish => undef, # Finishing delimiter of regexp.
148							delimiter_start => undef, # Starting delimiter of regexp.
149							encoding => $args{encoding}, # Character encoding.
150							expect => undef, # Extra classes to expect.
151							expect_next => undef, # Extra classes as of next parse cycle
152							failures => 0, # Number of parse failures.
153							find => undef, # String for find_regexp
154							known => {}, # Known tokenizers, by mode.
155							location => $args{location},
156							match => undef, # Match from find_regexp.
157							mode => 'init', # Initialize
158							modifiers => [{}], # Modifier hash.
159							pending => [], # Tokens made but not returned.
160							prior => TOKEN_UNKNOWN, # Prior significant token.
161							source => $re, # The object we were initialized with.
162							strict => $args{strict}, # like "use re 'strict';".
163							trace => __PACKAGE__->__defined_or(
164	742		100			9082	$args{trace}, $ENV{PPIX_REGEXP_TOKENIZER_TRACE}, 0 ),
165							};
166
167	742	100				2696	if ( __instance( $re, 'PPI::Element' ) ) {
		100
168	11	50				34	is_ppi_regexp_element( $re )
169							or return __set_errstr( ref $re, 'not supported by', $class );
170							# TODO conditionalizstion on PPI class does not really
171							# belong here, but at the moment I have no other idea of
172							# where to put it.
173	11	50				82	$self->{content} = $re->isa( 'PPI::Token::HereDoc' ) ?
174							join( '', $re->content(), "\n", $re->heredoc(),
175							$re->terminator(), "\n" ) :
176							$re->content();
177							} elsif ( ref $re ) {
178	2					5	return __set_errstr( ref $re, 'not supported' );
179							} else {
180	729					1245	$self->{content} = $re;
181							}
182
183	740					1549	bless $self, $class;
184
185	740					2271	$self->{content} = $self->decode( $self->{content} );
186
187	740					1556	$self->{cursor_limit} = length $self->{content};
188
189							$self->{trace}
190	740	50				1758	and warn "\ntokenizing '$self->{content}'\n";
191
192	740					2132	return $self;
193							}
194
195							sub __set_errstr {
196	2			2		6	$errstr = join ' ', @_;
197	2					12	return;
198							}
199
200							sub errstr {
201	2			2	1	5	return $errstr;
202							}
203
204							}
205
206							sub capture {
207	715			715	1	1181	my ( $self ) = @_;
208	715	100				1433	$self->{capture} or return;
209	694	50				1360	defined wantarray or return;
210	694	50				1200	return wantarray ? @{ $self->{capture} } : $self->{capture};
	694					2681
211							}
212
213							sub content {
214	1			1	1	2	my ( $self ) = @_;
215	1					4	return $self->{content};
216							}
217
218							sub cookie {
219	10182			10182	1	13695	my ( $self, $name, @args ) = @_;
220	10182	50				13627	defined $name
221							or confess "Programming error - undefined cookie name";
222	10182	50				13755	if ( $self->{trace} ) {
223	0					0	local $" = ', ';
224	0					0	warn "cookie( '$name', @args )\n";
225							}
226	10182	100				25865	@args or return $self->{cookie}{$name};
227	721					1207	my $cookie = shift @args;
228	721	100				1764	if ( CODE_REF eq ref $cookie ) {
		50
229	593					2246	return ( $self->{cookie}{$name} = $cookie );
230							} elsif ( defined $cookie ) {
231	0					0	confess "Programming error - cookie must be CODE ref or undef";
232							} else {
233	128					410	return delete $self->{cookie}{$name};
234							}
235							}
236
237							# NOTE: Currently this is called only against
238							# COOKIE_LOOKAROUND_ASSERTION, once in PPIx::Token::GroupType::Assertion
239							# to prevent the cookie from being remade if it already exists, and once
240							# in PPIx::Regexp::Token::Assertion to determine if \K is inside a
241							# lookaround assertion. If it gets used other places, or if there is
242							# call for it, I should consider removing the underscores and
243							# documenting it as public.
244							sub __cookie_exists {
245	57			57		109	my ( $self, $name ) = @_;
246	57	50				121	defined $name
247							or confess "Programming error - undefined cookie name";
248	57					189	return $self->{cookie}{$name};
249							}
250
251							sub default_modifiers {
252	0			0	1	0	my ( $self ) = @_;
253	0					0	return [ @{ $self->{default_modifiers} } ];
	0					0
254							}
255
256							sub __effective_modifiers {
257	334			334		597	my ( $self ) = @_;
258							HASH_REF eq ref $self->{effective_modifiers}
259	334	100				966	or return {};
260	326					508	return { %{ $self->{effective_modifiers} } };
	326					1101
261							}
262
263							sub encoding {
264	0			0	1	0	my ( $self ) = @_;
265	0					0	return $self->{encoding};
266							}
267
268							sub expect {
269	330			330	1	1024	my ( $self, @args ) = @_;
270
271							@args
272	330	50				624	or return;
273
274							$self->{expect_next} = [
275	330	50				603	map { m/ \A PPIx::Regexp:: /smx ? $_ : 'PPIx::Regexp::' . $_ }
	2602					5136
276							@args
277							];
278	330					680	$self->{expect} = undef;
279	330					618	return;
280							}
281
282							sub failures {
283	8			8	1	16	my ( $self ) = @_;
284	8					16	return $self->{failures};
285							}
286
287							sub find_matching_delimiter {
288	589			589	1	937	my ( $self ) = @_;
289	589		100			1832	$self->{cursor_curr} \|\|= 0;
290							my $start = substr
291							$self->{content},
292							$self->{cursor_curr},
293	589					1276	1;
294
295	589					858	my $inx = $self->{cursor_curr};
296	589		66			1623	my $finish = (
297							my $bracketed = $self->close_bracket( $start ) ) \|\| $start;
298
299							=begin comment
300
301							$self->{trace}
302							and warn "Find matching delimiter: Start with '$start' at $self->{cursor_curr}, end with '$finish' at or before $self->{cursor_limit}\n";
303
304							=end comment
305
306							=cut
307
308	589					900	my $nest = 0;
309
310	589					1599	while ( ++$inx < $self->{cursor_limit} ) {
311	6144					7257	my $char = substr $self->{content}, $inx, 1;
312
313							=begin comment
314
315							$self->{trace}
316							and warn " looking at '$char' at $inx, nest level $nest\n";
317
318							=end comment
319
320							=cut
321
322	6144	100	100			16155	if ( $char eq '\\' && $finish ne '\\' ) {
		100	100
		100
323	317					522	++$inx;
324							} elsif ( $bracketed && $char eq $start ) {
325	1					2	++$nest;
326							} elsif ( $char eq $finish ) {
327							--$nest < 0
328	588	100				2294	and return $inx - $self->{cursor_curr};
329							}
330							}
331
332	2					6	return;
333							}
334
335							sub find_regexp {
336	16627			16627	1	20345	my ( $self, $regexp ) = @_;
337
338	16627	50	0			25574	REGEXP_REF eq ref $regexp
339							or confess
340							'Argument is a ', ( ref $regexp \|\| 'scalar' ), ' not a Regexp';
341
342	16627	100				27118	defined $self->{find} or $self->_remainder();
343
344	16627	100				66695	$self->{find} =~ $regexp
345							or return;
346
347	1848					2332	my @capture;
348	1848					5190	foreach my $inx ( 0 .. $#+ ) {
349	4267	100	66			13690	if ( defined $-[$inx] && defined $+[$inx] ) {
350							push @capture, $self->{capture} = substr
351							$self->{find},
352	3778					15986	$-[$inx],
353							$+[$inx] - $-[$inx];
354							} else {
355	489					783	push @capture, undef;
356							}
357							}
358	1848					3405	$self->{match} = shift @capture;
359	1848					2871	$self->{capture} = \@capture;
360
361							# The following circumlocution seems to be needed under Perl 5.13.0
362							# for reasons I do not fathom -- at least in the case where
363							# wantarray is false. RT 56864 details the symptoms, which I was
364							# never able to reproduce outside Perl::Critic. But returning $+[0]
365							# directly, the value could transmogrify between here and the
366							# calling module.
367							## my @data = ( $-[0], $+[0] );
368							## return wantarray ? @data : $data[1];
369	1848	50				6379	return wantarray ? ( $-[0] + 0, $+[0] + 0 ) : $+[0] + 0;
370							}
371
372							sub get_mode {
373	2598			2598	1	3260	my ( $self ) = @_;
374	2598					5077	return $self->{mode};
375							}
376
377							sub get_start_delimiter {
378	1794			1794	1	2036	my ( $self ) = @_;
379	1794					4983	return $self->{delimiter_start};
380							}
381
382							sub get_token {
383	4133			4133	1	5170	my ( $self ) = @_;
384
385							caller eq __PACKAGE__ or $self->{cursor_curr} > $self->{cursor_orig}
386	4133	50	66			10247	or confess 'Programming error - get_token() called without ',
387							'first calling make_token()';
388
389	4133					6316	my $handler = '__PPIX_TOKENIZER__' . $self->{mode};
390
391							my $code = $self->can( $handler )
392							or confess 'Programming error - ',
393							"Getting token in mode '$self->{mode}'. ",
394							"cursor_curr = $self->{cursor_curr}; ",
395							"cursor_limit = $self->{cursor_limit}; ",
396							"length( content ) = ", length $self->{content},
397	4133	50				11779	"; content = '$self->{content}'";
398
399							my $character = substr(
400							$self->{content},
401							$self->{cursor_curr},
402	4133					7640	1
403							);
404
405							$self->{trace}
406	4133	50				7265	and warn "get_token() got '$character' from $self->{cursor_curr}\n";
407
408	4133					7034	return ( $code->( $self, $character ) );
409							}
410
411							sub interpolates {
412	141			141	1	230	my ( $self ) = @_;
413	141					474	return $self->{delimiter_start} ne q{'};
414							}
415
416							sub make_token {
417	5243			5243	1	8672	my ( $self, $length, $class, $arg ) = @_;
418	5243	100				7935	defined $class or $class = caller;
419
420	5243	50				9691	if ( $length + $self->{cursor_curr} > $self->{cursor_limit} ) {
421							$length = $self->{cursor_limit} - $self->{cursor_curr}
422	0	0				0	or return;
423							}
424
425	5243	50				12819	$class =~ m/ \A PPIx::Regexp:: /smx
426							or $class = 'PPIx::Regexp::' . $class;
427							my $content = substr
428							$self->{content},
429							$self->{cursor_curr},
430	5243					8622	$length;
431
432							$self->{trace}
433	5243	50				7863	and warn "make_token( $length, '$class' ) => '$content'\n";
434	5243	50				7894	$self->{trace} > 1
435							and warn " make_token: cursor_curr = $self->{cursor_curr}; ",
436							"cursor_limit = $self->{cursor_limit}\n";
437							my $token = $class->__new( $content,
438							tokenizer => $self,
439	5243	100				6327	%{ $arg \|\| {} } )
	5243	50				23979
440							or return;
441
442							$self->{index_locations}
443	5243	100				11023	and $self->_update_location( $token );
444
445							$token->significant()
446	5243	100				11087	and $self->{expect} = undef;
447
448	5243	100				18333	$token->isa( TOKEN_UNKNOWN ) and $self->{failures}++;
449
450	5243					6786	$self->{cursor_curr} += $length;
451	5243					6281	$self->{find} = undef;
452	5243					5979	$self->{match} = undef;
453	5243					6285	$self->{capture} = undef;
454
455	5243					6260	foreach my $name ( keys %{ $self->{cookie} } ) {
	5243					9742
456	3615					4418	my $cookie = $self->{cookie}{$name};
457							$cookie->( $self, $token )
458	3615	100				6492	or delete $self->{cookie}{$name};
459							}
460
461							# Record this token as the prior token if it is significant. We must
462							# do this after processing cookies, so that the cookies have access
463							# to the old token if they want.
464							$token->significant()
465	5243	100				8390	and $self->{prior_significant_token} = $token;
466
467	5243					15786	return $token;
468							}
469
470							sub match {
471	86			86	1	150	my ( $self ) = @_;
472	86					175	return $self->{match};
473							}
474
475							sub modifier {
476	4932			4932	1	6663	my ( $self, $modifier ) = @_;
477							return PPIx::Regexp::Token::Modifier::__asserts(
478	4932					9618	$self->{modifiers}[-1], $modifier );
479							}
480
481							sub modifier_duplicate {
482	292			292	1	442	my ( $self ) = @_;
483	292					573	push @{ $self->{modifiers} },
484	292					397	{ %{ $self->{modifiers}[-1] } };
	292					855
485	292					477	return;
486							}
487
488							sub modifier_modify {
489	595			595	1	1322	my ( $self, %args ) = @_;
490
491							# Modifier code is centralized in PPIx::Regexp::Token::Modifier
492							$self->{modifiers}[-1] =
493							PPIx::Regexp::Token::Modifier::__PPIX_TOKENIZER__modifier_modify(
494	595					1915	$self->{modifiers}[-1], \%args );
495
496	595					1015	return;
497
498							}
499
500							sub modifier_pop {
501	288			288	1	492	my ( $self ) = @_;
502	288					747	@{ $self->{modifiers} } > 1
503	288	100				328	and pop @{ $self->{modifiers} };
	282					603
504	288					666	return;
505							}
506
507							sub modifier_seen {
508	8			8	1	25	my ( $self, $modifier ) = @_;
509	8					15	foreach my $mod ( reverse @{ $self->{modifiers} } ) {
	8					24
510	10	100				35	exists $mod->{$modifier}
511							and return 1;
512							}
513	5					23	return;
514							}
515
516							sub next_token {
517	5780			5780	1	7011	my ( $self ) = @_;
518
519							{
520
521	5780	100				6357	if ( @{ $self->{pending} } ) {
	9896					9693
	9896					15389
522	5241					5305	return shift @{ $self->{pending} };
	5241					12984
523							}
524
525	4655	100				8421	if ( $self->{cursor_curr} >= $self->{cursor_limit} ) {
526							$self->{cursor_limit} >= length $self->{content}
527	1099	100				3059	and return;
528	560	50				1310	$self->{mode} eq 'finish' and return;
529	560					1442	$self->_set_mode( 'finish' );
530	560					899	$self->{cursor_limit} += length $self->{delimiter_finish};
531							}
532
533	4116	50				7397	if ( my @tokens = $self->get_token() ) {
534	4116					4544	push @{ $self->{pending} }, @tokens;
	4116					6564
535	4116					6007	redo;
536
537							}
538
539							}
540
541	0					0	return;
542
543							}
544
545							sub peek {
546	379			379	1	606	my ( $self, $offset ) = @_;
547	379	100				712	defined $offset or $offset = 0;
548	379	50				684	$offset < 0 and return;
549	379					544	$offset += $self->{cursor_curr};
550	379	50				644	$offset >= $self->{cursor_limit} and return;
551	379					1184	return substr $self->{content}, $offset, 1;
552							}
553
554							sub ppi_document {
555	83			83	1	156	my ( $self ) = @_;
556
557	83	50				156	defined $self->{find} or $self->_remainder();
558
559	83					467	return PPI::Document->new( \"$self->{find}" );
560							}
561
562							sub prior_significant_token {
563	2413			2413	1	3302	my ( $self, $method, @args ) = @_;
564	2413	100				3260	defined $method or return $self->{prior_significant_token};
565							$self->{prior_significant_token}->can( $method )
566							or confess 'Programming error - ',
567							( ref $self->{prior_significant_token} \|\|
568	2394	50	0			6943	$self->{prior_significant_token} ),
569							' does not support method ', $method;
570	2394					5897	return $self->{prior_significant_token}->$method( @args );
571							}
572
573							# my $length = $token->__recognize_postderef( $tokenizer, $iterator ).
574							#
575							# This method is private to the PPIx-Regexp package, and may be changed
576							# or retracted without warning. What it does is to recognize postfix
577							# dereferences. It returns the length in characters of the first postfix
578							# dereference found, or a false value if none is found.
579							#
580							# The optional $iterator argument can be one of the following:
581							# - A code reference, which will be called to provide PPI::Element
582							# objects to be checked to see if they represent a postfix
583							# dereference.
584							# - A PPI::Element, which is checked to see if it is a postfix
585							# dereference.
586							# - Undef, or omitted, in which case ppi() is called on the invocant,
587							# and everything that follows the '->' operator is checked to see if
588							# it is a postfix dereference.
589							# - Anything else results in an exception and stack trace.
590
591							{
592							sub __recognize_postderef {
593	149			149		251	my ( $self, $token, $iterator ) = @_;
594
595							# Note that if ppi() gets called I have to hold a reference to
596							# the returned object until I am done with all its children.
597	149					191	my $ppi;
598	149	100				305	if ( ! defined $iterator ) {
		50
		0
599
600							# This MUST be done before ppi() is called.
601							$self->{index_locations}
602	145	100				297	and $self->_update_location( $token );
603
604	145					410	$ppi = $token->ppi();
605	29					6362	my @ops = grep { '->' eq $_->content() } @{
606	145	100				256	$ppi->find( 'PPI::Token::Operator' ) \|\| [] };
	145					458
607							$iterator = sub {
608	151	100		151		625	my $op = shift @ops
609							or return;
610	15					55	return $op->snext_sibling();
611	145					31538	};
612							} elsif ( $iterator->isa( 'PPI::Element' ) ) {
613	4					9	my @eles = ( $iterator );
614							$iterator = sub {
615	4			4		10	return shift @eles;
616	4					13	};
617							} elsif ( CODE_REF ne ref $iterator ) {
618	0					0	confess 'Programming error - Iterator not understood';
619							}
620
621	149					675	my $accept = $token->__postderef_accept_cast();
622
623	149					296	while ( my $elem = $iterator->() ) {
624
625	19					407	my $content = $elem->content();
626
627							# As of PPI 1.238, all postfix dereferences are parsed as
628							# casts. So if we find a cast of the correct content we have
629							# a postfix deref.
630	19	100				168	$elem->isa( 'PPI::Token::Cast' )
631							or next;
632
633	15	100				85	if ( $content =~ m/ ( .* ) \* \z /smx ) {
		50
634							# If we're an acceptable cast ending in a glob, accept
635							# it.
636	10	100				90	$accept->{$1}
637							and return length $content;
638							} elsif ( $accept->{$content} ) {
639							# If we're an acceptable cast followed by a subscript,
640							# we're a slice -- accept both cast and subscript.
641	5	50				16	my $next = $elem->snext_sibling()
642							or next;
643	5	50				84	$next->isa( 'PPI::Structure::Subscript' )
644							or next;
645	5					17	return length( $content ) + length( $next->content() );
646							}
647
648							# Otherwise, we're not a postfix dereference; try the next
649							# iteration.
650							}
651
652							# No postfix dereference found.
653	136					681	return;
654							}
655							}
656
657							sub significant {
658	0			0	1	0	return 1;
659							}
660
661							sub strict {
662	4			4	1	7	my ( $self ) = @_;
663	4					20	return $self->{strict};
664							}
665
666							sub _known_tokenizers {
667	3036			3036		3743	my ( $self ) = @_;
668
669	3036					3546	my $mode = $self->{mode};
670
671	3036					3394	my @expect;
672	3036	100				5051	if ( $self->{expect_next} ) {
673	328					537	$self->{expect} = $self->{expect_next};
674	328					534	$self->{expect_next} = undef;
675							}
676	3036	100				4903	if ( $self->{expect} ) {
677							@expect = $self->_known_tokenizer_check(
678	334					470	@{ $self->{expect} } );
	334					809
679							}
680
681							exists $self->{known}{$mode} and return (
682	3036	100				6371	@expect, @{ $self->{known}{$mode} } );
	2493					7574
683
684	543					1346	my @found = $self->_known_tokenizer_check(
685							$self->__tokenizer_classes() );
686
687	543					1809	$self->{known}{$mode} = \@found;
688	543					1660	return (@expect, @found);
689							}
690
691							sub _known_tokenizer_check {
692	877			877		2566	my ( $self, @args ) = @_;
693
694	877					1190	my $handler = '__PPIX_TOKENIZER__' . $self->{mode};
695	877					1085	my @found;
696
697	877					1221	foreach my $class ( @args ) {
698
699	8611	100				33493	$class->can( $handler ) or next;
700	8408					10970	push @found, $class;
701
702							}
703
704	877					3385	return @found;
705							}
706
707							sub tokens {
708	205			205	1	450	my ( $self ) = @_;
709
710	205					300	my @rslt;
711	205					738	while ( my $token = $self->next_token() ) {
712	1933					3631	push @rslt, $token;
713							}
714
715	205					1206	return @rslt;
716							}
717
718							# $self->_deprecation_notice( $type, $name );
719							#
720							# This method centralizes deprecation. Type is 'attribute' or
721							# 'method'. Deprecation is driven of the %deprecate hash. Values
722							# are:
723							# false - no warning
724							# 1 - warn on first use
725							# 2 - warn on each use
726							# 3 - die on each use.
727							#
728							# $self->_deprecation_in_progress( $type, $name )
729							#
730							# This method returns true if the deprecation is in progress. In
731							# fact it returns the deprecation level.
732
733							=begin comment
734
735							{
736
737							my %deprecate = (
738							attribute => {
739							postderef => 3,
740							},
741							);
742
743							sub _deprecation_notice {
744							my ( undef, $type, $name, $repl ) = @_; # Invocant unused
745							$deprecate{$type} or return;
746							$deprecate{$type}{$name} or return;
747							my $msg = sprintf 'The %s %s is %s', $name, $type,
748							$deprecate{$type}{$name} > 2 ? 'removed' : 'deprecated';
749							defined $repl
750							and $msg .= "; use $repl instead";
751							$deprecate{$type}{$name} >= 3
752							and croak $msg;
753							warnings::enabled( 'deprecated' )
754							and carp $msg;
755							$deprecate{$type}{$name} == 1
756							and $deprecate{$type}{$name} = 0;
757							return;
758							}
759
760							sub _deprecation_in_progress {
761							my ( $self, $type, $name ) = @_;
762							$deprecate{$type} or return;
763							return $deprecate{$type}{$name};
764							}
765
766							}
767
768							=end comment
769
770							=cut
771
772							sub _remainder {
773	3623			3623		4530	my ( $self ) = @_;
774
775							$self->{cursor_curr} > $self->{cursor_limit}
776	3623	50				6404	and confess "Programming error - Trying to find past end of string";
777							$self->{find} = substr(
778							$self->{content},
779							$self->{cursor_curr},
780							$self->{cursor_limit} - $self->{cursor_curr}
781	3623					7473	);
782
783	3623					4716	return;
784							}
785
786							sub _make_final_token {
787	10			10		22	my ( $self, $len, $class, $arg ) = @_;
788	10					26	my $token = $self->make_token( $len, $class, $arg );
789	10					45	$self->_set_mode( 'kaput' );
790	10					57	return $token;
791							}
792
793							sub _set_mode {
794	1657			1657		2699	my ( $self, $mode ) = @_;
795							$self->{trace}
796	1657	50				2946	and warn "Tokenizer going from mode $self->{mode} to $mode\n";
797	1657					2316	$self->{mode} = $mode;
798	1657	100				2889	if ( 'kaput' eq $mode ) {
799							$self->{cursor_curr} = $self->{cursor_limit} =
800	537					1061	length $self->{content};
801							}
802	1657					2259	return;
803							}
804
805							sub __init_error {
806	10			10		16	my ( $self , $err ) = @_;
807	10	100				26	defined $err
808							or $err = 'Tokenizer found illegal first characters';
809							return $self->_make_final_token(
810	10					55	length $self->{content}, TOKEN_UNKNOWN, {
811							error => $err,
812							},
813							);
814							}
815
816							sub _update_location {
817	107			107		156	my ( $self, $token ) = @_;
818							$token->{location} # Idempotent
819	107	100				187	and return;
820	105		66			213	my $loc = $self->{_location} \|\|= do {
821							my %loc = (
822							location => $self->{location},
823	12					34	);
824	12	100				40	if ( __instance( $self->{source}, 'PPI::Element' ) ) {
825	11		33			77	$loc{location} \|\|= $self->{source}->location();
826	11	50				1395	if ( my $doc = $self->{source}->document() ) {
827	11					250	$loc{tab_width} = $doc->tab_width();
828							}
829							}
830	12		100			70	$loc{tab_width} \|\|= 1;
831	12					30	\%loc;
832							};
833							$loc->{location}
834	105	50				197	or return;
835	105					104	$token->{location} = [ @{ $loc->{location} } ];
	105					220
836	105	50				208	if ( defined( my $content = $token->content() ) ) {
837
838	105					124	my $lines;
839	105					214	pos( $content ) = 0;
840	105					238	$lines++ while $content =~ m/ \n /smxgc;
841	105	100				160	if ( pos $content ) {
842	2					3	$loc->{location}[LOCATION_LINE] += $lines;
843	2					4	$loc->{location}[LOCATION_LOGICAL_LINE] += $lines;
844							$loc->{location}[LOCATION_CHARACTER] =
845	2					3	$loc->{location}[LOCATION_COLUMN] = 1;
846							}
847
848	105	100				205	if ( my $chars = length( $content ) - pos( $content ) ) {
849	102					120	$loc->{location}[LOCATION_CHARACTER] += $chars;
850	102	100	100			197	if ( $loc->{tab_width} > 1 && $content =~ m/ \t /smx ) {
851	5					8	my $pos = $loc->{location}[LOCATION_COLUMN];
852	5					6	my $tab_width = $loc->{tab_width};
853							# Stolen shamelessly from PPI::Document::_visual_length
854	5					6	my ( $vis_inc );
855	5					14	foreach my $part ( split /(\t)/, $content ) {
856	10	100				14	if ($part eq "\t") {
857	5					6	$vis_inc = $tab_width - ($pos-1) % $tab_width;
858							} else {
859	5					6	$vis_inc = length $part;
860							}
861	10					10	$pos += $vis_inc;
862							}
863	5					10	$loc->{location}[LOCATION_COLUMN] = $pos;
864							} else {
865	97					133	$loc->{location}[LOCATION_COLUMN] += $chars;
866							}
867							}
868
869							}
870	105					175	return;
871							}
872
873							sub __PPIX_TOKENIZER__init {
874	537			537		1062	my ( $self ) = @_;
875
876	537	50				2879	$self->find_regexp(
877							qr{ \A ( \s* ) ( qr \| m \| s )? ( \s* ) ( . ) }smx )
878							or return $self->__init_error();
879
880	537					2114	my ( $leading_white, $type, $next_white, $delim_start ) = $self->capture();
881
882	537	100				1577	defined $type
883							or $type = '';
884
885	537	100	100			2443	$type
886							or $delim_start =~ m< \A [/?] \z >smx
887							or return $self->__init_error();
888	531	100	100			2135	$type
			100
889							and not $next_white
890							and $delim_start =~ m< \A \w \z >smx
891							and return $self->__init_error();
892
893	529					1274	$self->{type} = $type;
894
895	529					830	my @tokens;
896
897	529	100				1568	'' ne $leading_white
898							and push @tokens, $self->make_token( length $leading_white,
899							'PPIx::Regexp::Token::Whitespace' );
900	529					1830	push @tokens, $self->make_token( length $type,
901							'PPIx::Regexp::Token::Structure' );
902	529	100				1207	'' ne $next_white
903							and push @tokens, $self->make_token( length $next_white,
904							'PPIx::Regexp::Token::Whitespace' );
905
906	529					1144	$self->{delimiter_start} = $delim_start;
907
908							$self->{trace}
909	529	50				1357	and warn "Tokenizer found regexp start delimiter '$delim_start' at $self->{cursor_curr}\n";
910
911	529	50				1366	if ( my $offset = $self->find_matching_delimiter() ) {
912	529					1047	my $cursor_limit = $self->{cursor_curr} + $offset;
913							$self->{trace}
914	529	50				1127	and warn "Tokenizer found regexp end delimiter at $cursor_limit\n";
915	529	100				1562	if ( $self->__number_of_extra_parts() ) {
916							### my $found_embedded_comments;
917	46	100				152	if ( $self->close_bracket(
918							$self->{delimiter_start} ) ) {
919							pos $self->{content} = $self->{cursor_curr} +
920	7					39	$offset + 1;
921							# If we're bracketed, there may be Perl comments between
922							# the regex and the replacement. PPI gets the parse
923							# wrong as of 1.220, but if we get the handling of the
924							# underlying string right, we will Just Work when PPI
925							# gets it right.
926	7					54	while ( $self->{content} =~
927							m/ \G \s* \n \s* \# [^\n]* /smxgc ) {
928							## $found_embedded_comments = 1;
929							}
930	7					27	$self->{content} =~ m/ \s* /smxgc;
931							} else {
932							pos $self->{content} = $self->{cursor_curr} +
933	39					199	$offset;
934							}
935							# Localizing cursor_curr and delimiter_start would be
936							# cleaner, but I don't want the old values restored if a
937							# parse error occurs.
938	46					109	my $cursor_curr = $self->{cursor_curr};
939	46					132	my $delimiter_start = $self->{delimiter_start};
940	46					114	$self->{cursor_curr} = pos $self->{content};
941							$self->{delimiter_start} = substr
942							$self->{content},
943							$self->{cursor_curr},
944	46					112	1;
945							$self->{trace}
946	46	50				126	and warn "Tokenizer found replacement start delimiter '$self->{delimiter_start}' at $self->{cursor_curr}\n";
947	46	100				101	if ( my $s_off = $self->find_matching_delimiter() ) {
948							$self->{cursor_modifiers} =
949	44					112	$self->{cursor_curr} + $s_off + 1;
950							$self->{trace}
951	44	50				143	and warn "Tokenizer found replacement end delimiter at @{[
952	0					0	$self->{cursor_curr} + $s_off ]}\n";
953	44					78	$self->{cursor_curr} = $cursor_curr;
954	44					93	$self->{delimiter_start} = $delimiter_start;
955							} else {
956							$self->{trace}
957	2	50				6	and warn 'Tokenizer failed to find replacement',
958							"end delimiter starting at $self->{cursor_curr}\n";
959	2					3	$self->{cursor_curr} = 0;
960							# TODO If I were smart enough here I could check for
961							# PPI mis-parses like s{foo}
962							# #{bar}
963							# {baz}
964							# here, doing so if $found_embedded_comments (commented
965							# out above) is true. The problem is that there seem to
966							# as many mis-parses as there are possible delimiters.
967	2					6	return $self->__init_error(
968							'Tokenizer found mismatched replacement delimiters',
969							);
970							}
971							} else {
972	483					888	$self->{cursor_modifiers} = $cursor_limit + 1;
973							}
974	527					1079	$self->{cursor_limit} = $cursor_limit;
975							} else {
976	0					0	$self->{cursor_curr} = 0;
977							return $self->_make_final_token(
978	0					0	length( $self->{content} ), TOKEN_UNKNOWN, {
979							error => 'Tokenizer found mismatched regexp delimiters',
980							},
981							);
982							}
983
984							{
985							# We have to instantiate the trailing tokens now so we can
986							# figure out what modifiers are in effect. But we can't
987							# index their locations (if desired) because they are being
988							# instantiated out of order
989
990	527					697	local $self->{index_locations} = 0;
	527					1136
991
992	527					723	my @mods = @{ $self->{default_modifiers} };
	527					1050
993	527					1751	pos $self->{content} = $self->{cursor_modifiers};
994	527					1149	local $self->{cursor_curr} = $self->{cursor_modifiers};
995	527					1261	local $self->{cursor_limit} = length $self->{content};
996	527					827	my @trailing;
997							{
998	527					655	my $len = $self->find_regexp( qr{ \A [[:lower:]]* }smx );
	527					1881
999	527					1469	push @trailing, $self->make_token( $len,
1000							'PPIx::Regexp::Token::Modifier' );
1001							}
1002	527	100				1597	if ( my $len = $self->find_regexp( qr{ \A \s+ }smx ) ) {
1003	1					4	push @trailing, $self->make_token( $len,
1004							'PPIx::Regexp::Token::Whitespace' );
1005							}
1006	527	100				1741	if ( my $len = $self->find_regexp( qr{ \A .+ }smx ) ) {
1007	1					5	push @trailing, $self->make_token( $len, TOKEN_UNKNOWN, {
1008							error => 'Trailing characters after expression',
1009							} );
1010							}
1011	527					1508	$self->{trailing_tokens} = \@trailing;
1012	527					1359	push @mods, $trailing[0]->content();
1013							$self->{effective_modifiers} =
1014	527					1235	PPIx::Regexp::Token::Modifier::__aggregate_modifiers (
1015							@mods );
1016							$self->{modifiers} = [
1017	527					1042	{ %{ $self->{effective_modifiers} } },
	527					2481
1018							];
1019							}
1020
1021							$self->{delimiter_finish} = substr
1022							$self->{content},
1023							$self->{cursor_limit},
1024	527					1478	1;
1025
1026	527					1193	push @tokens, $self->make_token( 1,
1027							'PPIx::Regexp::Token::Delimiter' );
1028
1029	527					1738	$self->_set_mode( 'regexp' );
1030
1031	527					803	$self->{find} = undef;
1032
1033	527					2072	return @tokens;
1034							}
1035
1036							# Match the initial part of the regexp including any leading white
1037							# space. The initial delimiter is the first thing not consumed, though
1038							# we check it for sanity.
1039							sub __initial_match {
1040	0			0		0	my ( $self ) = @_;
1041
1042	0	0				0	$self->find_regexp(
1043							qr{ \A ( \s* ) ( qr \| m \| s )? ( \s* ) (?: [^\w\s] ) }smx )
1044							or return;
1045
1046	0					0	my ( $leading_white, $type, $next_white ) = $self->capture();
1047
1048	0	0				0	defined $type
1049							or $type = '';
1050
1051	0					0	$self->{type} = $type;
1052
1053	0					0	my @tokens;
1054
1055	0	0				0	'' ne $leading_white
1056							and push @tokens, $self->make_token( length $leading_white,
1057							'PPIx::Regexp::Token::Whitespace' );
1058	0					0	push @tokens, $self->make_token( length $type,
1059							'PPIx::Regexp::Token::Structure' );
1060	0	0				0	'' ne $next_white
1061							and push @tokens, $self->make_token( length $next_white,
1062							'PPIx::Regexp::Token::Whitespace' );
1063
1064	0					0	return @tokens;
1065							}
1066
1067							{
1068							my %extra_parts = (
1069							s => 1,
1070							);
1071
1072							# Return the number of extra delimited parts. This will be 0 except
1073							# for s///, which will be 1.
1074							sub __number_of_extra_parts {
1075	855			855		1332	my ( $self ) = @_;
1076	855		100			3328	return $extra_parts{$self->{type}} \|\| 0;
1077							}
1078							}
1079
1080							{
1081							my @part_class = qw{
1082							PPIx::Regexp::Structure::Regexp
1083							PPIx::Regexp::Structure::Replacement
1084							};
1085
1086							# Return the classes for the parts of the expression.
1087							sub __part_classes {
1088	326			326		678	my ( $self ) = @_;
1089	326					756	my $max = $self->__number_of_extra_parts();
1090	326					1429	return @part_class[ 0 .. $max ];
1091							}
1092							}
1093
1094							sub __PPIX_TOKENIZER__regexp {
1095	3036			3036		4516	my ( $self, $character ) = @_;
1096
1097	3036					4039	my $mode = $self->{mode};
1098	3036					3828	my $handler = '__PPIX_TOKENIZER__' . $mode;
1099
1100	3036					4244	$self->{cursor_orig} = $self->{cursor_curr};
1101	3036					5333	foreach my $class ( $self->_known_tokenizers() ) {
1102	13478					38113	my @tokens = grep { $_ } $class->$handler( $self, $character );
	3867					7054
1103							$self->{trace}
1104	13478	50				19899	and warn $class, "->$handler( \$self, '$character' )",
1105							" => (@tokens)\n";
1106							@tokens
1107							and return ( map {
1108	13478	100				19996	ref $_ ? $_ : $self->make_token( $_,
	3033	100				8408
1109							$class ) } @tokens );
1110							}
1111
1112							# Find a fallback processor for the character.
1113	27		33			204	my $fallback = __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__' . $mode )
1114							\|\| __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__regexp' )
1115							\|\| confess "Programming error - unable to find fallback for $mode";
1116	27					67	return $fallback->( $self, $character );
1117							}
1118
1119							*__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
1120
1121							sub __PPIX_TOKEN_FALLBACK__regexp {
1122	18			18		39	my ( $self, $character ) = @_;
1123
1124							# As a fallback in regexp mode, any escaped character is a literal.
1125	18	100	66			62	if ( $character eq '\\'
1126							&& $self->{cursor_limit} - $self->{cursor_curr} > 1
1127							) {
1128	2					6	return $self->make_token( 2, TOKEN_LITERAL );
1129							}
1130
1131							# Any normal character is unknown.
1132	16					78	return $self->make_token( 1, TOKEN_UNKNOWN, {
1133							error => 'Tokenizer found unexpected literal',
1134							},
1135							);
1136							}
1137
1138							sub __PPIX_TOKEN_FALLBACK__repl {
1139	9			9		15	my ( $self, $character ) = @_;
1140
1141							# As a fallback in replacement mode, any escaped character is a literal.
1142	9	100	66			28	if ( $character eq '\\'
1143							&& defined ( my $next = $self->peek( 1 ) ) ) {
1144
1145	5	0	33			12	if ( $self->interpolates() \|\| $next eq q<'> \|\| $next eq '\\' ) {
			33
1146	5					12	return $self->make_token( 2, TOKEN_LITERAL );
1147							}
1148	0					0	return $self->make_token( 1, TOKEN_LITERAL );
1149							}
1150
1151							# So is any normal character.
1152	4					9	return $self->make_token( 1, TOKEN_LITERAL );
1153							}
1154
1155							sub __PPIX_TOKENIZER__finish {
1156	560			560		987	my ( $self ) = @_; # $character unused
1157
1158							$self->{cursor_limit} > length $self->{content}
1159	560	50				1439	and confess "Programming error - ran off string";
1160
1161							my @tokens = $self->make_token( length $self->{delimiter_finish},
1162	560					1510	'PPIx::Regexp::Token::Delimiter' );
1163
1164	560	100				1415	if ( $self->{cursor_curr} == $self->{cursor_modifiers} ) {
1165
1166							# We are out of string. Add the trailing tokens (created when we
1167							# did the initial bracket scan) and close up shop.
1168
1169	516					1416	push @tokens, $self->_get_trailing_tokens();
1170
1171	516					996	$self->_set_mode( 'kaput' );
1172
1173							} else {
1174
1175							# Clear the cookies, because we are going around again.
1176	44					167	$self->{cookie} = {};
1177
1178							# Move the cursor limit to just before the modifiers.
1179	44					117	$self->{cursor_limit} = $self->{cursor_modifiers} - 1;
1180
1181							# If the preceding regular expression was bracketed, we need to
1182							# consume possible whitespace and find another delimiter.
1183
1184	44	100				172	if ( $self->close_bracket( $self->{delimiter_start} ) ) {
1185	7					11	my $accept;
1186							# If we are bracketed, there can be honest-to-God Perl
1187							# comments between the regexp and the replacement, not just
1188							# regexp comments. As of version 1.220, PPI does not get
1189							# this parse right, but if we can handle this is a string,
1190							# then we will Just Work when PPI gets itself straight.
1191	7					31	while ( $self->find_regexp(
1192							qr{ \A ( \s* \n \s* ) ( \# [^\n]* \n ) }smx ) ) {
1193	2					7	my ( $white_space, $comment ) = $self->capture();
1194	2					6	push @tokens, $self->make_token(
1195							length $white_space,
1196							'PPIx::Regexp::Token::Whitespace',
1197							), $self->make_token(
1198							length $comment,
1199							'PPIx::Regexp::Token::Comment',
1200							);
1201							}
1202	7	100				26	$accept = $self->find_regexp( qr{ \A \s+ }smx )
1203							and push @tokens, $self->make_token(
1204							$accept, 'PPIx::Regexp::Token::Whitespace' );
1205	7					26	my $character = $self->peek();
1206	7					17	$self->{delimiter_start} = $character;
1207	7					19	push @tokens, $self->make_token(
1208							1, 'PPIx::Regexp::Token::Delimiter' );
1209							$self->{delimiter_finish} = substr
1210							$self->{content},
1211	7					19	$self->{cursor_limit} - 1,
1212							1;
1213							}
1214
1215	44	100				105	if ( $self->modifier( 'e*' ) ) {
1216							# With /e or /ee, the replacement portion is code. We make
1217							# it all into one big PPIx::Regexp::Token::Code, slap on the
1218							# trailing delimiter and modifiers, and return it all.
1219							push @tokens, $self->make_token(
1220							$self->{cursor_limit} - $self->{cursor_curr},
1221	11					54	'PPIx::Regexp::Token::Code',
1222							{ perl_version_introduced => MINIMUM_PERL },
1223							);
1224	11					34	$self->{cursor_limit} = length $self->{content};
1225	11					32	push @tokens, $self->make_token( 1,
1226							'PPIx::Regexp::Token::Delimiter' ),
1227							$self->_get_trailing_tokens();
1228	11					29	$self->_set_mode( 'kaput' );
1229							} else {
1230							# Put our mode to replacement.
1231	33					70	$self->_set_mode( 'repl' );
1232							}
1233
1234							}
1235
1236	560					1507	return @tokens;
1237
1238							}
1239
1240							# To common processing on trailing tokens.
1241							sub _get_trailing_tokens {
1242	527			527		874	my ( $self ) = @_;
1243	527	100				1074	if ( $self->{index_locations} ) {
1244							# We turned off index_locations when these were created, because
1245							# they were done out of order. Fix that now.
1246	11					16	foreach my $token ( @{ $self->{trailing_tokens} } ) {
	11					21
1247	11					24	$self->_update_location( $token );
1248							}
1249							}
1250	527					648	return @{ delete $self->{trailing_tokens} };
	527					1341
1251							}
1252
1253							1;
1254
1255							__END__
1256
1257							=head1 NAME
1258
1259							PPIx::Regexp::Tokenizer - Tokenize a regular expression
1260
1261							=head1 SYNOPSIS
1262
1263							use PPIx::Regexp::Dumper;
1264							PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
1265							->print();
1266
1267							=head1 INHERITANCE
1268
1269							C<PPIx::Regexp::Tokenizer> is a
1270							L<PPIx::Regexp::Support\|PPIx::Regexp::Support>.
1271
1272							C<PPIx::Regexp::Tokenizer> has no descendants.
1273
1274							=head1 DESCRIPTION
1275
1276							This class provides tokenization of the regular expression.
1277
1278							=head1 METHODS
1279
1280							This class provides the following public methods. Methods not documented
1281							here (or documented below under L</EXTERNAL TOKENIZERS>) are private,
1282							and unsupported in the sense that the author reserves the right to
1283							change or remove them without notice.
1284
1285							=head2 new
1286
1287							my $tokenizer = PPIx::Regexp::Tokenizer->new( 'xyzzy' );
1288
1289							This static method instantiates the tokenizer. You must pass it the
1290							regular expression to be parsed, either as a string or as a
1291							L<PPI::Element\|PPI::Element> of some sort. You can also pass optional
1292							name/value pairs of arguments. The option names are specified B<without>
1293							a leading dash. Supported options are:
1294
1295							=over
1296
1297							=item default_modifiers array_reference
1298
1299							This argument specifies default statement modifiers. It is optional, but
1300							if specified must be an array reference. See the
1301							L<PPIx::Regexp\|PPIx::Regexp> L<new()\|PPIx::Regexp/new> documentation for
1302							the details.
1303
1304							=item encoding name
1305
1306							This option specifies the encoding of the string to be tokenized. If
1307							specified, an C<Encode::decode> is done on the string (or the C<content>
1308							of the PPI class) before it is tokenized.
1309
1310							=item index_locations
1311
1312							This Boolean option specifies that the locations of the generated tokens
1313							are to be computed.
1314
1315							=item strict boolean
1316
1317							This option specifies whether tokenization should assume
1318							C<use re 'strict';> is in effect.
1319
1320							The C<'strict'> pragma was introduced in Perl 5.22, and its
1321							documentation says that it is experimental, and that there is no
1322							commitment to backward compatibility. The same applies to the
1323							tokenization produced when this option is asserted.
1324
1325							=item trace number
1326
1327							Specifying a positive value for this option causes a trace of the
1328							tokenization. This option is unsupported in the sense that the author
1329							reserves the right to alter it without notice.
1330
1331							If this option is unspecified, the value comes from environment variable
1332							C<PPIX_REGEXP_TOKENIZER_TRACE> (see L</ENVIRONMENT VARIABLES>). If this
1333							environment variable does not exist, the default is 0.
1334
1335							=back
1336
1337							Undocumented options are unsupported.
1338
1339							The returned value is the instantiated tokenizer, or C<undef> if
1340							instantiation failed. In the latter case a call to L</errstr> will
1341							return the reason.
1342
1343							=head2 content
1344
1345							print $tokenizer->content();
1346
1347							This method returns the string being tokenized. This will be the result
1348							of the L<< PPI::Element->content()\|PPI::Element/content >> method if the
1349							object was instantiated with a L<PPI::Element\|PPI::Element>.
1350
1351							=head2 default_modifiers
1352
1353							print join ', ', @{ $tokenizer->default_modifiers() };
1354
1355							This method returns a reference to a copy of the array passed to the
1356							C<default_modifiers> argument to L<new()\|/new>. If this argument was not
1357							used to instantiate the object, the return is a reference to an empty
1358							array.
1359
1360							=head2 encoding
1361
1362							This method returns the encoding of the data being parsed, if one was
1363							set when the class was instantiated; otherwise it simply returns undef.
1364
1365							=head2 errstr
1366
1367							my $tokenizer = PPIx::Regexp::Tokenizer->new( 'xyzzy' )
1368							or die PPIx::Regexp::Tokenizer->errstr();
1369
1370							This static method returns an error description if tokenizer
1371							instantiation failed.
1372
1373							=head2 failures
1374
1375							print $tokenizer->failures(), " tokenization failures\n";
1376
1377							This method returns the number of tokenization failures encountered. A
1378							tokenization failure is represented in the output token stream by a
1379							L<PPIx::Regexp::Token::Unknown\|PPIx::Regexp::Token::Unknown>.
1380
1381							=head2 modifier
1382
1383							$tokenizer->modifier( 'x' )
1384							and print "Tokenizing an extended regular expression\n";
1385
1386							This method returns true if the given modifier character was found on
1387							the end of the regular expression, and false otherwise.
1388
1389							Starting with version 0.036_01, if the argument is a
1390							single-character modifier followed by an asterisk (intended as a wild
1391							card character), the return is the number of times that modifier
1392							appears. In this case an exception will be thrown if you specify a
1393							multi-character modifier (e.g. C<'ee*'>), or if you specify one of the
1394							match semantics modifiers (e.g. C<'a*'>).
1395
1396							If called by an external tokenizer, this method returns true if if the
1397							given modifier was true at the current point in the tokenization.
1398
1399							=head2 next_token
1400
1401							my $token = $tokenizer->next_token();
1402
1403							This method returns the next token in the token stream, or nothing if
1404							there are no more tokens.
1405
1406							=head2 significant
1407
1408							This method exists simply for the convenience of
1409							L<PPIx::Regexp::Dumper\|PPIx::Regexp::Dumper>. It always returns true.
1410
1411							=head2 tokens
1412
1413							my @tokens = $tokenizer->tokens();
1414
1415							This method returns all remaining tokens in the token stream.
1416
1417							=head1 EXTERNAL TOKENIZERS
1418
1419							This class does very little of its own tokenization. Instead the token
1420							classes contain external tokenization routines, whose name is
1421							'__PPIX_TOKENIZER__' concatenated with the current mode of the tokenizer
1422							('regexp' for regular expressions, 'repl' for the replacement string).
1423
1424							These external tokenizers are called as static methods, and passed the
1425							C<PPIx::Regexp::Tokenizer> object and the current character in the
1426							character stream.
1427
1428							If the external tokenizer wants to make one or more tokens, it returns
1429							an array containing either length in characters for tokens of the
1430							tokenizer's own class, or the results of one or more L</make_token>
1431							calls for tokens of an arbitrary class.
1432
1433							If the external tokenizer is not interested in the characters starting
1434							at the current position it simply returns.
1435
1436							The following methods are for the use of external tokenizers, and B<are
1437							not part of the public interface to this class.>
1438
1439							=head2 capture
1440
1441							if ( $tokenizer->find_regexp( qr{ \A ( foo ) }smx ) ) {
1442							foreach ( $tokenizer->capture() ) {
1443							print "$_\n";
1444							}
1445							}
1446
1447							This method returns all the contents of any capture buffers from the
1448							previous call to L</find_regexp>. The first element of the array (i.e.
1449							element 0) corresponds to C<$1>, and so on.
1450
1451							The captures are cleared by L</make_token>, as well as by another call
1452							to L</find_regexp>.
1453
1454							=head2 cookie
1455
1456							$tokenizer->cookie( foo => sub { 1 } );
1457							my $cookie = $tokenizer->cookie( 'foo' );
1458							my $old_hint = $tokenizer->cookie( foo => undef );
1459
1460							This method either creates, deletes, or accesses a cookie.
1461
1462							A cookie is a code reference which is called whenever the tokenizer makes
1463							a token. If it returns a false value, it is deleted. Explicitly setting
1464							the cookie to C<undef> also deletes it.
1465
1466							When you call C<< $tokenizer->cookie( 'foo' ) >>, the current cookie is
1467							returned. If you pass a new value of C<undef> to delete the token, the
1468							deleted cookie (if any) is returned.
1469
1470							When the L</make_token> method calls a cookie, it passes it the tokenizer
1471							and the token just made. If a token calls a cookie, it is recommended that
1472							it merely pass the tokenizer, though of course the token can do whatever
1473							it wants.
1474
1475							The cookie mechanism seems to be a bit of a crock, but it appeared to be
1476							more work to fix things up in the lexer after the tokenizer got
1477							something wrong.
1478
1479							The recommended way to write a cookie is to use a closure to store any
1480							necessary data, and have a call to the cookie return the data; otherwise
1481							the ultimate consumer of the cookie has no way to access the data. Of
1482							course, it may be that the presence of the cookie at a certain point in
1483							the parse is all that is required.
1484
1485							=head2 expect
1486
1487							$tokenizer->expect( 'PPIx::Regexp::Token::Code' );
1488
1489							This method inserts a given class at the head of the token scan, for the
1490							next iteration only. More than one class can be specified. Class names
1491							can be abbreviated by removing the leading 'PPIx::Regexp::'.
1492
1493							If no class is specified, this method does nothing.
1494
1495							The expectation lasts from the next time L</get_token> is called until
1496							the next time L</make_token> makes a significant token, or until the
1497							next C<expect> call if that is done sooner.
1498
1499							=head2 find_regexp
1500
1501							my $end = $tokenizer->find_regexp( qr{ \A \w+ }smx );
1502							my ( $begin, $end ) = $tokenizer->find_regexp(
1503							qr{ \A \w+ }smx );
1504
1505							This method finds the given regular expression in the content, starting
1506							at the current position. If called in scalar context, the offset from
1507							the current position to the end of the matched string is returned. If
1508							called in list context, the offsets to both the beginning and the end of
1509							the matched string are returned.
1510
1511							=head2 find_matching_delimiter
1512
1513							my $offset = $tokenizer->find_matching_delimiter();
1514
1515							This method is used by tokenizers to find the delimiter matching the
1516							character at the current position in the content string. If the
1517							delimiter is an opening bracket of some sort, bracket nesting will be
1518							taken into account.
1519
1520							When searching for the matching delimiter, the back slash character is
1521							considered to escape the following character, so back-slashed delimiters
1522							will be ignored. No other quoting mechanisms are recognized, though, so
1523							delimiters inside quotes still count. This is actually the way Perl
1524							works, as
1525
1526							$ perl -e 'qr<(?{ print "}" })>'
1527
1528							demonstrates.
1529
1530							This method returns the offset from the current position in the content
1531							string to the matching delimiter (which will always be positive), or
1532							undef if no match can be found.
1533
1534							=head2 get_mode
1535
1536							This method returns the name of the current mode of the tokenizer.
1537
1538							=head2 get_start_delimiter
1539
1540							my $start_delimiter = $tokenizer->get_start_delimiter();
1541
1542							This method is used by tokenizers to access the start delimiter for the
1543							regular expression.
1544
1545							=head2 get_token
1546
1547							my $token = $tokenizer->make_token( 3 );
1548							my @tokens = $tokenizer->get_token();
1549
1550							This method returns the next token that can be made from the input
1551							stream. It is B<not> part of the external interface, but is intended for
1552							the use of an external tokenizer which calls it after making and
1553							retaining its own token to look at the next token ( if any ) in the
1554							input stream.
1555
1556							If any external tokenizer calls get_token without first calling
1557							make_token, a fatal error occurs; this is better than the infinite
1558							recursion which would occur if the condition were not trapped.
1559
1560							An external tokenizer B<must> return anything returned by get_token;
1561							otherwise tokens get lost.
1562
1563							=head2 interpolates
1564
1565							This method returns true if the top-level structure being tokenized
1566							interpolates; that is, if the delimiter is not a single quote.
1567
1568							=head2 make_token
1569
1570							return $tokenizer->make_token( 3, 'PPIx::Regexp::Token::Unknown' );
1571
1572							This method is used by this class (and possibly by individual
1573							tokenizers) to manufacture a token. Its arguments are the number of
1574							characters to include in the token, and optionally the class of the
1575							token. If no class name is given, the caller's class is used. Class
1576							names may be shortened by removing the initial 'PPIx::Regexp::', which
1577							will be restored by this method.
1578
1579							The token will be manufactured from the given number of characters
1580							starting at the current cursor position, which will be adjusted.
1581
1582							If the given length would include characters past the end of the string
1583							being tokenized, the length is reduced appropriately. If this means a
1584							token with no characters, nothing is returned.
1585
1586							=head2 match
1587
1588							if ( $tokenizer->find_regexp( qr{ \A \w+ }smx ) ) {
1589							print $tokenizer->match(), "\n";
1590							}
1591
1592							This method returns the string matched by the previous call to
1593							L</find_regexp>.
1594
1595							The match is set to C<undef> by L</make_token>, as well as by another
1596							call to L</find_regexp>.
1597
1598							=head2 modifier_duplicate
1599
1600							$tokenizer->modifier_duplicate();
1601
1602							This method duplicates the modifiers on the top of the modifier stack,
1603							with the intent of creating a locally-scoped copy of the modifiers. This
1604							should only be called by an external tokenizer that is actually creating
1605							a modifier scope. In other words, only when creating a
1606							L<PPIx::Regexp::Token::Structure\|PPIx::Regexp::Token::Structure> token
1607							whose content is '('.
1608
1609							=head2 modifier_modify
1610
1611							$tokenizer->modifier_modify( name => $value ... );
1612
1613							This method sets new values for the modifiers in the local scope. Only
1614							the modifiers whose names are actually passed have their values changed.
1615
1616							This method is intended to be called after manufacturing a
1617							L<PPIx::Regexp::Token::Modifier\|PPIx::Regexp::Token::Modifier> token,
1618							and passed the results of its C<modifiers> method.
1619
1620							=head2 modifier_pop
1621
1622							$tokenizer->modifier_pop();
1623
1624							This method removes the modifiers on the top of the modifier stack. This
1625							should only be called by an external tokenizer that is ending a modifier
1626							scope. In other words, only when creating a
1627							L<PPIx::Regexp::Token::Structure\|PPIx::Regexp::Token::Structure> token
1628							whose content is ')'.
1629
1630							Note that this method will never pop the last modifier item off the
1631							stack, to guard against unmatched right parentheses.
1632
1633							=head2 modifier_seen
1634
1635							$tokenizer->modifier_seen( 'i' )
1636							and print "/i was seen at some point.\n";
1637
1638							Unlike L<modifier()\|/modifier>, this method returns a true value if the
1639							given modifier has been seen in any scope visible from the current
1640							location in the parse. There is no magic for group match semantics (
1641							/a, /aa, /d, /l, /u) or modifiers that can be repeated, like /x and /xx,
1642							or /e and /ee.
1643
1644							=head2 peek
1645
1646							my $character = $tokenizer->peek();
1647							my $next_char = $tokenizer->peek( 1 );
1648
1649							This method returns the character at the given non-negative offset from
1650							the current position. If no offset is given, an offset of 0 is used.
1651
1652							If you ask for a negative offset or an offset off the end of the sting,
1653							C<undef> is returned.
1654
1655							=head2 ppi_document
1656
1657							This method makes a PPI document out of the remainder of the string, and
1658							returns it.
1659
1660							=head2 prior_significant_token
1661
1662							$tokenizer->prior_significant_token( 'can_be_quantified' )
1663							and print "The prior token can be quantified.\n";
1664
1665							This method calls the named method on the most-recently-instantiated
1666							significant token, and returns the result. Any arguments subsequent to
1667							the method name will be passed to the method.
1668
1669							Because this method is designed to be used within the tokenizing system,
1670							it will die horribly if the named method does not exist.
1671
1672							If called with no arguments at all the most-recently-instantiated
1673							significant token is returned.
1674
1675							=head2 strict
1676
1677							say 'Parse is ', $tokenizer->strict() ? 'strict' : 'lenient';
1678
1679							This method simply returns true or false, depending on whether the
1680							C<'strict'> option to C<new()> was true or false.
1681
1682							=head1 ENVIRONMENT VARIABLES
1683
1684							A tokenizer trace can be requested by setting environment variable
1685							PPIX_REGEXP_TOKENIZER_TRACE to a numeric value other than 0. Use of this
1686							environment variable is unsupported in the same sense that the C<trace>
1687							option of L</new> is unsupported. Explicitly specifying the C<trace>
1688							option to L</new> overrides the environment variable.
1689
1690							The real reason this is documented is to give the user a way to
1691							troubleshoot funny output from the tokenizer.
1692
1693							=head1 SUPPORT
1694
1695							Support is by the author. Please file bug reports at
1696							L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
1697							L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
1698							electronic mail to the author.
1699
1700							=head1 AUTHOR
1701
1702							Thomas R. Wyant, III F<wyant at cpan dot org>
1703
1704							=head1 COPYRIGHT AND LICENSE
1705
1706							Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
1707
1708							This program is free software; you can redistribute it and/or modify it
1709							under the same terms as Perl 5.10.0. For more details, see the full text
1710							of the licenses in the directory LICENSES.
1711
1712							This program is distributed in the hope that it will be useful, but
1713							without any warranty; without even the implied warranty of
1714							merchantability or fitness for a particular purpose.
1715
1716							=cut
1717
1718							# ex: set textwidth=72 :