File Coverage

blib/lib/PPIx/Regexp/Token/Modifier.pm

Criterion	Covered	Total	%
statement	117	127	92.1
branch	55	72	76.3
condition	37	50	74.0
subroutine	21	21	100.0
pod	7	7	100.0
total	237	277	85.5

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Modifier - Represent modifiers.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9							->print();
10
11							The trailing C<smx> will be represented by this class.
12
13							This class also represents the whole of things like C<(?ismx)>. But the
14							modifiers in something like C<(?i:foo)> are represented by a
15							L<PPIx::Regexp::Token::GroupType::Modifier\|PPIx::Regexp::Token::GroupType::Modifier>.
16
17							=head1 INHERITANCE
18
19							C<PPIx::Regexp::Token::Modifier> is a
20							L<PPIx::Regexp::Token\|PPIx::Regexp::Token>.
21
22							C<PPIx::Regexp::Token::Modifier> is the parent of
23							L<PPIx::Regexp::Token::GroupType::Modifier\|PPIx::Regexp::Token::GroupType::Modifier>.
24
25							=head1 DESCRIPTION
26
27							This class represents modifier characters at the end of the regular
28							expression. For example, in C<qr{foo}smx> this class would represent
29							the terminal C<smx>.
30
31							=head2 The C<a>, C<aa>, C<d>, C<l>, and C<u> modifiers
32
33							The C<a>, C<aa>, C<d>, C<l>, and C<u> modifiers, introduced starting in
34							Perl 5.13.6, are used to force either Unicode pattern semantics (C<u>),
35							locale semantics (C<l>) default semantics (C<d> the traditional Perl
36							semantics, which can also mean 'dual' since it means Unicode if the
37							string's UTF-8 bit is on, and locale if the UTF-8 bit is off), or
38							restricted default semantics (C<a>). These are mutually exclusive, and
39							only one can be asserted at a time. Asserting any of these overrides
40							the inherited value of any of the others. The C<asserted()> method
41							reports as asserted the last one it sees, or none of them if it has seen
42							none.
43
44							For example, given C<PPIx::Regexp::Token::Modifier> C<$elem>
45							representing the invalid regular expression fragment C<(?dul)>,
46							C<< $elem->asserted( 'l' ) >> would return true, but
47							C<< $elem->asserted( 'u' ) >> would return false. Note that
48							C<< $elem->negated( 'u' ) >> would also return false, since C<u> is not
49							explicitly negated.
50
51							If C<$elem> represented regular expression fragment C<(?i)>,
52							C<< $elem->asserted( 'd' ) >> would return false, since even though C<d>
53							represents the default behavior it is not explicitly asserted.
54
55							=head2 The caret (C<^>) modifier
56
57							Calling C<^> a modifier is a bit of a misnomer. The C<(?^...)>
58							construction was introduced in Perl 5.13.6, to prevent the inheritance
59							of modifiers. The documentation calls the caret a shorthand equivalent
60							for C<d-imsx>, and that it the way this class handles it.
61
62							For example, given C<PPIx::Regexp::Token::Modifier> C<$elem>
63							representing regular expression fragment C<(?^i)>,
64							C<< $elem->asserts( 'd' ) >> would return true, since in the absence of
65							an explicit C<l> or C<u> this class considers the C<^> to explicitly
66							assert C<d>.
67
68							The caret handling is complicated by the fact that the C<'n'> modifier
69							was introduced in 5.21.8, at which point the caret became equivalent to
70							C<d-imnsx>. I did not feel I could unconditionally add the C<-n> to the
71							expansion of the caret, because that would produce confusing output from
72							methods like L<explain()\|PPIx::Regexp::Element/explain>. Nor could I
73							make it conditional on the minimum perl version, because that
74							information is not available early enough in the parse. What I did was
75							to expand the caret into C<d-imnsx> if and only if C<'n'> was in effect
76							at some point in the scope in which the modifier was parsed.
77
78							Continuing the above example, C<< $elem->asserts( 'n' ) >> and
79							C<< $elem->modifier_asserted( 'n' ) >> would both return false, but
80							C<< $elem->negates( 'n' ) >> would return true if and only if the C</m>
81							modifier has been asserted somewhere before and in-scope from this
82							token. The
83							L<modifier_asserted( 'n' )\|PPIx::Regexp::Element/modifier_asserted>
84							method is inherited from L<PPIx::Regexp::Element\|PPIx::Regexp::Element>.
85
86							=head1 METHODS
87
88							This class provides the following public methods. Methods not documented
89							here are private, and unsupported in the sense that the author reserves
90							the right to change or remove them without notice.
91
92							=cut
93
94							package PPIx::Regexp::Token::Modifier;
95
96	9			9		45	use strict;
	9					11
	9					244
97	9			9		30	use warnings;
	9					11
	9					332
98
99	9			9		73	use base qw{ PPIx::Regexp::Token };
	9					12
	9					690
100
101	9			9		41	use Carp;
	9					12
	9					675
102	9					1756	use PPIx::Regexp::Constant qw{
103							MINIMUM_PERL
104							MODIFIER_GROUP_MATCH_SEMANTICS
105							@CARP_NOT
106	9			9		45	};
	9					20
107
108							our $VERSION = '0.091';
109
110							# Define modifiers that are to be aggregated internally for ease of
111							# computation.
112							my %aggregate = (
113							a => MODIFIER_GROUP_MATCH_SEMANTICS,
114							aa => MODIFIER_GROUP_MATCH_SEMANTICS,
115							d => MODIFIER_GROUP_MATCH_SEMANTICS,
116							l => MODIFIER_GROUP_MATCH_SEMANTICS,
117							u => MODIFIER_GROUP_MATCH_SEMANTICS,
118							);
119							my %de_aggregate;
120							foreach my $value ( values %aggregate ) {
121							$de_aggregate{$value}++;
122							}
123
124							# Note that we do NOT want the /o modifier on regexen that make use of
125							# this, because it is already compiled.
126							my $capture_group_leader = qr{ [?/(] }smx; # );
127
128	9			9		47	use constant TOKENIZER_ARGUMENT_REQUIRED => 1;
	9					13
	9					17665
129
130							sub __new {
131	589			589		1942	my ( $class, $content, %arg ) = @_;
132
133	589	50				2089	my $self = $class->SUPER::__new( $content, %arg )
134							or return;
135
136							$content =~ m{ \A $capture_group_leader* \^ }smx # no /o!
137							and defined $arg{tokenizer}->modifier_seen( 'n' )
138	589	100	100			4580	and $self->{__caret_undoes_n} = 1;
139
140	589					2075	$arg{tokenizer}->modifier_modify( $self->modifiers() );
141
142	589					1498	return $self;
143							}
144
145							=head2 asserts
146
147							$token->asserts( 'i' ) and print "token asserts i";
148							foreach ( $token->asserts() ) { print "token asserts $_\n" }
149
150							This method returns true if the token explicitly asserts the given
151							modifier. The example would return true for the modifier in
152							C<(?i:foo)>, but false for C<(?-i:foo)>.
153
154							Starting with version 0.036_01, if the argument is a
155							single-character modifier followed by an asterisk (intended as a wild
156							card character), the return is the number of times that modifier
157							appears. In this case an exception will be thrown if you specify a
158							multi-character modifier (e.g. C<'ee*'>).
159
160							If called without an argument, or with an undef argument, all modifiers
161							explicitly asserted by this token are returned.
162
163							=cut
164
165							sub asserts {
166	410			410	1	590	my ( $self, $modifier ) = @_;
167	410		33			685	$self->{modifiers} \|\|= $self->_decode();
168	410	50				536	if ( defined $modifier ) {
169	410					617	return __asserts( $self->{modifiers}, $modifier );
170							} else {
171	0	0				0	return ( sort grep { defined $_ && $self->{modifiers}{$_} }
172	0	0				0	map { $de_aggregate{$_} ? $self->{modifiers}{$_} : $_ }
173	0					0	keys %{ $self->{modifiers} } );
	0					0
174							}
175							}
176
177							# This is a kluge for both determining whether the object asserts
178							# modifiers (hence the 'ductype') and determining whether the given
179							# modifier is actually asserted. The signature is the invocant and the
180							# modifier name, which must not be undef. The return is a boolean.
181							*__ducktype_modifier_asserted = \&asserts;
182
183							sub __asserts {
184	5357			5357		7001	my ( $present, $modifier ) = @_;
185	5357					13452	my $wild = $modifier =~ s/ [*] \z //smx;
186	5357	50	66			11975	not $wild
187							or 1 == length $modifier
188							or croak "Can not use wild card on multi-character modifier '$modifier*'";
189	5357	100				9003	if ( my $bin = $aggregate{$modifier} ) {
190	15					27	my $aggr = $present->{$bin};
191	15	50	100			85	$wild
192							or return ( defined $aggr && $modifier eq $aggr );
193	0	0				0	defined $aggr
194							or return 0;
195	0	0				0	$aggr =~ m/ \A ( (?: \Q$modifier\E )* ) \z /smx
196							or return 0;
197	0					0	return length $1;
198							}
199	5342	100				7652	if ( $wild ) {
200	2529		100			12934	return $present->{$modifier} \|\| 0;
201							}
202	2813					3402	my $len = length $modifier;
203	2813					3929	$modifier = substr $modifier, 0, 1;
204	2813		100			9198	return $present->{$modifier} && $len == $present->{$modifier};
205							}
206
207	8			8	1	20	sub can_be_quantified { return };
208
209							{
210							my %explanation = (
211							'm' => 'm: ^ and $ match within string',
212							'-m' => '-m: ^ and $ match only at ends of string',
213							's' => 's: . can match newline',
214							'-s' => '-s: . can not match newline',
215							'i' => 'i: do case-insensitive matching',
216							'-i' => '-i: do case-sensitive matching',
217							'x' => 'x: ignore whitespace and comments',
218							'xx' => 'xx: ignore whitespace even in bracketed character classes',
219							'-x' => '-x: regard whitespace as literal',
220							'p' => 'p: provide ${^PREMATCH} etc (pre 5.20)',
221							'-p' => '-p: no ${^PREMATCH} etc (pre 5.20)',
222							'a' => 'a: restrict non-Unicode classes to ASCII',
223							'aa' => 'aa: restrict non-Unicode classes & ASCII-Unicode matches',
224							'd' => 'd: match using default semantics',
225							'l' => 'l: match using locale semantics',
226							'u' => 'u: match using Unicode semantics',
227							'n' => 'n: parentheses do not capture',
228							'-n' => '-n: parentheses capture',
229							'c' => 'c: preserve current position on match failure',
230							'g' => 'g: match repeatedly',
231							'e' => 'e: substitution string is an expression',
232							'ee' => 'ee: substitution is expression to eval()',
233							'o' => 'o: only interpolate once',
234							'r' => 'r: aubstitution returns modified string',
235							);
236
237							sub explain {
238	5			5	1	11	my ( $self ) = @_;
239	5					6	my @rslt;
240	5					14	my %modifier = $self->modifiers();
241	5	100				18	if ( defined( my $val = delete $modifier{match_semantics} ) ) {
242	4					10	push @rslt, $explanation{$val};
243							}
244	5					16	foreach my $mod ( sort keys %modifier ) {
245	15					18	my $val = $modifier{$mod};
246	15	100				26	my $key = $val ? $mod x $val : "-$mod";
247	15					23	push @rslt, $explanation{$key};
248	15	100				26	unless ( defined $rslt[-1] ) {
249	1	50				7	if ( my $code = $self->can( "_explain_$mod" ) ) {
250	1					3	$rslt[-1] = "$key: " . $code->( $self, $mod, $val );
251							} else {
252	0					0	$rslt[-1] = "$key: unknown modifier";
253							}
254							}
255							}
256	5	50				24	return wantarray ? @rslt : join '; ', @rslt;
257							}
258							}
259
260							# Called dynamically from explain(), above. This explanation is per
261							# Commit 040a4d7 (perlop: properly document s///e modifier) by mauke,
262							# which makes perlop explicitly state that more than 2 'e' modifiers are
263							# permitted, and cause the result of the expression to be eval-ed n-1
264							# times, where n is the number of 'e' modifiers.
265							sub _explain_e { ## no critic (ProhibitUnusedPrivateSubroutines)
266	1			1		3	my ( undef, undef, $val ) = @_;
267	1					2	--$val;
268	1					3	return "substitution is expression whose result is eval()-ed $val times";
269							}
270
271							=head2 match_semantics
272
273							my $sem = $token->match_semantics();
274							defined $sem or $sem = 'undefined';
275							print "This token has $sem match semantics\n";
276
277							This method returns the match semantics asserted by the token, as one of
278							the strings C<'a'>, C<'aa'>, C<'d'>, C<'l'>, or C<'u'>. If no explicit
279							match semantics are asserted, this method returns C<undef>.
280
281							=cut
282
283							sub match_semantics {
284	96			96	1	160	my ( $self ) = @_;
285	96		33			318	$self->{modifiers} \|\|= $self->_decode();
286	96					197	return $self->{modifiers}{ MODIFIER_GROUP_MATCH_SEMANTICS() };
287							}
288
289							=head2 modifiers
290
291							my %mods = $token->modifiers();
292
293							Returns all modifiers asserted or negated by this token, and the values
294							set (true for asserted, false for negated). If called in scalar context,
295							returns a reference to a hash containing the values.
296
297							=cut
298
299							sub modifiers {
300	595			595	1	995	my ( $self ) = @_;
301	595		66			2702	$self->{modifiers} \|\|= $self->_decode();
302	595					772	my %mods = %{ $self->{modifiers} };
	595					1588
303	595					1342	foreach my $bin ( keys %de_aggregate ) {
304	595	100				1749	defined ( my $val = delete $mods{$bin} )
305							or next;
306	30					78	$mods{$bin} = $val;
307							}
308	595	50				2850	return wantarray ? %mods : \%mods;
309							}
310
311							=head2 negates
312
313							$token->negates( 'i' ) and print "token negates i\n";
314							foreach ( $token->negates() ) { print "token negates $_\n" }
315
316							This method returns true if the token explicitly negates the given
317							modifier. The example would return true for the modifier in
318							C<(?-i:foo)>, but false for C<(?i:foo)>.
319
320							If called without an argument, or with an undef argument, all modifiers
321							explicitly negated by this token are returned.
322
323							=cut
324
325							sub negates {
326	5			5	1	13	my ( $self, $modifier ) = @_;
327	5		33			13	$self->{modifiers} \|\|= $self->_decode();
328							# Note that since the values of hash entries that represent
329							# aggregated modifiers will never be false (at least, not unless '0'
330							# becomes a modifier) we need no special logic to handle them.
331							defined $modifier
332	0					0	or return ( sort grep { ! $self->{modifiers}{$_} }
333	5	50				12	keys %{ $self->{modifiers} } );
	0					0
334							return exists $self->{modifiers}{$modifier}
335	5		66			22	&& ! $self->{modifiers}{$modifier};
336							}
337
338							sub perl_version_introduced {
339	132			132	1	8096	my ( $self ) = @_;
340							return ( $self->{perl_version_introduced} \|\|=
341	132		66			587	$self->_perl_version_introduced() );
342							}
343
344							sub _perl_version_introduced {
345	93			93		157	my ( $self ) = @_;
346	93					229	my $content = $self->content();
347	93					277	my $is_statement_modifier = ( $content !~ m/ \A [(]? [?] /smx );
348	93					243	my $match_semantics = $self->match_semantics();
349
350	93	100				294	$self->asserts( 'xx' )
351							and return '5.025009';
352
353							# Disabling capture with /n was introduced in 5.21.8
354	92	100				169	$self->asserts( 'n' )
355							and return '5.021008';
356
357							# Match semantics modifiers became available as regular expression
358							# modifiers in 5.13.10.
359	91	100	100			297	defined $match_semantics
360							and $is_statement_modifier
361							and return '5.013010';
362
363							# /aa was introduced in 5.13.10.
364	85	100	100			229	defined $match_semantics
365							and 'aa' eq $match_semantics
366							and return '5.013010';
367
368							# /a was introduced in 5.13.9, but only in (?...), not as modifier
369							# of the entire regular expression.
370	84	100	66			220	defined $match_semantics
			100
371							and not $is_statement_modifier
372							and 'a' eq $match_semantics
373							and return '5.013009';
374
375							# /d, /l, and /u were introduced in 5.13.6, but only in (?...), not
376							# as modifiers of the entire regular expression.
377	83	100	66			245	defined $match_semantics
378							and not $is_statement_modifier
379							and return '5.013006';
380
381							# The '^' reassert-defaults modifier in embedded modifiers was
382							# introduced in 5.13.6.
383	73	50	66			179	not $is_statement_modifier
384							and $content =~ m/ \^ /smx
385							and return '5.013006';
386
387	73	100				133	$self->asserts( 'r' ) and return '5.013002';
388	70	100				137	$self->asserts( 'p' ) and return '5.009005';
389	68	100				155	$self->content() =~ m/ \A [(]? [?] .* - /smx
390							and return '5.005';
391	66	100				134	$self->asserts( 'c' ) and return '5.004';
392	65					257	return MINIMUM_PERL;
393							}
394
395							# Return true if the token can be quantified, and false otherwise
396							# sub can_be_quantified { return };
397
398							# $present => __aggregate_modifiers( 'modifiers', ... );
399							#
400							# This subroutine is private to the PPIx::Regexp package. It may change
401							# or be retracted without notice. Its purpose is to support defaulted
402							# modifiers.
403							#
404							# Aggregate the given modifiers left-to-right, returning a hash of those
405							# present and their values.
406
407							sub __aggregate_modifiers {
408	1116			1116		1891	my ( @mods ) = @_;
409	1116					1449	my %present;
410	1116					1540	foreach my $content ( @mods ) {
411	1124					3781	$content =~ s{ \A $capture_group_leader+ }{}smxg; # no /o!
412	1124	100				2276	if ( $content =~ m/ \A \^ /smx ) {
413	8					48	@present{ MODIFIER_GROUP_MATCH_SEMANTICS(), qw{ i s m x } }
414							= qw{ d 0 0 0 0 };
415							}
416
417							# Have to do the global match rather than a split, because the
418							# expression modifiers come through here too, and we need to
419							# distinguish between s/.../.../e and s/.../.../ee. But the
420							# modifiers can be randomized (that is, /eie is the same as
421							# /eei), so we reorder the content first.
422
423							# The following line is WRONG because it ignores the
424							# significance of '-'. This bug was introduced in version 0.035,
425							# specifically by the change that handled multi-character
426							# modifiers.
427							# $content = join '', sort split qr{}smx, $content;
428
429							# The following is better because it re-orders the modifiers
430							# separately. It does not recognize multiple dashes as
431							# representing an error (though it could!), and modifiers that
432							# are both asserted and negated (e.g. '(?i-i:foo)') are simply
433							# considered to be negated (as Perl does as of 5.20.0).
434							$content = join '-',
435	1124					4979	map { join '', sort split qr{}smx }
	253					1662
436							split qr{-}smx, $content;
437	1124					1738	my $value = 1;
438	1124					3174	while ( $content =~ m/ ( ( [[:alpha:]-] ) \2* ) /smxg ) {
439	418	100				1209	if ( '-' eq $1 ) {
		100
440	11					38	$value = 0;
441							} elsif ( my $bin = $aggregate{$1} ) {
442							# Yes, technically the match semantics stuff can't be
443							# negated in a regex. But it can in a 'use re', which
444							# also comes through here, so we have to handle it.
445	26	100				110	$present{$bin} = $value ? $1 : undef;
446							} else {
447							# TODO have to think about this, since I need asserts(
448							# 'e' ) to be 2 if we in fact have 'ee'. Is this
449							# correct?
450							# $present{$1} = $value;
451	381					1297	$present{$2} = $value * length $1;
452							}
453							}
454							}
455	1116					2424	return \%present;
456							}
457
458							# This must be implemented by tokens which do not recognize themselves.
459							# The return is a list of list references. Each list reference must
460							# contain a regular expression that recognizes the token, and optionally
461							# a reference to a hash to pass to make_token as the class-specific
462							# arguments. The regular expression MUST be anchored to the beginning of
463							# the string.
464							sub __PPIX_TOKEN__recognize {
465							return (
466	9			9		34	[ qr{ \A [(] [?] [[:lower:]]* -? [[:lower:]]* [)] }smx ],
467							[ qr{ \A [(] [?] \^ [[:lower:]]* [)] }smx ],
468							);
469							}
470
471							{
472
473							# Called by the tokenizer to modify the current modifiers with a new
474							# set. Both are passed as hash references, and a reference to the
475							# new hash is returned.
476							sub __PPIX_TOKENIZER__modifier_modify {
477	595			595		1102	my ( @args ) = @_;
478
479	595					758	my %merged;
480	595					945	foreach my $hash ( @args ) {
481	1190					1565	while ( my ( $key, $val ) = each %{ $hash } ) {
	1478					3747
482	288	100				446	if ( $val ) {
483	245					502	$merged{$key} = $val;
484							} else {
485	43					65	delete $merged{$key};
486							}
487							}
488							}
489
490	595					1609	return \%merged;
491
492							}
493
494							# Decode modifiers from the content of the token.
495							sub _decode {
496	589			589		949	my ( $self ) = @_;
497	589					1290	my $mod = __aggregate_modifiers( $self->content() );
498							$self->{__caret_undoes_n}
499	589	100				1358	and $mod->{n} = 0;
500	589					1601	return $mod;
501							}
502							}
503
504							1;
505
506							__END__
507
508							=head1 SUPPORT
509
510							Support is by the author. Please file bug reports at
511							L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
512							L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
513							electronic mail to the author.
514
515							=head1 AUTHOR
516
517							Thomas R. Wyant, III F<wyant at cpan dot org>
518
519							=head1 COPYRIGHT AND LICENSE
520
521							Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
522
523							This program is free software; you can redistribute it and/or modify it
524							under the same terms as Perl 5.10.0. For more details, see the full text
525							of the licenses in the directory LICENSES.
526
527							This program is distributed in the hope that it will be useful, but
528							without any warranty; without even the implied warranty of
529							merchantability or fitness for a particular purpose.
530
531							=cut
532
533							# ex: set textwidth=72 :