File Coverage

blib/lib/PPIx/Regexp/Token/CharClass/Simple.pm

Criterion	Covered	Total	%
statement	73	73	100.0
branch	29	34	85.2
condition			n/a
subroutine	15	15	100.0
pod	3	3	100.0
total	120	125	96.0

line	stmt	bran	sub	pod	time	code
1						=head1 NAME
2
3						PPIx::Regexp::Token::CharClass::Simple - This class represents a simple character class
4
5						=head1 SYNOPSIS
6
7						use PPIx::Regexp::Dumper;
8						PPIx::Regexp::Dumper->new( 'qr{\w}smx' )
9						->print();
10
11						=head1 INHERITANCE
12
13						C<PPIx::Regexp::Token::CharClass::Simple> is a
14						L<PPIx::Regexp::Token::CharClass\|PPIx::Regexp::Token::CharClass>.
15
16						C<PPIx::Regexp::Token::CharClass::Simple> has no descendants.
17
18						=head1 DESCRIPTION
19
20						This class represents one of the simple character classes that can occur
21						anywhere in a regular expression. This includes not only the truly
22						simple things like \w, but also Unicode properties, including properties
23						with wildcard values.
24
25						=head1 METHODS
26
27						This class provides no public methods beyond those provided by its
28						superclass.
29
30						=cut
31
32						package PPIx::Regexp::Token::CharClass::Simple;
33
34	9		9		45	use strict;
	9				14
	9				293
35	9		9		35	use warnings;
	9				16
	9				358
36
37	9		9		35	use base qw{ PPIx::Regexp::Token::CharClass };
	9				15
	9				785
38
39	9				3243	use PPIx::Regexp::Constant qw{
40						COOKIE_CLASS
41						LITERAL_LEFT_CURLY_REMOVED_PHASE_1
42						LITERAL_LEFT_CURLY_REMOVED_PHASE_2
43						MINIMUM_PERL
44						TOKEN_LITERAL
45						TOKEN_UNKNOWN
46						@CARP_NOT
47	9		9		41	};
	9				14
48
49						our $VERSION = '0.091';
50
51	9				713	use constant UNICODE_PROPERTY_LITERAL_VALUE => qr/
52						\{ \s* \^? \w [\w:=\s-]* \} \|
53						[CLMNPSZ] # perluniprops for 5.26.1
54	9		9		54	/smx;
	9				13
55	9				14	use constant UNICODE_PROPERTY_LITERAL =>
56						qr/ \A \\ [Pp] (?:
57	9				2639	@{[ UNICODE_PROPERTY_LITERAL_VALUE ]}
58	9		9		36	) /smx;
	9				12
59
60						# CAVEAT: The following regular expression, despite its name, matches
61						# ALL unicode property values. To actually match a wildcard property you
62						# must first eliminate anything that matches UNICODE_PROPERTY_LITERAL or
63						# UNICODE_PROPERTY_NAME_MATCH
64	9				784	use constant UNICODE_PROPERTY_WILDCARD =>
65	9		9		46	qr/ \A \\ [Pp] \{ \s* [\w\s-]+ [:=] [^}]+ \} /smx;
	9				17
66
67	9				622	use constant UNICODE_PROPERTY_NAME_MATCH =>
68	9		9		39	qr< \A \\ [Pp] \{ \s* na (?: me? )? [:=] / [^/]+ / \} >smx;
	9				13
69
70	9				12	use constant UNICODE_PROPERTY =>
71	9				18	qr/ @{[ UNICODE_PROPERTY_LITERAL ]} \|
72	9				16	@{[ UNICODE_PROPERTY_NAME_MATCH ]} \|
73	9		9		36	@{[ UNICODE_PROPERTY_WILDCARD ]} /smx;
	9				13
	9				8510
74
75						{
76
77						my %kind_of_match = (
78						p => 'with',
79						P => 'without',
80						);
81
82						my %explanation = (
83						'.' => 'Match any character',
84						'\\C' => 'Match a single octet (removed in 5.23.0)',
85						'\\D' => 'Match any character but a decimal digit',
86						'\\H' => 'Match a non-horizontal-white-space character',
87						'\\N' => 'Match any character but a new-line character',
88						'\\R' => 'Match a generic new-line character',
89						'\\S' => 'Match non-white-space character',
90						'\\V' => 'Match a non-vertical-white-space character',
91						'\\W' => 'Match non-word character',
92						'\\X' => 'Match a Unicode extended grapheme cluster',
93						'\\d' => 'Match decimal digit',
94						'\\h' => 'Match a horizontal-white-space character',
95						'\\s' => 'Match white-space character',
96						'\\v' => 'Match a vertical-white-space character',
97						'\\w' => 'Match word character',
98						);
99
100						sub __explanation {
101	15		15		24	return \%explanation;
102						}
103
104						sub explain {
105	18		18	1	32	my ( $self ) = @_;
106	18	100			59	if ( $self->content() =~ m/ \A \\ ( [Pp] ) ( [{] .* [}] \| . ) \z /smx ) {
107	3				11	my ( $kind, $prop ) = ( $1, $2 );
108
109	3				7	my $literal = ( $prop =~
110	3				234	m/ \A @{[ UNICODE_PROPERTY_LITERAL_VALUE ]} \z /smx );
111
112	3	50			14	if ( 1 < length $prop ) {
113	3				11	$prop =~ s/ \A [{] //smx;
114	3				12	$prop =~ s/ [}] \z //smx;
115						}
116
117						$literal
118						and return sprintf
119						q<Match character %s Unicode or custom property '%s'>,
120	3	100			14	$kind_of_match{$kind}, $prop;
121
122						return sprintf
123						q<Match character %s Unicode wildcard property '%s'>,
124	1				6	$kind_of_match{$kind}, $prop;
125						}
126	15				50	return $self->SUPER::explain();
127						}
128
129						}
130
131						##=head2 is_case_sensitive
132						##
133						##This override of the superclass method returns true for Unicode
134						##properties that specify case, and false (but defined) for all
135						##other character classes.
136						##
137						##The classes that specify case are documented in
138						##L<perluniprops\|/perluniprops>.
139						##
140						##B<Known bug:> This method returns false (but defined) for user-defined
141						##Unicode properties. It should return C<undef>. This bug B<may> be fixed
142						##if I find a way to identify all system-defined Unicode properties.
143						##
144						##=cut
145						##
146						##sub is_case_sensitive {
147						## my ( $self ) = @_;
148						## exists $self->{is_case_sensitive}
149						## and return $self->{is_case_sensitive};
150						## return ( $self->{is_case_sensitive} = $self->_is_case_sensitive() );
151						##}
152
153						##{
154						## my %case_sensitive = map { $_ => 1 } qw{
155						## generalcategory=lowercaseletter generalcategory=ll
156						## gc=lowercaseletter gc=ll
157						## generalcategory=titlecaseletter generalcategory=lt
158						## gc=titlecaseletter gc=lt
159						## generalcategory=uppercaseletter generalcategory=lu
160						## gc=uppercaseletter gc=lu
161						## lowercaseletter lowercase lower ll
162						## titlecaseletter titlecase title lt
163						## uppercaseletter uppercase upper lu
164						## lowercase=y lower=y lowercase=n lower=n
165						## titlecase=y title=y titlecase=n title=n
166						## uppercase=y upper=y uppercase=n upper=n
167						## };
168						##
169						## sub _is_case_sensitive {
170						## my ( $self ) = @_;
171						## my $content = $self->content();
172						## $content =~ m/ \A \\ p [{] ( .* ) [}] /smxi
173						## or return 0;
174						## $content = lc $1;
175						## $content =~ s/ \A ^ //smx;
176						## $content =~ s/ [\s_-] //smxg;
177						## $content =~ s/ \A is //smx;
178						## $content =~ s/ : /=/smxg;
179						## $content =~ s/ = (?: yes \| t \| true ) \b /=y/smxg;
180						## $content =~ s/ = (?: no \| f \| false ) \b /=n/smxg;
181						## return $case_sensitive{$content} \|\| 0;
182						## }
183						##
184						##}
185
186						{
187
188						my %introduced = (
189						'\\h' => '5.009005', # Before this, parsed as 'h'
190						'\\v' => '5.009005', # Before this, parsed as 'v'
191						'\\H' => '5.009005', # Before this, parsed as 'H'
192						'\\N' => '5.011', # Before this, an error.
193						'\\V' => '5.009005', # Before this, parsed as 'V'
194						'\\R' => '5.009005',
195						'\\C' => '5.006',
196						'\\X' => '5.006',
197						);
198
199						sub perl_version_introduced {
200	31		31	1	4547	my ( $self ) = @_;
201	31				96	my $content = $self->content();
202	31	100			94	if ( defined( my $minver = $introduced{$content} ) ) {
203	9				20	return $minver;
204						}
205						# I must have read perl5113delta and thought this
206						# represented the change they were talking about, but I sure
207						# don't see it now. So, until things become clearer ...
208						# $content =~ m/ \G .*? [\s=-] /smxgc
209						# and return '5.011003';
210	22	100			75	$content =~ UNICODE_PROPERTY_LITERAL
211						and return '5.006001';
212	19	50			41	$content =~ UNICODE_PROPERTY_NAME_MATCH
213						and return '5.031010';
214	19	100			46	$content =~ UNICODE_PROPERTY_WILDCARD
215						and return '5.029009';
216	18				52	return MINIMUM_PERL;
217						}
218
219						}
220
221						{
222						my %removed = (
223						'\\C' => '5.023', # Before this, matched an octet
224						);
225
226						sub perl_version_removed {
227	31		31	1	10059	my ( $self ) = @_;
228	31				96	return $removed{ $self->content() };
229						}
230						}
231
232						# This is one of the larger complications of
233						# https://rt.perl.org/Public/Bug/Display.html?id=128213
234						# where it transpired that un-escaped literal left curlies were not
235						# giving warnings/errors in /.{/, /\p{...}{/, and /\P{...}{/, but were
236						# for all the others that bin into this class (e.g. /\s{/).
237						# Note that the perldelta for 5.25.1 and 5.26.0 do not acknowledge tha
238						# phased deprecation, and pretend that everything was done on the phase
239						# 1 schedule. This appears to be deliberate per
240						# https://rt.perl.org/Ticket/Display.html?id=131352
241						sub __following_literal_left_curly_disallowed_in {
242	4		4		6	my ( $self ) = @_;
243	4	100			8	q<.> eq ( my $content = $self->content() )
244						and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
245	3	100			37	$content =~ m/ \A \\ p \{ /smxi
246						and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
247	1				5	return LITERAL_LEFT_CURLY_REMOVED_PHASE_1;
248						}
249
250						sub __PPIX_TOKENIZER__regexp {
251	1286		1286		1960	my ( undef, $tokenizer, $character ) = @_;
252
253	1286				2177	my $in_class = $tokenizer->cookie( COOKIE_CLASS );
254
255	1286	100			2647	if ( $character eq '.' ) {
256	18	50			41	$in_class
257						and return $tokenizer->make_token( 1, TOKEN_LITERAL );
258	18				38	return 1;
259						}
260
261	1268	100			4399	if ( my $accept = $tokenizer->find_regexp(
262						qr{ \A \\ [wWsSdDvVhHXRNC] }smx
263						) ) {
264	83	100			202	if ( $in_class ) {
265	9				41	my $match = $tokenizer->match();
266						# As of Perl 5.11.5, [\N] is a fatal error.
267	9	50			31	'\\N' eq $match
268						and return $tokenizer->make_token(
269						$accept, TOKEN_UNKNOWN, {
270						error => '\\N invalid inside character class',
271						},
272						);
273						# \R is not recognized inside a character class. It
274						# eventually ends up as a literal.
275	9	50			22	'\\R' eq $match and return;
276						}
277	83				199	return $accept;
278						}
279
280	1185	100			2750	if ( my $accept = $tokenizer->find_regexp( UNICODE_PROPERTY ) ) {
281	13				48	return $accept;
282						}
283
284	1172	100			2773	if ( my $accept = $tokenizer->find_regexp( qr< \A \\ p [{] \s* [}] >smx )
285						) {
286	1				4	return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
287						error => 'Empty \\p{} is an error',
288						},
289						);
290						}
291
292	1171				2332	return;
293						}
294
295						1;
296
297						__END__
298
299						=head1 SUPPORT
300
301						Support is by the author. Please file bug reports at
302						L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
303						L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
304						electronic mail to the author.
305
306						=head1 AUTHOR
307
308						Thomas R. Wyant, III F<wyant at cpan dot org>
309
310						=head1 COPYRIGHT AND LICENSE
311
312						Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
313
314						This program is free software; you can redistribute it and/or modify it
315						under the same terms as Perl 5.10.0. For more details, see the full text
316						of the licenses in the directory LICENSES.
317
318						This program is distributed in the hope that it will be useful, but
319						without any warranty; without even the implied warranty of
320						merchantability or fitness for a particular purpose.
321
322						=cut
323
324						# ex: set textwidth=72 :