File Coverage

blib/lib/PPIx/Regexp/Token/CharClass/Simple.pm
Criterion Covered Total %
statement 73 73 100.0
branch 29 34 85.2
condition n/a
subroutine 15 15 100.0
pod 3 3 100.0
total 120 125 96.0


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::CharClass::Simple - This class represents a simple character class
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{\w}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C<PPIx::Regexp::Token::CharClass::Simple> is a
14             L<PPIx::Regexp::Token::CharClass|PPIx::Regexp::Token::CharClass>.
15              
16             C<PPIx::Regexp::Token::CharClass::Simple> has no descendants.
17              
18             =head1 DESCRIPTION
19              
20             This class represents one of the simple character classes that can occur
21             anywhere in a regular expression. This includes not only the truly
22             simple things like \w, but also Unicode properties, including properties
23             with wildcard values.
24              
25             =head1 METHODS
26              
27             This class provides no public methods beyond those provided by its
28             superclass.
29              
30             =cut
31              
32             package PPIx::Regexp::Token::CharClass::Simple;
33              
34 9     9   45 use strict;
  9         14  
  9         293  
35 9     9   35 use warnings;
  9         16  
  9         358  
36              
37 9     9   35 use base qw{ PPIx::Regexp::Token::CharClass };
  9         15  
  9         785  
38              
39 9         3243 use PPIx::Regexp::Constant qw{
40             COOKIE_CLASS
41             LITERAL_LEFT_CURLY_REMOVED_PHASE_1
42             LITERAL_LEFT_CURLY_REMOVED_PHASE_2
43             MINIMUM_PERL
44             TOKEN_LITERAL
45             TOKEN_UNKNOWN
46             @CARP_NOT
47 9     9   41 };
  9         14  
48              
49             our $VERSION = '0.091';
50              
51 9         713 use constant UNICODE_PROPERTY_LITERAL_VALUE => qr/
52             \{ \s* \^? \w [\w:=\s-]* \} |
53             [CLMNPSZ] # perluniprops for 5.26.1
54 9     9   54 /smx;
  9         13  
55 9         14 use constant UNICODE_PROPERTY_LITERAL =>
56             qr/ \A \\ [Pp] (?:
57 9         2639 @{[ UNICODE_PROPERTY_LITERAL_VALUE ]}
58 9     9   36 ) /smx;
  9         12  
59              
60             # CAVEAT: The following regular expression, despite its name, matches
61             # ALL unicode property values. To actually match a wildcard property you
62             # must first eliminate anything that matches UNICODE_PROPERTY_LITERAL or
63             # UNICODE_PROPERTY_NAME_MATCH
64 9         784 use constant UNICODE_PROPERTY_WILDCARD =>
65 9     9   46 qr/ \A \\ [Pp] \{ \s* [\w\s-]+ [:=] [^}]+ \} /smx;
  9         17  
66              
67 9         622 use constant UNICODE_PROPERTY_NAME_MATCH =>
68 9     9   39 qr< \A \\ [Pp] \{ \s* na (?: me? )? [:=] / [^/]+ / \} >smx;
  9         13  
69              
70 9         12 use constant UNICODE_PROPERTY =>
71 9         18 qr/ @{[ UNICODE_PROPERTY_LITERAL ]} |
72 9         16 @{[ UNICODE_PROPERTY_NAME_MATCH ]} |
73 9     9   36 @{[ UNICODE_PROPERTY_WILDCARD ]} /smx;
  9         13  
  9         8510  
74              
75             {
76              
77             my %kind_of_match = (
78             p => 'with',
79             P => 'without',
80             );
81              
82             my %explanation = (
83             '.' => 'Match any character',
84             '\\C' => 'Match a single octet (removed in 5.23.0)',
85             '\\D' => 'Match any character but a decimal digit',
86             '\\H' => 'Match a non-horizontal-white-space character',
87             '\\N' => 'Match any character but a new-line character',
88             '\\R' => 'Match a generic new-line character',
89             '\\S' => 'Match non-white-space character',
90             '\\V' => 'Match a non-vertical-white-space character',
91             '\\W' => 'Match non-word character',
92             '\\X' => 'Match a Unicode extended grapheme cluster',
93             '\\d' => 'Match decimal digit',
94             '\\h' => 'Match a horizontal-white-space character',
95             '\\s' => 'Match white-space character',
96             '\\v' => 'Match a vertical-white-space character',
97             '\\w' => 'Match word character',
98             );
99              
100             sub __explanation {
101 15     15   24 return \%explanation;
102             }
103              
104             sub explain {
105 18     18 1 32 my ( $self ) = @_;
106 18 100       59 if ( $self->content() =~ m/ \A \\ ( [Pp] ) ( [{] .* [}] | . ) \z /smx ) {
107 3         11 my ( $kind, $prop ) = ( $1, $2 );
108              
109 3         7 my $literal = ( $prop =~
110 3         234 m/ \A @{[ UNICODE_PROPERTY_LITERAL_VALUE ]} \z /smx );
111              
112 3 50       14 if ( 1 < length $prop ) {
113 3         11 $prop =~ s/ \A [{] //smx;
114 3         12 $prop =~ s/ [}] \z //smx;
115             }
116              
117             $literal
118             and return sprintf
119             q<Match character %s Unicode or custom property '%s'>,
120 3 100       14 $kind_of_match{$kind}, $prop;
121              
122             return sprintf
123             q<Match character %s Unicode wildcard property '%s'>,
124 1         6 $kind_of_match{$kind}, $prop;
125             }
126 15         50 return $self->SUPER::explain();
127             }
128              
129             }
130              
131             ##=head2 is_case_sensitive
132             ##
133             ##This override of the superclass method returns true for Unicode
134             ##properties that specify case, and false (but defined) for all
135             ##other character classes.
136             ##
137             ##The classes that specify case are documented in
138             ##L<perluniprops|/perluniprops>.
139             ##
140             ##B<Known bug:> This method returns false (but defined) for user-defined
141             ##Unicode properties. It should return C<undef>. This bug B<may> be fixed
142             ##if I find a way to identify all system-defined Unicode properties.
143             ##
144             ##=cut
145             ##
146             ##sub is_case_sensitive {
147             ## my ( $self ) = @_;
148             ## exists $self->{is_case_sensitive}
149             ## and return $self->{is_case_sensitive};
150             ## return ( $self->{is_case_sensitive} = $self->_is_case_sensitive() );
151             ##}
152              
153             ##{
154             ## my %case_sensitive = map { $_ => 1 } qw{
155             ## generalcategory=lowercaseletter generalcategory=ll
156             ## gc=lowercaseletter gc=ll
157             ## generalcategory=titlecaseletter generalcategory=lt
158             ## gc=titlecaseletter gc=lt
159             ## generalcategory=uppercaseletter generalcategory=lu
160             ## gc=uppercaseletter gc=lu
161             ## lowercaseletter lowercase lower ll
162             ## titlecaseletter titlecase title lt
163             ## uppercaseletter uppercase upper lu
164             ## lowercase=y lower=y lowercase=n lower=n
165             ## titlecase=y title=y titlecase=n title=n
166             ## uppercase=y upper=y uppercase=n upper=n
167             ## };
168             ##
169             ## sub _is_case_sensitive {
170             ## my ( $self ) = @_;
171             ## my $content = $self->content();
172             ## $content =~ m/ \A \\ p [{] ( .* ) [}] /smxi
173             ## or return 0;
174             ## $content = lc $1;
175             ## $content =~ s/ \A ^ //smx;
176             ## $content =~ s/ [\s_-] //smxg;
177             ## $content =~ s/ \A is //smx;
178             ## $content =~ s/ : /=/smxg;
179             ## $content =~ s/ = (?: yes | t | true ) \b /=y/smxg;
180             ## $content =~ s/ = (?: no | f | false ) \b /=n/smxg;
181             ## return $case_sensitive{$content} || 0;
182             ## }
183             ##
184             ##}
185              
186             {
187              
188             my %introduced = (
189             '\\h' => '5.009005', # Before this, parsed as 'h'
190             '\\v' => '5.009005', # Before this, parsed as 'v'
191             '\\H' => '5.009005', # Before this, parsed as 'H'
192             '\\N' => '5.011', # Before this, an error.
193             '\\V' => '5.009005', # Before this, parsed as 'V'
194             '\\R' => '5.009005',
195             '\\C' => '5.006',
196             '\\X' => '5.006',
197             );
198              
199             sub perl_version_introduced {
200 31     31 1 4547 my ( $self ) = @_;
201 31         96 my $content = $self->content();
202 31 100       94 if ( defined( my $minver = $introduced{$content} ) ) {
203 9         20 return $minver;
204             }
205             # I must have read perl5113delta and thought this
206             # represented the change they were talking about, but I sure
207             # don't see it now. So, until things become clearer ...
208             # $content =~ m/ \G .*? [\s=-] /smxgc
209             # and return '5.011003';
210 22 100       75 $content =~ UNICODE_PROPERTY_LITERAL
211             and return '5.006001';
212 19 50       41 $content =~ UNICODE_PROPERTY_NAME_MATCH
213             and return '5.031010';
214 19 100       46 $content =~ UNICODE_PROPERTY_WILDCARD
215             and return '5.029009';
216 18         52 return MINIMUM_PERL;
217             }
218              
219             }
220              
221             {
222             my %removed = (
223             '\\C' => '5.023', # Before this, matched an octet
224             );
225              
226             sub perl_version_removed {
227 31     31 1 10059 my ( $self ) = @_;
228 31         96 return $removed{ $self->content() };
229             }
230             }
231              
232             # This is one of the larger complications of
233             # https://rt.perl.org/Public/Bug/Display.html?id=128213
234             # where it transpired that un-escaped literal left curlies were not
235             # giving warnings/errors in /.{/, /\p{...}{/, and /\P{...}{/, but were
236             # for all the others that bin into this class (e.g. /\s{/).
237             # Note that the perldelta for 5.25.1 and 5.26.0 do not acknowledge tha
238             # phased deprecation, and pretend that everything was done on the phase
239             # 1 schedule. This appears to be deliberate per
240             # https://rt.perl.org/Ticket/Display.html?id=131352
241             sub __following_literal_left_curly_disallowed_in {
242 4     4   6 my ( $self ) = @_;
243 4 100       8 q<.> eq ( my $content = $self->content() )
244             and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
245 3 100       37 $content =~ m/ \A \\ p \{ /smxi
246             and return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
247 1         5 return LITERAL_LEFT_CURLY_REMOVED_PHASE_1;
248             }
249              
250             sub __PPIX_TOKENIZER__regexp {
251 1286     1286   1960 my ( undef, $tokenizer, $character ) = @_;
252              
253 1286         2177 my $in_class = $tokenizer->cookie( COOKIE_CLASS );
254              
255 1286 100       2647 if ( $character eq '.' ) {
256 18 50       41 $in_class
257             and return $tokenizer->make_token( 1, TOKEN_LITERAL );
258 18         38 return 1;
259             }
260              
261 1268 100       4399 if ( my $accept = $tokenizer->find_regexp(
262             qr{ \A \\ [wWsSdDvVhHXRNC] }smx
263             ) ) {
264 83 100       202 if ( $in_class ) {
265 9         41 my $match = $tokenizer->match();
266             # As of Perl 5.11.5, [\N] is a fatal error.
267 9 50       31 '\\N' eq $match
268             and return $tokenizer->make_token(
269             $accept, TOKEN_UNKNOWN, {
270             error => '\\N invalid inside character class',
271             },
272             );
273             # \R is not recognized inside a character class. It
274             # eventually ends up as a literal.
275 9 50       22 '\\R' eq $match and return;
276             }
277 83         199 return $accept;
278             }
279              
280 1185 100       2750 if ( my $accept = $tokenizer->find_regexp( UNICODE_PROPERTY ) ) {
281 13         48 return $accept;
282             }
283              
284 1172 100       2773 if ( my $accept = $tokenizer->find_regexp( qr< \A \\ p [{] \s* [}] >smx )
285             ) {
286 1         4 return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
287             error => 'Empty \\p{} is an error',
288             },
289             );
290             }
291              
292 1171         2332 return;
293             }
294              
295             1;
296              
297             __END__
298              
299             =head1 SUPPORT
300              
301             Support is by the author. Please file bug reports at
302             L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>,
303             L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in
304             electronic mail to the author.
305              
306             =head1 AUTHOR
307              
308             Thomas R. Wyant, III F<wyant at cpan dot org>
309              
310             =head1 COPYRIGHT AND LICENSE
311              
312             Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III
313              
314             This program is free software; you can redistribute it and/or modify it
315             under the same terms as Perl 5.10.0. For more details, see the full text
316             of the licenses in the directory LICENSES.
317              
318             This program is distributed in the hope that it will be useful, but
319             without any warranty; without even the implied warranty of
320             merchantability or fitness for a particular purpose.
321              
322             =cut
323              
324             # ex: set textwidth=72 :