File Coverage

blib/lib/PPIx/Regexp/Token/Literal.pm

Criterion	Covered	Total	%
statement	98	104	94.2
branch	87	100	87.0
condition	18	23	78.2
subroutine	18	19	94.7
pod	5	5	100.0
total	226	251	90.0

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Literal - Represent a literal character
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9							->print();
10
11							=head1 INHERITANCE
12
13							C is a
14							L.
15
16							C has no descendants.
17
18							=head1 DESCRIPTION
19
20							This class represents a literal character, no matter how specified.
21
22							=head1 METHODS
23
24							This class provides the following public methods. Methods not documented
25							here are private, and unsupported in the sense that the author reserves
26							the right to change or remove them without notice.
27
28							=cut
29
30							package PPIx::Regexp::Token::Literal;
31
32	9			9		69	use strict;
	9					26
	9					281
33	9			9		46	use warnings;
	9					28
	9					246
34
35	9			9		45	use base qw{ PPIx::Regexp::Token };
	9					19
	9					859
36
37	9					1137	use PPIx::Regexp::Constant qw{
38							COOKIE_CLASS COOKIE_REGEX_SET
39							LITERAL_LEFT_CURLY_ALLOWED
40							LITERAL_LEFT_CURLY_REMOVED_PHASE_1
41							LITERAL_LEFT_CURLY_REMOVED_PHASE_2
42							LITERAL_LEFT_CURLY_REMOVED_PHASE_3
43							MINIMUM_PERL MSG_PROHIBITED_BY_STRICT
44							TOKEN_UNKNOWN
45							@CARP_NOT
46	9			9		60	};
	9					19
47
48	9			9		70	use PPIx::Regexp::Util qw{ :width_one };
	9					20
	9					26872
49
50							our $VERSION = '0.087';
51
52							sub __new {
53	1275			1275		4630	my ( $class, $content, %arg ) = @_;
54
55	1275	50				4998	my $self = $class->SUPER::__new( $content, %arg )
56							or return;
57
58							defined $arg{ordinal}
59	1275	100				3937	and $self->{ordinal} = $arg{ordinal};
60
61	1275					3804	return $self;
62							}
63
64							# Return true if the token can be quantified, and false otherwise
65							# sub can_be_quantified { return };
66
67							sub explain {
68	1			1	1	5	return 'Literal character';
69							}
70
71							=head2 is_matcher
72
73							This method returns a true value because a literal matches itself.
74
75							=cut
76
77	8			8	1	36	sub is_matcher { return 1; }
78
79							sub perl_version_introduced {
80	258			258	1	9103	my ( $self ) = @_;
81							exists $self->{perl_version_introduced}
82	258	100				908	and return $self->{perl_version_introduced};
83	176					466	my $content = $self->content();
84	176					582	my $main = $self->main_structure();
85							$main
86							and $content =~ m/ \A \\ N \{ /smx
87							and not $main->interpolates()
88	176	100	100			825	and return ( $self->{perl_version_introduced} = '5.029010' );
			66
89							$content =~ m/ \A \\ o /smx
90	175	100				453	and return ( $self->{perl_version_introduced} = '5.013003' );
91							$content =~ m/ \A \\ N [{] U [+] /smx
92	174	100				363	and return ( $self->{perl_version_introduced} = '5.008' );
93							$content =~ m/ \A \\ x [{] /smx # }
94	173	100				374	and return ( $self->{perl_version_introduced} = '5.006' );
95							$content =~ m/ \A \\ N /smx
96	172	100				352	and return ( $self->{perl_version_introduced} = '5.006001' );
97	171					546	return ( $self->{perl_version_introduced} = MINIMUM_PERL );
98							}
99
100							{
101							my %removed = (
102							q<{> => sub {
103							my ( $self ) = @_;
104
105							my $prev;
106
107							if ( $prev = $self->sprevious_sibling() ) {
108							# When an unescaped left curly follows something else in
109							# the same structure, the logic on whether it is allowed
110							# lives, for better or worse, on the sibling.
111							return $prev->__following_literal_left_curly_disallowed_in();
112							} elsif ( $prev = $self->sprevious_element() ) {
113							# Perl 5.27.8 deprecated unescaped literal left curlys
114							# after a left paren that introduces a group. Therefore
115							# the left curly has no previous sibling. But the curly
116							# is still legal at the beginning of a regex, even one
117							# delimited by parens, so we can not return when we find
118							# a PPIx::Regexp::Token::Delimiter, which is a subclass
119							# of PPIx::Regexp::Token::Structure.
120							$prev->isa( 'PPIx::Regexp::Token::Structure' )
121							and q<(> eq $prev->content()
122							and not $prev->isa( 'PPIx::Regexp::Token::Delimiter' )
123							and return LITERAL_LEFT_CURLY_REMOVED_PHASE_3;
124							}
125							# When this mess started, the idea was to always allow
126							# unescaped literal left curlies that started a regex or a
127							# group
128							return LITERAL_LEFT_CURLY_ALLOWED;
129							},
130							);
131
132							sub perl_version_removed {
133	227			227	1	16183	my ( $self ) = @_;
134							exists $self->{perl_version_removed}
135	227	100				838	and return $self->{perl_version_removed};
136	145					256	my $code;
137							return ( $self->{perl_version_removed} =
138	145	100				353	( $code = $removed{$self->content()} ) ?
139							scalar $code->( $self ) : undef
140							);
141							}
142							}
143
144							# Some characters may or may not be literals depending on whether we are
145							# inside a character class. The following hash identifies those
146							# characters and says what we should return when outside (index 0) or
147							# inside (index 1) a character class, as judged by the presence of the
148							# relevant cookie.
149							my %double_agent = (
150							'.' => [ undef, 1 ],
151							'*' => [ undef, 1 ],
152							'?' => [ undef, 1 ],
153							'+' => [ undef, 1 ],
154							'-' => [ 1, undef ],
155							'\|' => [ undef, 1 ],
156							);
157
158							# These are the characters that other external tokenizers need to see,
159							# or at least that we need to take a closer look at. All others can be
160							# unconditionally made into single-character literals.
161							my %extra_ordinary = map { $_ => 1 }
162							split qr{}smx, '$@*+?.\\(){}[]^\|-#';
163							# $ -> Token::Interpolation, Token::Assertion
164							# @ -> Token::Interpolation
165							# * -> Token::Quantifier
166							# + ? -> Token::Quantifier, Token::Greediness
167							# . -> Token::CharClass::Simple
168							# \ -> Token::Control, Token::CharClass::Simple, Token::Assertion,
169							# Token::Backreference
170							# ( ) { } [ ] -> Token::Structure
171							# ^ -> Token::Assertion
172							# \| - -> Token::Operator
173
174							my %regex_set_operator = map { $_ => 1 } qw{ & + \| - ^ ! };
175
176							# The regex for the extended white space available under regex sets in
177							# Perl 5.17.8 and in general in perl 5.17.9. I have been unable to get
178							# this to work under Perl 5.6.2, so for that we fall back to ASCII white
179							# space. The stringy eval is because I have been unable to get
180							# satisfaction out of either interpolated characters (in general) or
181							# eval-ed "\N{U+...}" (under 5.6.2) or \x{...} (ditto).
182							#
183							# See PPIx::Regexp::Structure::RegexSet for the documentation of this
184							# mess.
185							# my $white_space_re = $] >= 5.008 ?
186							# 'qr< \\A [\\s\\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
187							# 'qr< \\A \\s+ >smx';
188							#
189							# RT #91798
190							# The above turns out to be wrong, because \s matches too many
191							# characters. We need the following to get the right match. Note that
192							# \cK was added experimentally in 5.17.0 and made it into 5.18. The \N{}
193							# characters were NOT added (as I originally thought) but were simply
194							# made characters that generated warnings when escaped, in preparation
195							# for adding them. When they actually get added, I will have to add back
196							# the trinary operator. Sigh.
197							# my $white_space_re = 'qr< \A [\t\n\cK\f\r ] >smx';
198							#
199							# The extended white space characters came back in Perl 5.21.1.
200							my $white_space_re = $] >= 5.008 ?
201							'qr< \\A [\\t\\n\\cK\\f\\r \\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
202							'qr< \\A [\\t\\n\\cK\\f\\r ]+ >smx';
203							$white_space_re = eval $white_space_re; ## no critic (ProhibitStringyEval)
204
205							my %regex_pass_on = map { $_ => 1 } qw{ [ ] ( ) $ \ };
206
207							sub __PPIX_TOKENIZER__regexp {
208	2824			2824		6301	my ( undef, $tokenizer, $character ) = @_; # Invocant, $char_type unused
209
210	2824	100				7990	if ( $tokenizer->cookie( COOKIE_REGEX_SET ) ) {
211							# If we're inside a regex set no literals are allowed, but not
212							# all characters that get here are seen as literals.
213
214	105	100				366	$regex_set_operator{$character}
215							and return $tokenizer->make_token(
216							length $character, 'PPIx::Regexp::Token::Operator' );
217
218	92					161	my $accept;
219
220							# As of 5.23.4, only space and horizontal tab are legal white
221							# space inside a bracketed class inside an extended character
222							# class
223	92	100				208	$accept = $tokenizer->find_regexp(
		100
224							$tokenizer->cookie( COOKIE_CLASS ) ?
225							qr{ \A [ \t] }smx :
226							$white_space_re
227							)
228							and return $tokenizer->make_token(
229							$accept, 'PPIx::Regexp::Token::Whitespace' );
230
231	59	50				193	$accept = _escaped( $tokenizer, $character )
232							and return $accept;
233
234	59	100				246	$regex_pass_on{$character}
235							and return;
236
237							# At this point we have a single character which is poised to be
238							# interpreted as a literal. These are not legal in a regex set
239							# except when also in a bracketed class.
240	8	50				46	return $tokenizer->cookie( COOKIE_CLASS ) ?
241							length $character :
242							$tokenizer->make_token(
243							length $character, TOKEN_UNKNOWN, {
244							error => 'Literal not valid in Regex set',
245							},
246							);
247
248							} else {
249
250							# Otherwise handle the characters that may or may not be
251							# literals depending on whether or not we are in a character
252							# class.
253	2719	100				8363	if ( my $class = $double_agent{$character} ) {
254	178	100				739	my $inx = $tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0;
255	178					906	return $class->[$inx];
256							}
257							}
258
259							# If /x is in effect _and_ we are not inside a character class, \s
260							# is whitespace, and '#' introduces a comment. Otherwise they are
261							# both literals.
262	2541	100	100			6925	if ( $tokenizer->modifier( 'x*' ) &&
		100	66
263							! $tokenizer->cookie( COOKIE_CLASS ) ) {
264	276					511	my $accept;
265	276	100				782	$accept = $tokenizer->find_regexp( $white_space_re )
266							and return $tokenizer->make_token(
267							$accept, 'PPIx::Regexp::Token::Whitespace' );
268	181	100				860	$accept = $tokenizer->find_regexp(
269							qr{ \A \# [^\n]* (?: \n \| \z) }smx )
270							and return $tokenizer->make_token(
271							$accept, 'PPIx::Regexp::Token::Comment' );
272							} elsif ( $tokenizer->modifier( 'xx' ) &&
273							$tokenizer->cookie( COOKIE_CLASS ) ) {
274	10					18	my $accept;
275	10	100				51	$accept = $tokenizer->find_regexp( qr{ \A [ \t] }smx )
276							and return $tokenizer->make_token(
277							$accept, 'PPIx::Regexp::Token::Whitespace',
278							{ perl_version_introduced => '5.025009' },
279							);
280							} else {
281	2255	100	100			11151	( $character eq '#' \|\| $character =~ m/ \A \s \z /smx )
282							and return 1;
283							}
284
285	2382					4283	my $accept;
286	2382	100				6032	$accept = _escaped( $tokenizer, $character )
287							and return $accept;
288
289							# All other characters which are not extra ordinary get accepted.
290	2305	100				7818	$extra_ordinary{$character} or return 1;
291
292	1233					3028	return;
293							}
294
295							=begin comment
296
297							The following is from perlop:
298
299							The character following "\c" is mapped to some other character by
300							converting letters to upper case and then (on ASCII systems) by
301							inverting the 7th bit (0x40). The most interesting range is from '@' to
302							'_' (0x40 through 0x5F), resulting in a control character from 0x00
303							through 0x1F. A '?' maps to the DEL character. On EBCDIC systems only
304							'@', the letters, '[', '\', ']', '^', '_' and '?' will work, resulting
305							in 0x00 through 0x1F and 0x7F.
306
307							=end comment
308
309							=cut
310
311							# Recognize all the escaped constructions that generate literal
312							# characters in one gigantic regexp. Technically \1.. through \7.. are
313							# octal literals too, but we can not disambiguate these from back
314							# references until we know how many there are. So the lexer gets another
315							# dirty job.
316
317							{
318							my %special = (
319							'\\N{}' => sub {
320							my ( $tokenizer, $accept ) = @_;
321
322							=begin comment
323
324							$tokenizer->strict()
325							or return $tokenizer->make_token( $accept,
326							'PPIx::Regexp::Token::NoOp', {
327							perl_version_removed => '5.027001',
328							},
329							);
330							return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
331							error => join( ' ',
332							'Empty Unicode character name',
333							MSG_PROHIBITED_BY_STRICT ),
334							perl_version_introduced => '5.023008',
335							perl_version_removed => '5.027001',
336							},
337							);
338
339							=end comment
340
341							=cut
342
343							return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
344							error => 'Empty Unicode character name',
345							perl_version_introduced => '5.023008',
346							perl_version_removed => '5.027001',
347							},
348							);
349							},
350
351							'\\o{}' => sub {
352							my ( $tokenizer, $accept ) = @_;
353							return $tokenizer->make_token( $accept, TOKEN_UNKNOWN, {
354							error => q,
355							},
356							);
357							},
358
359							'\\x{}' => sub {
360							my ( $tokenizer, $accept ) = @_;
361							$tokenizer->strict()
362							and return $tokenizer->make_token( $accept,
363							TOKEN_UNKNOWN, {
364							error =>
365							q,
366							},
367							);
368							return $accept;
369							},
370							);
371
372							sub _escaped {
373	2441			2441		4800	my ( $tokenizer, $character ) = @_;
374
375	2441	100				7124	$character eq '\\'
376							or return;
377
378	304	100				1753	if ( my $accept = $tokenizer->find_regexp( # {
379							qr< \A \\ ( [ox] ) [{] ( [^}]* ) [}] >smx
380							) ) {
381	8					42	my $match = $tokenizer->match();
382	8					22	my $code;
383	8	100				74	$code = $special{$match}
384							and return $code->( $tokenizer, $accept );
385	5					39	my ( $kind, $value ) = $tokenizer->capture();
386							my $invalid = {
387							o => qr<[^0-7]>smx,
388							x => qr<[[:^xdigit:]]>smx,
389	5					46	}->{$kind};
390	5	100				169	$value =~ m/ $invalid /smxg # /g for pos()
391							or return $accept;
392							$tokenizer->strict()
393							and return $tokenizer->make_token( $accept,
394							TOKEN_UNKNOWN, {
395							error => sprintf(
396							'Non-%s character in \\%s{...}',
397							{
398							o => 'octal',
399							x => 'hex',
400	2	100				137	}->{$kind},
401							$kind,
402							),
403							},
404							);
405							return $tokenizer->make_token( $accept, __PACKAGE__, {
406							ordinal => {
407	1			1		8	o => sub { oct $_[0] },
408	0			0		0	x => sub { hex $_[0] },
409	1		50			25	}->{$kind}->( substr( $value, 0, pos $value ) \|\| 0 ),
410							},
411							);
412							}
413
414	296	100				1628	if ( my $accept = $tokenizer->find_regexp(
415							qr< \A \\ (?:
416							[^\w\s] \| # delimiters/metas
417							[tnrfae] \| # C-style escapes
418							0 [01234567]{0,2} \| # octal
419							# [01234567]{1,3} \| # made from backref by lexer
420							c [][\@[:alpha:]\\^_?] \| # control characters
421							## x (?: \{ [[:xdigit:]]* \} \| [[:xdigit:]]{0,2} ) \| # hex
422							## o [{] [01234567]+ [}] \| # octal as of 5.13.3
423							x [[:xdigit:]]{0,2} \| # hex - brackets handled above
424							## N (?: \{ (?: [[:alpha:]] [\w\s:()-]* \| # must begin w/ alpha
425							## U [+] [[:xdigit:]]+ ) \} ) \| # unicode
426							N (?: [{] (?= [^0-9] ) [^\}]* [}] ) # unicode
427							) >smx ) ) {
428	69					320	my $match = $tokenizer->match();
429	69					125	my $code;
430	69	100				311	$code = $special{$match}
431							and return $code->( $tokenizer, $accept );
432	63					334	return $accept;
433							}
434
435	227					822	return;
436							}
437							}
438
439							=head2 ordinal
440
441							print 'The ordinal of ', $token->content(),
442							' is ', $token->ordinal(), "\n";
443
444							This method returns the ordinal of the literal if it can figure it out.
445							It is analogous to the C built-in.
446
447							It will not attempt to determine the ordinal of a unicode name
448							(C<\N{...}>) unless L has been loaded, and supports
449							the L)> function.
450							Instead, it will return C. Users of Perl 5.6.2 and older may be
451							out of luck here.
452
453							Unicode code points (e.g. C<\N{U+abcd}>) should work independently of
454							L, and just return the value of C.
455
456							It will never attempt to return the ordinal of an octet (C<\C{...}>)
457							because I don't understand the syntax.
458
459							=cut
460
461							{
462
463							my %escapes = (
464							'\\t' => ord "\t",
465							'\\n' => ord "\n",
466							'\\r' => ord "\r",
467							'\\f' => ord "\f",
468							'\\a' => ord "\a",
469							'\\b' => ord "\b",
470							'\\e' => ord "\e",
471							'\\c?' => ord "\c?",
472							'\\c@' => ord "\c@",
473							'\\cA' => ord "\cA",
474							'\\ca' => ord "\cA",
475							'\\cB' => ord "\cB",
476							'\\cb' => ord "\cB",
477							'\\cC' => ord "\cC",
478							'\\cc' => ord "\cC",
479							'\\cD' => ord "\cD",
480							'\\cd' => ord "\cD",
481							'\\cE' => ord "\cE",
482							'\\ce' => ord "\cE",
483							'\\cF' => ord "\cF",
484							'\\cf' => ord "\cF",
485							'\\cG' => ord "\cG",
486							'\\cg' => ord "\cG",
487							'\\cH' => ord "\cH",
488							'\\ch' => ord "\cH",
489							'\\cI' => ord "\cI",
490							'\\ci' => ord "\cI",
491							'\\cJ' => ord "\cJ",
492							'\\cj' => ord "\cJ",
493							'\\cK' => ord "\cK",
494							'\\ck' => ord "\cK",
495							'\\cL' => ord "\cL",
496							'\\cl' => ord "\cL",
497							'\\cM' => ord "\cM",
498							'\\cm' => ord "\cM",
499							'\\cN' => ord "\cN",
500							'\\cn' => ord "\cN",
501							'\\cO' => ord "\cO",
502							'\\co' => ord "\cO",
503							'\\cP' => ord "\cP",
504							'\\cp' => ord "\cP",
505							'\\cQ' => ord "\cQ",
506							'\\cq' => ord "\cQ",
507							'\\cR' => ord "\cR",
508							'\\cr' => ord "\cR",
509							'\\cS' => ord "\cS",
510							'\\cs' => ord "\cS",
511							'\\cT' => ord "\cT",
512							'\\ct' => ord "\cT",
513							'\\cU' => ord "\cU",
514							'\\cu' => ord "\cU",
515							'\\cV' => ord "\cV",
516							'\\cv' => ord "\cV",
517							'\\cW' => ord "\cW",
518							'\\cw' => ord "\cW",
519							'\\cX' => ord "\cX",
520							'\\cx' => ord "\cX",
521							'\\cY' => ord "\cY",
522							'\\cy' => ord "\cY",
523							'\\cZ' => ord "\cZ",
524							'\\cz' => ord "\cZ",
525							'\\c[' => ord "\c[",
526							'\\c\\\\' => ord "\c\\", # " # Get Vim's head straight.
527							'\\c]' => ord "\c]",
528							'\\c^' => ord "\c^",
529							'\\c_' => ord "\c_",
530							);
531
532							sub ordinal {
533	17			17	1	59	my ( $self ) = @_;
534	17	100				57	exists $self->{ordinal} and return $self->{ordinal};
535	16					43	return ( $self->{ordinal} = $self->_ordinal() );
536							}
537
538							my %octal = map {; "$_" => 1 } ( 0 .. 7 );
539
540							sub _ordinal {
541	16			16		33	my ( $self ) = @_;
542	16					42	my $content = $self->content();
543
544	16	100				92	$content =~ m/ \A \\ /smx or return ord $content;
545
546	8	100				50	exists $escapes{$content} and return $escapes{$content};
547
548	6					18	my $indicator = substr $content, 1, 1;
549
550	6	100				31	$octal{$indicator} and return oct substr $content, 1;
551
552	5	100				23	if ( $indicator eq 'x' ) {
553	3	100				32	$content =~ m/ \A \\ x \{ ( [[:xdigit:]]* ) /smx
554							and return hex "0$1";
555	1	50				10	$content =~ m/ \A \\ x ( [[:xdigit:]]{0,2} ) \z /smx
556							and return hex $1;
557	0					0	return;
558							}
559
560	2	50				11	if ( $indicator eq 'o' ) {
561	0	0				0	$content =~ m/ \A \\ o [{] ( [01234567]* ) \z /smx
562							and return oct "0$1";
563	0					0	return; # Shouldn't happen, but ...
564							}
565
566	2	50				9	if ( $indicator eq 'N' ) {
567	2	100				13	$content =~ m/ \A \\ N \{ U [+] ( [[:xdigit:]]+ ) \} \z /smx
568							and return hex $1;
569	1	50				9	$content =~ m/ \A \\ N [{] ( .+ ) [}] \z /smx
		50
570							and return (
571							_have_charnames_vianame() ?
572							charnames::vianame( $1 ) :
573							undef
574							);
575	0					0	return; # Shouldn't happen, but ...
576							}
577
578	0					0	return ord $indicator;
579							}
580
581							}
582
583							sub __following_literal_left_curly_disallowed_in {
584	2			2		12	return LITERAL_LEFT_CURLY_REMOVED_PHASE_2;
585							}
586
587							{
588							my $have_charnames_vianame;
589
590							sub _have_charnames_vianame {
591	1	50		1		5	defined $have_charnames_vianame
592							and return $have_charnames_vianame;
593							return (
594	1	50				22	$have_charnames_vianame =
595							charnames->can( 'vianame' ) ? 1 : 0
596							);
597
598							}
599							}
600
601							sub __perl_requirements_setup {
602	4			4		17	my ( $self ) = @_;
603	4					6	my $prev;
604	4	50	66			12	q<{> eq $self->content() # }
			66
605							and $prev = $self->sprevious_sibling()
606							and $prev->isa( 'PPIx::Regexp::Token::Literal' )
607							or return $self->SUPER::__perl_requirements_setup();
608							return (
609							{
610	1					11	introduced => MINIMUM_PERL,
611							removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_1,
612							},
613							# TODO the following will be needed if this construction is
614							# re-allowed in 5.26.1:
615							# {
616							# introduced => '5.026001',
617							# removed => '6.027000',
618							# },
619							{
620							introduced => '5.027001',
621							removed => LITERAL_LEFT_CURLY_REMOVED_PHASE_2,
622							},
623							);
624							}
625
626							*__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
627
628							1;
629
630							__END__