File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm

Criterion	Covered	Total	%
statement	107	108	99.0
branch	47	54	87.0
condition	5	5	100.0
subroutine	18	18	100.0
pod	3	3	100.0
total	180	188	95.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Structure - Represent structural elements.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9							->print();
10
11							=head1 INHERITANCE
12
13							C is a
14							L.
15
16							C is the parent of
17							L.
18
19							=head1 DESCRIPTION
20
21							This class represents things that define the structure of the regular
22							expression. This typically means brackets of various sorts, but to
23							prevent proliferation of token classes the type of the regular
24							expression is stored here.
25
26							=head1 METHODS
27
28							This class provides no public methods beyond those provided by its
29							superclass.
30
31							=cut
32
33							package PPIx::Regexp::Token::Structure;
34
35	9			9		58	use strict;
	9					13
	9					231
36	9			9		28	use warnings;
	9					13
	9					366
37
38	9			9		38	use base qw{ PPIx::Regexp::Token };
	9					11
	9					786
39
40	9					1069	use PPIx::Regexp::Constant qw{
41							COOKIE_CLASS
42							COOKIE_QUANT
43							COOKIE_REGEX_SET
44							MINIMUM_PERL
45							TOKEN_LITERAL
46							@CARP_NOT
47	9			9		38	};
	9					10
48
49							# Tokens we are responsible for making, under at least some
50							# circumstances.
51	9			9		48	use PPIx::Regexp::Token::Comment ();
	9					13
	9					132
52	9			9		4146	use PPIx::Regexp::Token::Modifier ();
	9					21
	9					200
53	9			9		74	use PPIx::Regexp::Token::Backreference ();
	9					12
	9					99
54	9			9		28	use PPIx::Regexp::Token::Backtrack ();
	9					13
	9					77
55	9			9		3410	use PPIx::Regexp::Token::Recursion ();
	9					21
	9					9026
56
57							our $VERSION = '0.091_01';
58
59							# Return true if the token can be quantified, and false otherwise
60
61							my %quant = map { $_ => 1 } ')', ']';
62							sub can_be_quantified {
63	524			524	1	750	my ( $self ) = @_;
64	524	50				964	ref $self or return;
65	524					1300	return $quant{ $self->content() };
66							};
67
68							{
69
70							my %explanation = (
71							'' => 'Match regexp',
72							'(' => 'Capture or grouping',
73							'(?[' => 'Extended character class',
74							')' => 'End capture or grouping',
75							'[' => 'Character class',
76							']' => 'End character class',
77							'])' => 'End extended character class',
78							'm' => 'Match regexp',
79							'qr' => 'Regexp object definition',
80							's' => 'Replace regexp with string or expression',
81							'{' => 'Explicit quantifier',
82							'}' => 'End explicit quantifier',
83							);
84
85							sub __explanation {
86	7			7		15	return \%explanation;
87							}
88
89							}
90
91							sub is_quantifier {
92	540			540	1	849	my ( $self ) = @_;
93	540	50				901	ref $self or return;
94	540					1535	return $self->{is_quantifier};
95							}
96
97							{
98
99							# Note that the implementation equivocates on the ::Token::Structure
100							# class, using it both for the initial token that determines the
101							# type of the regex and things like parentheses internal to the
102							# regex. Rather than sort out this equivocation, I have relied on
103							# the currently-true assumption that 'qr' will not satisfy the
104							# ::Token::Structure recognition logic, and the only way this class
105							# can acquire this content is by the brute-force approach used to
106							# generate the initial token object.
107
108							my %perl_version_introduced = (
109							qr => '5.005',
110							'(?[' => '5.017008',
111							);
112
113							sub perl_version_introduced {
114	222			222	1	3379	my ( $self ) = @_;
115							return $self->{perl_version_introduced} \|\|
116	222		100			783	$perl_version_introduced{ $self->content() } \|\|
117							MINIMUM_PERL;
118							}
119							}
120
121							{
122
123							my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124
125							# Regular expressions to match various parenthesized tokens, and the
126							# classes to make them into.
127
128							my @paren_token = map {
129							[ $_ => $_->__PPIX_TOKEN__recognize() ]
130							}
131							'PPIx::Regexp::Token::Comment',
132							'PPIx::Regexp::Token::Modifier',
133							'PPIx::Regexp::Token::Backreference',
134							'PPIx::Regexp::Token::Backtrack',
135							'PPIx::Regexp::Token::Recursion',
136							;
137
138							sub __PPIX_TOKENIZER__regexp {
139	1046			1046		1509	my ( undef, $tokenizer, $character ) = @_;
140
141							# We are not interested in anything but delimiters.
142	1046	100				2146	$delim{$character} or return;
143
144							# Inside a character class, all the delimiters are normal characters
145							# except for the close square bracket.
146	893	100				1472	if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147	59	100				144	$character eq ']'
148							or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149	53					130	$tokenizer->cookie( COOKIE_CLASS, undef );
150	53					221	return 1;
151							}
152
153							# Open parentheses have various interesting possibilities ...
154	834	100				1512	if ( $character eq '(' ) {
155
156							# Sometimes the whole bunch of parenthesized characters seems
157							# naturally to be a token.
158	347					643	foreach ( @paren_token ) {
159	1659					1769	my ( $class, @recognize ) = @{ $_ };
	1659					2824
160	1659					1990	foreach ( @recognize ) {
161	2595					2610	my ( $regexp, $arg ) = @{ $_ };
	2595					3256
162	2595	100				3758	my $accept = $tokenizer->find_regexp( $regexp ) or next;
163	55					159	return $tokenizer->make_token( $accept, $class, $arg );
164							}
165							}
166
167							# Modifier changes are local to this parenthesis group
168	292					810	$tokenizer->modifier_duplicate();
169
170							# The regex-set functionality introduced with 5.17.8 is most
171							# conveniently handled by treating the initial '(?[' and
172							# final '])' as ::Structure tokens. Fortunately for us,
173							# perl5178delta documents that these may not have interior
174							# spaces.
175
176	292	100				776	if ( my $accept = $tokenizer->find_regexp(
177							qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178							)
179							) {
180	8			105		38	$tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
	105					206
181	8					26	$tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182	8					20	return $accept;
183							}
184
185							# We expect certain tokens only after a left paren.
186							$tokenizer->expect(
187	284					950	'PPIx::Regexp::Token::GroupType::Modifier',
188							'PPIx::Regexp::Token::GroupType::NamedCapture',
189							'PPIx::Regexp::Token::GroupType::Assertion',
190							'PPIx::Regexp::Token::GroupType::Code',
191							'PPIx::Regexp::Token::GroupType::BranchReset',
192							'PPIx::Regexp::Token::GroupType::Subexpression',
193							'PPIx::Regexp::Token::GroupType::Switch',
194							'PPIx::Regexp::Token::GroupType::Script_Run',
195							'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196							);
197
198							# Accept the parenthesis.
199	284					587	return 1;
200							}
201
202							# Close parentheses end modifier localization
203	487	100				920	if ( $character eq ')' ) {
204	288					830	$tokenizer->modifier_pop();
205	288					492	return 1;
206							}
207
208							# Open curlys are complicated because they may or may not represent
209							# the beginning of a quantifier, depending on what comes before the
210							# close curly. So we set a cookie to monitor the token stream for
211							# interlopers. If all goes well, the right curly will find the
212							# cookie and know it is supposed to be a quantifier.
213	199	100				662	if ( $character eq '{' ) {
214
215							# If the prior token can not be quantified, all this is
216							# unnecessary.
217	71	100				138	$tokenizer->prior_significant_token( 'can_be_quantified' )
218							or return 1;
219
220							# We make our token now, before setting the cookie. Otherwise
221							# the cookie has to deal with this token.
222	61					128	my $token = $tokenizer->make_token( 1 );
223
224							# A cookie for the next '}'.
225	61					80	my $commas = 0;
226	61					76	my $allow_digit = 1;
227							$tokenizer->cookie( COOKIE_QUANT, sub {
228	115			115		200	my ( $tokenizer, $token ) = @_;
229	115	50				204	$token or return 1;
230
231							# Code for 5.33.6 and after.
232							# We allow {,...}, and we allow space inside and
233							# adjacent to the curlys, and around the comma if
234							# any. But not interior to the numbers.
235	115	100				268	if ( $token->isa( TOKEN_LITERAL ) ) {
236	105					197	my $character = $token->content();
237	105	100				253	if ( $character =~ m/ \A \s \z /smx ) {
238							# Digits only allowed if the prior
239							# significant was an open curly or a comma.
240	4					8	$allow_digit = $tokenizer->prior_significant_token(
241							'content' ) =~ m/ \A [{,] \z /smx; # }
242	4					11	return 1;
243							}
244	101	100				240	$character eq ','
245							and return( ! $commas++ );
246	68	50				201	$allow_digit
247							or return;
248	68					267	return $character =~ m/ \A [0-9] \z /smx;
249							}
250
251							# Since we do not know what is in an interpolation, we
252							# trustingly accept it.
253	10	100				95	if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254							) {
255	5					12	return 1;
256							}
257
258	5					49	return;
259							},
260	61					379	);
261
262	61					157	return $token;
263							}
264
265							# The close curly bracket is a little complicated because if the
266							# cookie posted by the left curly bracket is still around, we are a
267							# quantifier, otherwise not.
268	128	100				290	if ( $character eq '}' ) {
269	63	100				164	$tokenizer->cookie( COOKIE_QUANT, undef )
270							or return 1;
271	50	100				357	$tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272							and return 1;
273	47					116	my $token = $tokenizer->make_token( 1 );
274	47					86	$token->{is_quantifier} = 1;
275	47					95	return $token;
276							}
277
278							# The parse rules are different inside a character class, so we set
279							# another cookie. Sigh. If your tool is a hammer ...
280	65	100				158	if ( $character eq '[' ) {
281
282							# Set our cookie. Since it always returns 1, it does not matter
283							# where in the following mess we set it.
284	53			226		238	$tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
	226					443
285
286							# Make our token now, since the easiest place to deal with the
287							# beginning-of-character-class strangeness seems to be right
288							# here.
289	53					128	my @tokens = $tokenizer->make_token( 1 );
290
291							# Get the next character, returning tokens if there is none.
292	53	50				161	defined ( $character = $tokenizer->peek() )
293							or return @tokens;
294
295							# If we have a caret, it is a negation operator. Make its token
296							# and fetch the next character, returning if none.
297	53	100				122	if ( $character eq '^' ) {
298	5					14	push @tokens, $tokenizer->make_token(
299							1, 'PPIx::Regexp::Token::Operator' );
300	5	50				14	defined ( $character = $tokenizer->peek() )
301							or return @tokens;
302							}
303
304							# If we have a close square at this point, it is not the end of
305							# the class, but just a literal. Make its token.
306	53	100				138	$character eq ']'
307							and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308
309							# Return all tokens made.
310	53					144	return @tokens;
311							}
312							# per perlop, the metas inside a [] are -]\^$.
313							# per perlop, the metas outside a [] are {}[]()^$.\|*+?\
314							# The difference is that {}[().\|*+? are not metas in [], but - is.
315
316							# Close bracket is complicated by the addition of regex sets.
317							# And more complicated by the fact that you can have an
318							# old-style character class inside a regex set. Fortunately they
319							# have not (yet!) permitted nested regex sets.
320	12	50				34	if ( $character eq ']' ) {
321
322							# If we find '])' and COOKIE_REGEX_SET is present, we have a
323							# regex set. We need to delete the cookie and accept both
324							# characters.
325	12	100	100			32	if ( ( my $accept = $tokenizer->find_regexp(
326							# help vim - ( [
327							qr{ \A []] [)] }smx
328							) )
329							&& $tokenizer->cookie( COOKIE_REGEX_SET )
330
331							) {
332	8					30	$tokenizer->cookie( COOKIE_REGEX_SET, undef );
333	8					33	return $accept;
334							}
335
336							# Otherwise we assume we're in a bracketed character class,
337							# delete the cookie, and accept the close bracket.
338	4					11	$tokenizer->cookie( COOKIE_CLASS, undef );
339	4					8	return 1;
340							}
341
342	0					0	return 1;
343							}
344
345							}
346
347							# Called by the lexer once it has done its worst to all the tokens.
348							# Called as a method with no arguments. The return is the number of
349							# parse failures discovered when finalizing.
350							sub __PPIX_LEXER__finalize {
351	1460			1460		1823	my ( $self ) = @_;
352	1460					1666	delete $self->{is_quantifier};
353	1460					2171	return 0;
354							}
355
356							1;
357
358							__END__