File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm

Criterion	Covered	Total	%
statement	107	108	99.0
branch	47	54	87.0
condition	5	5	100.0
subroutine	18	18	100.0
pod	3	3	100.0
total	180	188	95.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Structure - Represent structural elements.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9							->print();
10
11							=head1 INHERITANCE
12
13							C is a
14							L.
15
16							C is the parent of
17							L.
18
19							=head1 DESCRIPTION
20
21							This class represents things that define the structure of the regular
22							expression. This typically means brackets of various sorts, but to
23							prevent proliferation of token classes the type of the regular
24							expression is stored here.
25
26							=head1 METHODS
27
28							This class provides no public methods beyond those provided by its
29							superclass.
30
31							=cut
32
33							package PPIx::Regexp::Token::Structure;
34
35	9			9		81	use strict;
	9					17
	9					272
36	9			9		46	use warnings;
	9					19
	9					263
37
38	9			9		47	use base qw{ PPIx::Regexp::Token };
	9					21
	9					853
39
40	9					985	use PPIx::Regexp::Constant qw{
41							COOKIE_CLASS
42							COOKIE_QUANT
43							COOKIE_REGEX_SET
44							MINIMUM_PERL
45							TOKEN_LITERAL
46							@CARP_NOT
47	9			9		78	};
	9					22
48
49							# Tokens we are responsible for making, under at least some
50							# circumstances.
51	9			9		62	use PPIx::Regexp::Token::Comment ();
	9					18
	9					283
52	9			9		4883	use PPIx::Regexp::Token::Modifier ();
	9					26
	9					227
53	9			9		87	use PPIx::Regexp::Token::Backreference ();
	9					20
	9					129
54	9			9		46	use PPIx::Regexp::Token::Backtrack ();
	9					21
	9					126
55	9			9		5410	use PPIx::Regexp::Token::Recursion ();
	9					29
	9					11296
56
57							our $VERSION = '0.087';
58
59							# Return true if the token can be quantified, and false otherwise
60
61							my %quant = map { $_ => 1 } ')', ']';
62							sub can_be_quantified {
63	524			524	1	1180	my ( $self ) = @_;
64	524	50				1391	ref $self or return;
65	524					1811	return $quant{ $self->content() };
66							};
67
68							{
69
70							my %explanation = (
71							'' => 'Match regexp',
72							'(' => 'Capture or grouping',
73							'(?[' => 'Extended character class',
74							')' => 'End capture or grouping',
75							'[' => 'Character class',
76							']' => 'End character class',
77							'])' => 'End extended character class',
78							'm' => 'Match regexp',
79							'qr' => 'Regexp object definition',
80							's' => 'Replace regexp with string or expression',
81							'{' => 'Explicit quantifier',
82							'}' => 'End explicit quantifier',
83							);
84
85							sub __explanation {
86	7			7		18	return \%explanation;
87							}
88
89							}
90
91							sub is_quantifier {
92	540			540	1	1766	my ( $self ) = @_;
93	540	50				1503	ref $self or return;
94	540					2196	return $self->{is_quantifier};
95							}
96
97							{
98
99							# Note that the implementation equivocates on the ::Token::Structure
100							# class, using it both for the initial token that determines the
101							# type of the regex and things like parentheses internal to the
102							# regex. Rather than sort out this equivocation, I have relied on
103							# the currently-true assumption that 'qr' will not satisfy the
104							# ::Token::Structure recognition logic, and the only way this class
105							# can acquire this content is by the brute-force approach used to
106							# generate the initial token object.
107
108							my %perl_version_introduced = (
109							qr => '5.005',
110							'(?[' => '5.017008',
111							);
112
113							sub perl_version_introduced {
114	222			222	1	3878	my ( $self ) = @_;
115							return $self->{perl_version_introduced} \|\|
116	222		100			935	$perl_version_introduced{ $self->content() } \|\|
117							MINIMUM_PERL;
118							}
119							}
120
121							{
122
123							my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124
125							# Regular expressions to match various parenthesized tokens, and the
126							# classes to make them into.
127
128							my @paren_token = map {
129							[ $_ => $_->__PPIX_TOKEN__recognize() ]
130							}
131							'PPIx::Regexp::Token::Comment',
132							'PPIx::Regexp::Token::Modifier',
133							'PPIx::Regexp::Token::Backreference',
134							'PPIx::Regexp::Token::Backtrack',
135							'PPIx::Regexp::Token::Recursion',
136							;
137
138							sub __PPIX_TOKENIZER__regexp {
139	1046			1046		2510	my ( undef, $tokenizer, $character ) = @_;
140
141							# We are not interested in anything but delimiters.
142	1046	100				3318	$delim{$character} or return;
143
144							# Inside a character class, all the delimiters are normal characters
145							# except for the close square bracket.
146	893	100				2459	if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147	59	100				322	$character eq ']'
148							or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149	53					214	$tokenizer->cookie( COOKIE_CLASS, undef );
150	53					283	return 1;
151							}
152
153							# Open parentheses have various interesting possibilities ...
154	834	100				2614	if ( $character eq '(' ) {
155
156							# Sometimes the whole bunch of parenthesized characters seems
157							# naturally to be a token.
158	347					931	foreach ( @paren_token ) {
159	1659					2604	my ( $class, @recognize ) = @{ $_ };
	1659					4300
160	1659					2855	foreach ( @recognize ) {
161	2595					3582	my ( $regexp, $arg ) = @{ $_ };
	2595					4904
162	2595	100				5291	my $accept = $tokenizer->find_regexp( $regexp ) or next;
163	55					279	return $tokenizer->make_token( $accept, $class, $arg );
164							}
165							}
166
167							# Modifier changes are local to this parenthesis group
168	292					1307	$tokenizer->modifier_duplicate();
169
170							# The regex-set functionality introduced with 5.17.8 is most
171							# conveniently handled by treating the initial '(?[' and
172							# final '])' as ::Structure tokens. Fortunately for us,
173							# perl5178delta documents that these may not have interior
174							# spaces.
175
176	292	100				1348	if ( my $accept = $tokenizer->find_regexp(
177							qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178							)
179							) {
180	8			105		77	$tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
	105					310
181	8					63	$tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182	8					89	return $accept;
183							}
184
185							# We expect certain tokens only after a left paren.
186							$tokenizer->expect(
187	284					1730	'PPIx::Regexp::Token::GroupType::Modifier',
188							'PPIx::Regexp::Token::GroupType::NamedCapture',
189							'PPIx::Regexp::Token::GroupType::Assertion',
190							'PPIx::Regexp::Token::GroupType::Code',
191							'PPIx::Regexp::Token::GroupType::BranchReset',
192							'PPIx::Regexp::Token::GroupType::Subexpression',
193							'PPIx::Regexp::Token::GroupType::Switch',
194							'PPIx::Regexp::Token::GroupType::Script_Run',
195							'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196							);
197
198							# Accept the parenthesis.
199	284					946	return 1;
200							}
201
202							# Close parentheses end modifier localization
203	487	100				2111	if ( $character eq ')' ) {
204	288					1405	$tokenizer->modifier_pop();
205	288					794	return 1;
206							}
207
208							# Open curlys are complicated because they may or may not represent
209							# the beginning of a quantifier, depending on what comes before the
210							# close curly. So we set a cookie to monitor the token stream for
211							# interlopers. If all goes well, the right curly will find the
212							# cookie and know it is supposed to be a quantifier.
213	199	100				573	if ( $character eq '{' ) {
214
215							# If the prior token can not be quantified, all this is
216							# unnecessary.
217	71	100				218	$tokenizer->prior_significant_token( 'can_be_quantified' )
218							or return 1;
219
220							# We make our token now, before setting the cookie. Otherwise
221							# the cookie has to deal with this token.
222	61					301	my $token = $tokenizer->make_token( 1 );
223
224							# A cookie for the next '}'.
225	61					150	my $commas = 0;
226	61					142	my $allow_digit = 1;
227							$tokenizer->cookie( COOKIE_QUANT, sub {
228	115			115		400	my ( $tokenizer, $token ) = @_;
229	115	50				319	$token or return 1;
230
231							# Code for 5.33.6 and after.
232							# We allow {,...}, and we allow space inside and
233							# adjacent to the curlys, and around the comma if
234							# any. But not interior to the numbers.
235	115	100				440	if ( $token->isa( TOKEN_LITERAL ) ) {
236	105					270	my $character = $token->content();
237	105	100				424	if ( $character =~ m/ \A \s \z /smx ) {
238							# Digits only allowed if the prior
239							# significant was an open curly or a comma.
240	4					13	$allow_digit = $tokenizer->prior_significant_token(
241							'content' ) =~ m/ \A [{,] \z /smx; # }
242	4					15	return 1;
243							}
244	101	100				400	$character eq ','
245							and return( ! $commas++ );
246	68	50				206	$allow_digit
247							or return;
248	68					446	return $character =~ m/ \A [0-9] \z /smx;
249							}
250
251							# Since we do not know what is in an interpolation, we
252							# trustingly accept it.
253	10	100				81	if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254							) {
255	5					21	return 1;
256							}
257
258	5					55	return;
259							},
260	61					582	);
261
262	61					191	return $token;
263							}
264
265							# The close curly bracket is a little complicated because if the
266							# cookie posted by the left curly bracket is still around, we are a
267							# quantifier, otherwise not.
268	128	100				552	if ( $character eq '}' ) {
269	63	100				215	$tokenizer->cookie( COOKIE_QUANT, undef )
270							or return 1;
271	50	100				449	$tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272							and return 1;
273	47					168	my $token = $tokenizer->make_token( 1 );
274	47					197	$token->{is_quantifier} = 1;
275	47					166	return $token;
276							}
277
278							# The parse rules are different inside a character class, so we set
279							# another cookie. Sigh. If your tool is a hammer ...
280	65	100				257	if ( $character eq '[' ) {
281
282							# Set our cookie. Since it always returns 1, it does not matter
283							# where in the following mess we set it.
284	53			226		385	$tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
	226					700
285
286							# Make our token now, since the easiest place to deal with the
287							# beginning-of-character-class strangeness seems to be right
288							# here.
289	53					294	my @tokens = $tokenizer->make_token( 1 );
290
291							# Get the next character, returning tokens if there is none.
292	53	50				338	defined ( $character = $tokenizer->peek() )
293							or return @tokens;
294
295							# If we have a caret, it is a negation operator. Make its token
296							# and fetch the next character, returning if none.
297	53	100				215	if ( $character eq '^' ) {
298	5					21	push @tokens, $tokenizer->make_token(
299							1, 'PPIx::Regexp::Token::Operator' );
300	5	50				47	defined ( $character = $tokenizer->peek() )
301							or return @tokens;
302							}
303
304							# If we have a close square at this point, it is not the end of
305							# the class, but just a literal. Make its token.
306	53	100				244	$character eq ']'
307							and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308
309							# Return all tokens made.
310	53					219	return @tokens;
311							}
312							# per perlop, the metas inside a [] are -]\^$.
313							# per perlop, the metas outside a [] are {}[]()^$.\|*+?\
314							# The difference is that {}[().\|*+? are not metas in [], but - is.
315
316							# Close bracket is complicated by the addition of regex sets.
317							# And more complicated by the fact that you can have an
318							# old-style character class inside a regex set. Fortunately they
319							# have not (yet!) permitted nested regex sets.
320	12	50				79	if ( $character eq ']' ) {
321
322							# If we find '])' and COOKIE_REGEX_SET is present, we have a
323							# regex set. We need to delete the cookie and accept both
324							# characters.
325	12	100	100			64	if ( ( my $accept = $tokenizer->find_regexp(
326							# help vim - ( [
327							qr{ \A []] [)] }smx
328							) )
329							&& $tokenizer->cookie( COOKIE_REGEX_SET )
330
331							) {
332	8					76	$tokenizer->cookie( COOKIE_REGEX_SET, undef );
333	8					52	return $accept;
334							}
335
336							# Otherwise we assume we're in a bracketed character class,
337							# delete the cookie, and accept the close bracket.
338	4					20	$tokenizer->cookie( COOKIE_CLASS, undef );
339	4					10	return 1;
340							}
341
342	0					0	return 1;
343							}
344
345							}
346
347							# Called by the lexer once it has done its worst to all the tokens.
348							# Called as a method with no arguments. The return is the number of
349							# parse failures discovered when finalizing.
350							sub __PPIX_LEXER__finalize {
351	1452			1452		2466	my ( $self ) = @_;
352	1452					2348	delete $self->{is_quantifier};
353	1452					3288	return 0;
354							}
355
356							1;
357
358							__END__