File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm

Criterion	Covered	Total	%
statement	107	108	99.0
branch	47	54	87.0
condition	5	5	100.0
subroutine	18	18	100.0
pod	3	3	100.0
total	180	188	95.7

line	stmt	bran	cond	sub	pod	time	code
1							=head1 NAME
2
3							PPIx::Regexp::Token::Structure - Represent structural elements.
4
5							=head1 SYNOPSIS
6
7							use PPIx::Regexp::Dumper;
8							PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9							->print();
10
11							=head1 INHERITANCE
12
13							C is a
14							L.
15
16							C is the parent of
17							L.
18
19							=head1 DESCRIPTION
20
21							This class represents things that define the structure of the regular
22							expression. This typically means brackets of various sorts, but to
23							prevent proliferation of token classes the type of the regular
24							expression is stored here.
25
26							=head1 METHODS
27
28							This class provides no public methods beyond those provided by its
29							superclass.
30
31							=cut
32
33							package PPIx::Regexp::Token::Structure;
34
35	9			9		67	use strict;
	9					12
	9					245
36	9			9		28	use warnings;
	9					13
	9					329
37
38	9			9		30	use base qw{ PPIx::Regexp::Token };
	9					13
	9					720
39
40	9					1055	use PPIx::Regexp::Constant qw{
41							COOKIE_CLASS
42							COOKIE_QUANT
43							COOKIE_REGEX_SET
44							MINIMUM_PERL
45							TOKEN_LITERAL
46							@CARP_NOT
47	9			9		39	};
	9					14
48
49							# Tokens we are responsible for making, under at least some
50							# circumstances.
51	9			9		54	use PPIx::Regexp::Token::Comment ();
	9					15
	9					139
52	9			9		4098	use PPIx::Regexp::Token::Modifier ();
	9					25
	9					190
53	9			9		49	use PPIx::Regexp::Token::Backreference ();
	9					13
	9					92
54	9			9		26	use PPIx::Regexp::Token::Backtrack ();
	9					12
	9					77
55	9			9		3271	use PPIx::Regexp::Token::Recursion ();
	9					24
	9					8487
56
57							our $VERSION = '0.092';
58
59							# Return true if the token can be quantified, and false otherwise
60
61							my %quant = map { $_ => 1 } ')', ']';
62							sub can_be_quantified {
63	524			524	1	829	my ( $self ) = @_;
64	524	50				960	ref $self or return;
65	524					1203	return $quant{ $self->content() };
66							};
67
68							{
69
70							my %explanation = (
71							'' => 'Match regexp',
72							'(' => 'Capture or grouping',
73							'(?[' => 'Extended character class',
74							')' => 'End capture or grouping',
75							'[' => 'Character class',
76							']' => 'End character class',
77							'])' => 'End extended character class',
78							'm' => 'Match regexp',
79							'qr' => 'Regexp object definition',
80							's' => 'Replace regexp with string or expression',
81							'{' => 'Explicit quantifier',
82							'}' => 'End explicit quantifier',
83							);
84
85							sub __explanation {
86	7			7		13	return \%explanation;
87							}
88
89							}
90
91							sub is_quantifier {
92	540			540	1	759	my ( $self ) = @_;
93	540	50				852	ref $self or return;
94	540					1389	return $self->{is_quantifier};
95							}
96
97							{
98
99							# Note that the implementation equivocates on the ::Token::Structure
100							# class, using it both for the initial token that determines the
101							# type of the regex and things like parentheses internal to the
102							# regex. Rather than sort out this equivocation, I have relied on
103							# the currently-true assumption that 'qr' will not satisfy the
104							# ::Token::Structure recognition logic, and the only way this class
105							# can acquire this content is by the brute-force approach used to
106							# generate the initial token object.
107
108							my %perl_version_introduced = (
109							qr => '5.005',
110							'(?[' => '5.017008',
111							);
112
113							sub perl_version_introduced {
114	222			222	1	3156	my ( $self ) = @_;
115							return $self->{perl_version_introduced} \|\|
116	222		100			826	$perl_version_introduced{ $self->content() } \|\|
117							MINIMUM_PERL;
118							}
119							}
120
121							{
122
123							my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124
125							# Regular expressions to match various parenthesized tokens, and the
126							# classes to make them into.
127
128							my @paren_token = map {
129							[ $_ => $_->__PPIX_TOKEN__recognize() ]
130							}
131							'PPIx::Regexp::Token::Comment',
132							'PPIx::Regexp::Token::Modifier',
133							'PPIx::Regexp::Token::Backreference',
134							'PPIx::Regexp::Token::Backtrack',
135							'PPIx::Regexp::Token::Recursion',
136							;
137
138							sub __PPIX_TOKENIZER__regexp {
139	1046			1046		1483	my ( undef, $tokenizer, $character ) = @_;
140
141							# We are not interested in anything but delimiters.
142	1046	100				2366	$delim{$character} or return;
143
144							# Inside a character class, all the delimiters are normal characters
145							# except for the close square bracket.
146	893	100				1392	if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147	59	100				117	$character eq ']'
148							or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149	53					119	$tokenizer->cookie( COOKIE_CLASS, undef );
150	53					190	return 1;
151							}
152
153							# Open parentheses have various interesting possibilities ...
154	834	100				1495	if ( $character eq '(' ) {
155
156							# Sometimes the whole bunch of parenthesized characters seems
157							# naturally to be a token.
158	347					652	foreach ( @paren_token ) {
159	1659					1751	my ( $class, @recognize ) = @{ $_ };
	1659					2681
160	1659					1929	foreach ( @recognize ) {
161	2595					2538	my ( $regexp, $arg ) = @{ $_ };
	2595					3155
162	2595	100				3648	my $accept = $tokenizer->find_regexp( $regexp ) or next;
163	55					169	return $tokenizer->make_token( $accept, $class, $arg );
164							}
165							}
166
167							# Modifier changes are local to this parenthesis group
168	292					777	$tokenizer->modifier_duplicate();
169
170							# The regex-set functionality introduced with 5.17.8 is most
171							# conveniently handled by treating the initial '(?[' and
172							# final '])' as ::Structure tokens. Fortunately for us,
173							# perl5178delta documents that these may not have interior
174							# spaces.
175
176	292	100				720	if ( my $accept = $tokenizer->find_regexp(
177							qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178							)
179							) {
180	8			105		46	$tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
	105					175
181	8					30	$tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182	8					24	return $accept;
183							}
184
185							# We expect certain tokens only after a left paren.
186							$tokenizer->expect(
187	284					909	'PPIx::Regexp::Token::GroupType::Modifier',
188							'PPIx::Regexp::Token::GroupType::NamedCapture',
189							'PPIx::Regexp::Token::GroupType::Assertion',
190							'PPIx::Regexp::Token::GroupType::Code',
191							'PPIx::Regexp::Token::GroupType::BranchReset',
192							'PPIx::Regexp::Token::GroupType::Subexpression',
193							'PPIx::Regexp::Token::GroupType::Switch',
194							'PPIx::Regexp::Token::GroupType::Script_Run',
195							'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196							);
197
198							# Accept the parenthesis.
199	284					671	return 1;
200							}
201
202							# Close parentheses end modifier localization
203	487	100				869	if ( $character eq ')' ) {
204	288					738	$tokenizer->modifier_pop();
205	288					515	return 1;
206							}
207
208							# Open curlys are complicated because they may or may not represent
209							# the beginning of a quantifier, depending on what comes before the
210							# close curly. So we set a cookie to monitor the token stream for
211							# interlopers. If all goes well, the right curly will find the
212							# cookie and know it is supposed to be a quantifier.
213	199	100				402	if ( $character eq '{' ) {
214
215							# If the prior token can not be quantified, all this is
216							# unnecessary.
217	71	100				374	$tokenizer->prior_significant_token( 'can_be_quantified' )
218							or return 1;
219
220							# We make our token now, before setting the cookie. Otherwise
221							# the cookie has to deal with this token.
222	61					130	my $token = $tokenizer->make_token( 1 );
223
224							# A cookie for the next '}'.
225	61					110	my $commas = 0;
226	61					83	my $allow_digit = 1;
227							$tokenizer->cookie( COOKIE_QUANT, sub {
228	115			115		220	my ( $tokenizer, $token ) = @_;
229	115	50				213	$token or return 1;
230
231							# Code for 5.33.6 and after.
232							# We allow {,...}, and we allow space inside and
233							# adjacent to the curlys, and around the comma if
234							# any. But not interior to the numbers.
235	115	100				292	if ( $token->isa( TOKEN_LITERAL ) ) {
236	105					253	my $character = $token->content();
237	105	100				244	if ( $character =~ m/ \A \s \z /smx ) {
238							# Digits only allowed if the prior
239							# significant was an open curly or a comma.
240	4					8	$allow_digit = $tokenizer->prior_significant_token(
241							'content' ) =~ m/ \A [{,] \z /smx; # }
242	4					10	return 1;
243							}
244	101	100				251	$character eq ','
245							and return( ! $commas++ );
246	68	50				151	$allow_digit
247							or return;
248	68					262	return $character =~ m/ \A [0-9] \z /smx;
249							}
250
251							# Since we do not know what is in an interpolation, we
252							# trustingly accept it.
253	10	100				44	if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254							) {
255	5					13	return 1;
256							}
257
258	5					42	return;
259							},
260	61					369	);
261
262	61					147	return $token;
263							}
264
265							# The close curly bracket is a little complicated because if the
266							# cookie posted by the left curly bracket is still around, we are a
267							# quantifier, otherwise not.
268	128	100				280	if ( $character eq '}' ) {
269	63	100				117	$tokenizer->cookie( COOKIE_QUANT, undef )
270							or return 1;
271	50	100				348	$tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272							and return 1;
273	47					175	my $token = $tokenizer->make_token( 1 );
274	47					89	$token->{is_quantifier} = 1;
275	47					112	return $token;
276							}
277
278							# The parse rules are different inside a character class, so we set
279							# another cookie. Sigh. If your tool is a hammer ...
280	65	100				147	if ( $character eq '[' ) {
281
282							# Set our cookie. Since it always returns 1, it does not matter
283							# where in the following mess we set it.
284	53			226		234	$tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
	226					414
285
286							# Make our token now, since the easiest place to deal with the
287							# beginning-of-character-class strangeness seems to be right
288							# here.
289	53					130	my @tokens = $tokenizer->make_token( 1 );
290
291							# Get the next character, returning tokens if there is none.
292	53	50				144	defined ( $character = $tokenizer->peek() )
293							or return @tokens;
294
295							# If we have a caret, it is a negation operator. Make its token
296							# and fetch the next character, returning if none.
297	53	100				150	if ( $character eq '^' ) {
298	5					13	push @tokens, $tokenizer->make_token(
299							1, 'PPIx::Regexp::Token::Operator' );
300	5	50				13	defined ( $character = $tokenizer->peek() )
301							or return @tokens;
302							}
303
304							# If we have a close square at this point, it is not the end of
305							# the class, but just a literal. Make its token.
306	53	100				115	$character eq ']'
307							and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308
309							# Return all tokens made.
310	53					145	return @tokens;
311							}
312							# per perlop, the metas inside a [] are -]\^$.
313							# per perlop, the metas outside a [] are {}[]()^$.\|*+?\
314							# The difference is that {}[().\|*+? are not metas in [], but - is.
315
316							# Close bracket is complicated by the addition of regex sets.
317							# And more complicated by the fact that you can have an
318							# old-style character class inside a regex set. Fortunately they
319							# have not (yet!) permitted nested regex sets.
320	12	50				30	if ( $character eq ']' ) {
321
322							# If we find '])' and COOKIE_REGEX_SET is present, we have a
323							# regex set. We need to delete the cookie and accept both
324							# characters.
325	12	100	100			36	if ( ( my $accept = $tokenizer->find_regexp(
326							# help vim - ( [
327							qr{ \A []] [)] }smx
328							) )
329							&& $tokenizer->cookie( COOKIE_REGEX_SET )
330
331							) {
332	8					24	$tokenizer->cookie( COOKIE_REGEX_SET, undef );
333	8					31	return $accept;
334							}
335
336							# Otherwise we assume we're in a bracketed character class,
337							# delete the cookie, and accept the close bracket.
338	4					12	$tokenizer->cookie( COOKIE_CLASS, undef );
339	4					9	return 1;
340							}
341
342	0					0	return 1;
343							}
344
345							}
346
347							# Called by the lexer once it has done its worst to all the tokens.
348							# Called as a method with no arguments. The return is the number of
349							# parse failures discovered when finalizing.
350							sub __PPIX_LEXER__finalize {
351	1460			1460		1791	my ( $self ) = @_;
352	1460					1596	delete $self->{is_quantifier};
353	1460					2105	return 0;
354							}
355
356							1;
357
358							__END__