File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm
Criterion Covered Total %
statement 107 108 99.0
branch 47 54 87.0
condition 5 5 100.0
subroutine 18 18 100.0
pod 3 3 100.0
total 180 188 95.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Structure - Represent structural elements.
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C is the parent of
17             L.
18              
19             =head1 DESCRIPTION
20              
21             This class represents things that define the structure of the regular
22             expression. This typically means brackets of various sorts, but to
23             prevent proliferation of token classes the type of the regular
24             expression is stored here.
25              
26             =head1 METHODS
27              
28             This class provides no public methods beyond those provided by its
29             superclass.
30              
31             =cut
32              
33             package PPIx::Regexp::Token::Structure;
34              
35 9     9   81 use strict;
  9         17  
  9         272  
36 9     9   46 use warnings;
  9         19  
  9         263  
37              
38 9     9   47 use base qw{ PPIx::Regexp::Token };
  9         21  
  9         853  
39              
40 9         985 use PPIx::Regexp::Constant qw{
41             COOKIE_CLASS
42             COOKIE_QUANT
43             COOKIE_REGEX_SET
44             MINIMUM_PERL
45             TOKEN_LITERAL
46             @CARP_NOT
47 9     9   78 };
  9         22  
48              
49             # Tokens we are responsible for making, under at least some
50             # circumstances.
51 9     9   62 use PPIx::Regexp::Token::Comment ();
  9         18  
  9         283  
52 9     9   4883 use PPIx::Regexp::Token::Modifier ();
  9         26  
  9         227  
53 9     9   87 use PPIx::Regexp::Token::Backreference ();
  9         20  
  9         129  
54 9     9   46 use PPIx::Regexp::Token::Backtrack ();
  9         21  
  9         126  
55 9     9   5410 use PPIx::Regexp::Token::Recursion ();
  9         29  
  9         11296  
56              
57             our $VERSION = '0.087';
58              
59             # Return true if the token can be quantified, and false otherwise
60              
61             my %quant = map { $_ => 1 } ')', ']';
62             sub can_be_quantified {
63 524     524 1 1180 my ( $self ) = @_;
64 524 50       1391 ref $self or return;
65 524         1811 return $quant{ $self->content() };
66             };
67              
68             {
69              
70             my %explanation = (
71             '' => 'Match regexp',
72             '(' => 'Capture or grouping',
73             '(?[' => 'Extended character class',
74             ')' => 'End capture or grouping',
75             '[' => 'Character class',
76             ']' => 'End character class',
77             '])' => 'End extended character class',
78             'm' => 'Match regexp',
79             'qr' => 'Regexp object definition',
80             's' => 'Replace regexp with string or expression',
81             '{' => 'Explicit quantifier',
82             '}' => 'End explicit quantifier',
83             );
84              
85             sub __explanation {
86 7     7   18 return \%explanation;
87             }
88              
89             }
90              
91             sub is_quantifier {
92 540     540 1 1766 my ( $self ) = @_;
93 540 50       1503 ref $self or return;
94 540         2196 return $self->{is_quantifier};
95             }
96              
97             {
98              
99             # Note that the implementation equivocates on the ::Token::Structure
100             # class, using it both for the initial token that determines the
101             # type of the regex and things like parentheses internal to the
102             # regex. Rather than sort out this equivocation, I have relied on
103             # the currently-true assumption that 'qr' will not satisfy the
104             # ::Token::Structure recognition logic, and the only way this class
105             # can acquire this content is by the brute-force approach used to
106             # generate the initial token object.
107              
108             my %perl_version_introduced = (
109             qr => '5.005',
110             '(?[' => '5.017008',
111             );
112              
113             sub perl_version_introduced {
114 222     222 1 3878 my ( $self ) = @_;
115             return $self->{perl_version_introduced} ||
116 222   100     935 $perl_version_introduced{ $self->content() } ||
117             MINIMUM_PERL;
118             }
119             }
120            
121             {
122              
123             my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124              
125             # Regular expressions to match various parenthesized tokens, and the
126             # classes to make them into.
127              
128             my @paren_token = map {
129             [ $_ => $_->__PPIX_TOKEN__recognize() ]
130             }
131             'PPIx::Regexp::Token::Comment',
132             'PPIx::Regexp::Token::Modifier',
133             'PPIx::Regexp::Token::Backreference',
134             'PPIx::Regexp::Token::Backtrack',
135             'PPIx::Regexp::Token::Recursion',
136             ;
137              
138             sub __PPIX_TOKENIZER__regexp {
139 1046     1046   2510 my ( undef, $tokenizer, $character ) = @_;
140              
141             # We are not interested in anything but delimiters.
142 1046 100       3318 $delim{$character} or return;
143              
144             # Inside a character class, all the delimiters are normal characters
145             # except for the close square bracket.
146 893 100       2459 if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147 59 100       322 $character eq ']'
148             or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149 53         214 $tokenizer->cookie( COOKIE_CLASS, undef );
150 53         283 return 1;
151             }
152              
153             # Open parentheses have various interesting possibilities ...
154 834 100       2614 if ( $character eq '(' ) {
155              
156             # Sometimes the whole bunch of parenthesized characters seems
157             # naturally to be a token.
158 347         931 foreach ( @paren_token ) {
159 1659         2604 my ( $class, @recognize ) = @{ $_ };
  1659         4300  
160 1659         2855 foreach ( @recognize ) {
161 2595         3582 my ( $regexp, $arg ) = @{ $_ };
  2595         4904  
162 2595 100       5291 my $accept = $tokenizer->find_regexp( $regexp ) or next;
163 55         279 return $tokenizer->make_token( $accept, $class, $arg );
164             }
165             }
166              
167             # Modifier changes are local to this parenthesis group
168 292         1307 $tokenizer->modifier_duplicate();
169              
170             # The regex-set functionality introduced with 5.17.8 is most
171             # conveniently handled by treating the initial '(?[' and
172             # final '])' as ::Structure tokens. Fortunately for us,
173             # perl5178delta documents that these may not have interior
174             # spaces.
175              
176 292 100       1348 if ( my $accept = $tokenizer->find_regexp(
177             qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178             )
179             ) {
180 8     105   77 $tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
  105         310  
181 8         63 $tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182 8         89 return $accept;
183             }
184              
185             # We expect certain tokens only after a left paren.
186             $tokenizer->expect(
187 284         1730 'PPIx::Regexp::Token::GroupType::Modifier',
188             'PPIx::Regexp::Token::GroupType::NamedCapture',
189             'PPIx::Regexp::Token::GroupType::Assertion',
190             'PPIx::Regexp::Token::GroupType::Code',
191             'PPIx::Regexp::Token::GroupType::BranchReset',
192             'PPIx::Regexp::Token::GroupType::Subexpression',
193             'PPIx::Regexp::Token::GroupType::Switch',
194             'PPIx::Regexp::Token::GroupType::Script_Run',
195             'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196             );
197              
198             # Accept the parenthesis.
199 284         946 return 1;
200             }
201              
202             # Close parentheses end modifier localization
203 487 100       2111 if ( $character eq ')' ) {
204 288         1405 $tokenizer->modifier_pop();
205 288         794 return 1;
206             }
207              
208             # Open curlys are complicated because they may or may not represent
209             # the beginning of a quantifier, depending on what comes before the
210             # close curly. So we set a cookie to monitor the token stream for
211             # interlopers. If all goes well, the right curly will find the
212             # cookie and know it is supposed to be a quantifier.
213 199 100       573 if ( $character eq '{' ) {
214              
215             # If the prior token can not be quantified, all this is
216             # unnecessary.
217 71 100       218 $tokenizer->prior_significant_token( 'can_be_quantified' )
218             or return 1;
219              
220             # We make our token now, before setting the cookie. Otherwise
221             # the cookie has to deal with this token.
222 61         301 my $token = $tokenizer->make_token( 1 );
223              
224             # A cookie for the next '}'.
225 61         150 my $commas = 0;
226 61         142 my $allow_digit = 1;
227             $tokenizer->cookie( COOKIE_QUANT, sub {
228 115     115   400 my ( $tokenizer, $token ) = @_;
229 115 50       319 $token or return 1;
230              
231             # Code for 5.33.6 and after.
232             # We allow {,...}, and we allow space inside and
233             # adjacent to the curlys, and around the comma if
234             # any. But not interior to the numbers.
235 115 100       440 if ( $token->isa( TOKEN_LITERAL ) ) {
236 105         270 my $character = $token->content();
237 105 100       424 if ( $character =~ m/ \A \s \z /smx ) {
238             # Digits only allowed if the prior
239             # significant was an open curly or a comma.
240 4         13 $allow_digit = $tokenizer->prior_significant_token(
241             'content' ) =~ m/ \A [{,] \z /smx; # }
242 4         15 return 1;
243             }
244 101 100       400 $character eq ','
245             and return( ! $commas++ );
246 68 50       206 $allow_digit
247             or return;
248 68         446 return $character =~ m/ \A [0-9] \z /smx;
249             }
250              
251             # Since we do not know what is in an interpolation, we
252             # trustingly accept it.
253 10 100       81 if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254             ) {
255 5         21 return 1;
256             }
257              
258 5         55 return;
259             },
260 61         582 );
261              
262 61         191 return $token;
263             }
264              
265             # The close curly bracket is a little complicated because if the
266             # cookie posted by the left curly bracket is still around, we are a
267             # quantifier, otherwise not.
268 128 100       552 if ( $character eq '}' ) {
269 63 100       215 $tokenizer->cookie( COOKIE_QUANT, undef )
270             or return 1;
271 50 100       449 $tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272             and return 1;
273 47         168 my $token = $tokenizer->make_token( 1 );
274 47         197 $token->{is_quantifier} = 1;
275 47         166 return $token;
276             }
277              
278             # The parse rules are different inside a character class, so we set
279             # another cookie. Sigh. If your tool is a hammer ...
280 65 100       257 if ( $character eq '[' ) {
281              
282             # Set our cookie. Since it always returns 1, it does not matter
283             # where in the following mess we set it.
284 53     226   385 $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
  226         700  
285              
286             # Make our token now, since the easiest place to deal with the
287             # beginning-of-character-class strangeness seems to be right
288             # here.
289 53         294 my @tokens = $tokenizer->make_token( 1 );
290              
291             # Get the next character, returning tokens if there is none.
292 53 50       338 defined ( $character = $tokenizer->peek() )
293             or return @tokens;
294              
295             # If we have a caret, it is a negation operator. Make its token
296             # and fetch the next character, returning if none.
297 53 100       215 if ( $character eq '^' ) {
298 5         21 push @tokens, $tokenizer->make_token(
299             1, 'PPIx::Regexp::Token::Operator' );
300 5 50       47 defined ( $character = $tokenizer->peek() )
301             or return @tokens;
302             }
303              
304             # If we have a close square at this point, it is not the end of
305             # the class, but just a literal. Make its token.
306 53 100       244 $character eq ']'
307             and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308              
309             # Return all tokens made.
310 53         219 return @tokens;
311             }
312             # per perlop, the metas inside a [] are -]\^$.
313             # per perlop, the metas outside a [] are {}[]()^$.|*+?\
314             # The difference is that {}[().|*+? are not metas in [], but - is.
315              
316             # Close bracket is complicated by the addition of regex sets.
317             # And more complicated by the fact that you can have an
318             # old-style character class inside a regex set. Fortunately they
319             # have not (yet!) permitted nested regex sets.
320 12 50       79 if ( $character eq ']' ) {
321              
322             # If we find '])' and COOKIE_REGEX_SET is present, we have a
323             # regex set. We need to delete the cookie and accept both
324             # characters.
325 12 100 100     64 if ( ( my $accept = $tokenizer->find_regexp(
326             # help vim - ( [
327             qr{ \A []] [)] }smx
328             ) )
329             && $tokenizer->cookie( COOKIE_REGEX_SET )
330              
331             ) {
332 8         76 $tokenizer->cookie( COOKIE_REGEX_SET, undef );
333 8         52 return $accept;
334             }
335              
336             # Otherwise we assume we're in a bracketed character class,
337             # delete the cookie, and accept the close bracket.
338 4         20 $tokenizer->cookie( COOKIE_CLASS, undef );
339 4         10 return 1;
340             }
341              
342 0         0 return 1;
343             }
344              
345             }
346              
347             # Called by the lexer once it has done its worst to all the tokens.
348             # Called as a method with no arguments. The return is the number of
349             # parse failures discovered when finalizing.
350             sub __PPIX_LEXER__finalize {
351 1452     1452   2466 my ( $self ) = @_;
352 1452         2348 delete $self->{is_quantifier};
353 1452         3288 return 0;
354             }
355              
356             1;
357              
358             __END__