File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm
Criterion Covered Total %
statement 107 108 99.0
branch 47 54 87.0
condition 5 5 100.0
subroutine 18 18 100.0
pod 3 3 100.0
total 180 188 95.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Structure - Represent structural elements.
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C is the parent of
17             L.
18              
19             =head1 DESCRIPTION
20              
21             This class represents things that define the structure of the regular
22             expression. This typically means brackets of various sorts, but to
23             prevent proliferation of token classes the type of the regular
24             expression is stored here.
25              
26             =head1 METHODS
27              
28             This class provides no public methods beyond those provided by its
29             superclass.
30              
31             =cut
32              
33             package PPIx::Regexp::Token::Structure;
34              
35 9     9   67 use strict;
  9         12  
  9         245  
36 9     9   28 use warnings;
  9         13  
  9         329  
37              
38 9     9   30 use base qw{ PPIx::Regexp::Token };
  9         13  
  9         720  
39              
40 9         1055 use PPIx::Regexp::Constant qw{
41             COOKIE_CLASS
42             COOKIE_QUANT
43             COOKIE_REGEX_SET
44             MINIMUM_PERL
45             TOKEN_LITERAL
46             @CARP_NOT
47 9     9   39 };
  9         14  
48              
49             # Tokens we are responsible for making, under at least some
50             # circumstances.
51 9     9   54 use PPIx::Regexp::Token::Comment ();
  9         15  
  9         139  
52 9     9   4098 use PPIx::Regexp::Token::Modifier ();
  9         25  
  9         190  
53 9     9   49 use PPIx::Regexp::Token::Backreference ();
  9         13  
  9         92  
54 9     9   26 use PPIx::Regexp::Token::Backtrack ();
  9         12  
  9         77  
55 9     9   3271 use PPIx::Regexp::Token::Recursion ();
  9         24  
  9         8487  
56              
57             our $VERSION = '0.092';
58              
59             # Return true if the token can be quantified, and false otherwise
60              
61             my %quant = map { $_ => 1 } ')', ']';
62             sub can_be_quantified {
63 524     524 1 829 my ( $self ) = @_;
64 524 50       960 ref $self or return;
65 524         1203 return $quant{ $self->content() };
66             };
67              
68             {
69              
70             my %explanation = (
71             '' => 'Match regexp',
72             '(' => 'Capture or grouping',
73             '(?[' => 'Extended character class',
74             ')' => 'End capture or grouping',
75             '[' => 'Character class',
76             ']' => 'End character class',
77             '])' => 'End extended character class',
78             'm' => 'Match regexp',
79             'qr' => 'Regexp object definition',
80             's' => 'Replace regexp with string or expression',
81             '{' => 'Explicit quantifier',
82             '}' => 'End explicit quantifier',
83             );
84              
85             sub __explanation {
86 7     7   13 return \%explanation;
87             }
88              
89             }
90              
91             sub is_quantifier {
92 540     540 1 759 my ( $self ) = @_;
93 540 50       852 ref $self or return;
94 540         1389 return $self->{is_quantifier};
95             }
96              
97             {
98              
99             # Note that the implementation equivocates on the ::Token::Structure
100             # class, using it both for the initial token that determines the
101             # type of the regex and things like parentheses internal to the
102             # regex. Rather than sort out this equivocation, I have relied on
103             # the currently-true assumption that 'qr' will not satisfy the
104             # ::Token::Structure recognition logic, and the only way this class
105             # can acquire this content is by the brute-force approach used to
106             # generate the initial token object.
107              
108             my %perl_version_introduced = (
109             qr => '5.005',
110             '(?[' => '5.017008',
111             );
112              
113             sub perl_version_introduced {
114 222     222 1 3156 my ( $self ) = @_;
115             return $self->{perl_version_introduced} ||
116 222   100     826 $perl_version_introduced{ $self->content() } ||
117             MINIMUM_PERL;
118             }
119             }
120            
121             {
122              
123             my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124              
125             # Regular expressions to match various parenthesized tokens, and the
126             # classes to make them into.
127              
128             my @paren_token = map {
129             [ $_ => $_->__PPIX_TOKEN__recognize() ]
130             }
131             'PPIx::Regexp::Token::Comment',
132             'PPIx::Regexp::Token::Modifier',
133             'PPIx::Regexp::Token::Backreference',
134             'PPIx::Regexp::Token::Backtrack',
135             'PPIx::Regexp::Token::Recursion',
136             ;
137              
138             sub __PPIX_TOKENIZER__regexp {
139 1046     1046   1483 my ( undef, $tokenizer, $character ) = @_;
140              
141             # We are not interested in anything but delimiters.
142 1046 100       2366 $delim{$character} or return;
143              
144             # Inside a character class, all the delimiters are normal characters
145             # except for the close square bracket.
146 893 100       1392 if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147 59 100       117 $character eq ']'
148             or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149 53         119 $tokenizer->cookie( COOKIE_CLASS, undef );
150 53         190 return 1;
151             }
152              
153             # Open parentheses have various interesting possibilities ...
154 834 100       1495 if ( $character eq '(' ) {
155              
156             # Sometimes the whole bunch of parenthesized characters seems
157             # naturally to be a token.
158 347         652 foreach ( @paren_token ) {
159 1659         1751 my ( $class, @recognize ) = @{ $_ };
  1659         2681  
160 1659         1929 foreach ( @recognize ) {
161 2595         2538 my ( $regexp, $arg ) = @{ $_ };
  2595         3155  
162 2595 100       3648 my $accept = $tokenizer->find_regexp( $regexp ) or next;
163 55         169 return $tokenizer->make_token( $accept, $class, $arg );
164             }
165             }
166              
167             # Modifier changes are local to this parenthesis group
168 292         777 $tokenizer->modifier_duplicate();
169              
170             # The regex-set functionality introduced with 5.17.8 is most
171             # conveniently handled by treating the initial '(?[' and
172             # final '])' as ::Structure tokens. Fortunately for us,
173             # perl5178delta documents that these may not have interior
174             # spaces.
175              
176 292 100       720 if ( my $accept = $tokenizer->find_regexp(
177             qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178             )
179             ) {
180 8     105   46 $tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
  105         175  
181 8         30 $tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182 8         24 return $accept;
183             }
184              
185             # We expect certain tokens only after a left paren.
186             $tokenizer->expect(
187 284         909 'PPIx::Regexp::Token::GroupType::Modifier',
188             'PPIx::Regexp::Token::GroupType::NamedCapture',
189             'PPIx::Regexp::Token::GroupType::Assertion',
190             'PPIx::Regexp::Token::GroupType::Code',
191             'PPIx::Regexp::Token::GroupType::BranchReset',
192             'PPIx::Regexp::Token::GroupType::Subexpression',
193             'PPIx::Regexp::Token::GroupType::Switch',
194             'PPIx::Regexp::Token::GroupType::Script_Run',
195             'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196             );
197              
198             # Accept the parenthesis.
199 284         671 return 1;
200             }
201              
202             # Close parentheses end modifier localization
203 487 100       869 if ( $character eq ')' ) {
204 288         738 $tokenizer->modifier_pop();
205 288         515 return 1;
206             }
207              
208             # Open curlys are complicated because they may or may not represent
209             # the beginning of a quantifier, depending on what comes before the
210             # close curly. So we set a cookie to monitor the token stream for
211             # interlopers. If all goes well, the right curly will find the
212             # cookie and know it is supposed to be a quantifier.
213 199 100       402 if ( $character eq '{' ) {
214              
215             # If the prior token can not be quantified, all this is
216             # unnecessary.
217 71 100       374 $tokenizer->prior_significant_token( 'can_be_quantified' )
218             or return 1;
219              
220             # We make our token now, before setting the cookie. Otherwise
221             # the cookie has to deal with this token.
222 61         130 my $token = $tokenizer->make_token( 1 );
223              
224             # A cookie for the next '}'.
225 61         110 my $commas = 0;
226 61         83 my $allow_digit = 1;
227             $tokenizer->cookie( COOKIE_QUANT, sub {
228 115     115   220 my ( $tokenizer, $token ) = @_;
229 115 50       213 $token or return 1;
230              
231             # Code for 5.33.6 and after.
232             # We allow {,...}, and we allow space inside and
233             # adjacent to the curlys, and around the comma if
234             # any. But not interior to the numbers.
235 115 100       292 if ( $token->isa( TOKEN_LITERAL ) ) {
236 105         253 my $character = $token->content();
237 105 100       244 if ( $character =~ m/ \A \s \z /smx ) {
238             # Digits only allowed if the prior
239             # significant was an open curly or a comma.
240 4         8 $allow_digit = $tokenizer->prior_significant_token(
241             'content' ) =~ m/ \A [{,] \z /smx; # }
242 4         10 return 1;
243             }
244 101 100       251 $character eq ','
245             and return( ! $commas++ );
246 68 50       151 $allow_digit
247             or return;
248 68         262 return $character =~ m/ \A [0-9] \z /smx;
249             }
250              
251             # Since we do not know what is in an interpolation, we
252             # trustingly accept it.
253 10 100       44 if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254             ) {
255 5         13 return 1;
256             }
257              
258 5         42 return;
259             },
260 61         369 );
261              
262 61         147 return $token;
263             }
264              
265             # The close curly bracket is a little complicated because if the
266             # cookie posted by the left curly bracket is still around, we are a
267             # quantifier, otherwise not.
268 128 100       280 if ( $character eq '}' ) {
269 63 100       117 $tokenizer->cookie( COOKIE_QUANT, undef )
270             or return 1;
271 50 100       348 $tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272             and return 1;
273 47         175 my $token = $tokenizer->make_token( 1 );
274 47         89 $token->{is_quantifier} = 1;
275 47         112 return $token;
276             }
277              
278             # The parse rules are different inside a character class, so we set
279             # another cookie. Sigh. If your tool is a hammer ...
280 65 100       147 if ( $character eq '[' ) {
281              
282             # Set our cookie. Since it always returns 1, it does not matter
283             # where in the following mess we set it.
284 53     226   234 $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
  226         414  
285              
286             # Make our token now, since the easiest place to deal with the
287             # beginning-of-character-class strangeness seems to be right
288             # here.
289 53         130 my @tokens = $tokenizer->make_token( 1 );
290              
291             # Get the next character, returning tokens if there is none.
292 53 50       144 defined ( $character = $tokenizer->peek() )
293             or return @tokens;
294              
295             # If we have a caret, it is a negation operator. Make its token
296             # and fetch the next character, returning if none.
297 53 100       150 if ( $character eq '^' ) {
298 5         13 push @tokens, $tokenizer->make_token(
299             1, 'PPIx::Regexp::Token::Operator' );
300 5 50       13 defined ( $character = $tokenizer->peek() )
301             or return @tokens;
302             }
303              
304             # If we have a close square at this point, it is not the end of
305             # the class, but just a literal. Make its token.
306 53 100       115 $character eq ']'
307             and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308              
309             # Return all tokens made.
310 53         145 return @tokens;
311             }
312             # per perlop, the metas inside a [] are -]\^$.
313             # per perlop, the metas outside a [] are {}[]()^$.|*+?\
314             # The difference is that {}[().|*+? are not metas in [], but - is.
315              
316             # Close bracket is complicated by the addition of regex sets.
317             # And more complicated by the fact that you can have an
318             # old-style character class inside a regex set. Fortunately they
319             # have not (yet!) permitted nested regex sets.
320 12 50       30 if ( $character eq ']' ) {
321              
322             # If we find '])' and COOKIE_REGEX_SET is present, we have a
323             # regex set. We need to delete the cookie and accept both
324             # characters.
325 12 100 100     36 if ( ( my $accept = $tokenizer->find_regexp(
326             # help vim - ( [
327             qr{ \A []] [)] }smx
328             ) )
329             && $tokenizer->cookie( COOKIE_REGEX_SET )
330              
331             ) {
332 8         24 $tokenizer->cookie( COOKIE_REGEX_SET, undef );
333 8         31 return $accept;
334             }
335              
336             # Otherwise we assume we're in a bracketed character class,
337             # delete the cookie, and accept the close bracket.
338 4         12 $tokenizer->cookie( COOKIE_CLASS, undef );
339 4         9 return 1;
340             }
341              
342 0         0 return 1;
343             }
344              
345             }
346              
347             # Called by the lexer once it has done its worst to all the tokens.
348             # Called as a method with no arguments. The return is the number of
349             # parse failures discovered when finalizing.
350             sub __PPIX_LEXER__finalize {
351 1460     1460   1791 my ( $self ) = @_;
352 1460         1596 delete $self->{is_quantifier};
353 1460         2105 return 0;
354             }
355              
356             1;
357              
358             __END__