File Coverage

blib/lib/PPIx/Regexp/Token/Structure.pm
Criterion Covered Total %
statement 107 108 99.0
branch 47 54 87.0
condition 5 5 100.0
subroutine 18 18 100.0
pod 3 3 100.0
total 180 188 95.7


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             PPIx::Regexp::Token::Structure - Represent structural elements.
4              
5             =head1 SYNOPSIS
6              
7             use PPIx::Regexp::Dumper;
8             PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' )
9             ->print();
10              
11             =head1 INHERITANCE
12              
13             C is a
14             L.
15              
16             C is the parent of
17             L.
18              
19             =head1 DESCRIPTION
20              
21             This class represents things that define the structure of the regular
22             expression. This typically means brackets of various sorts, but to
23             prevent proliferation of token classes the type of the regular
24             expression is stored here.
25              
26             =head1 METHODS
27              
28             This class provides no public methods beyond those provided by its
29             superclass.
30              
31             =cut
32              
33             package PPIx::Regexp::Token::Structure;
34              
35 9     9   58 use strict;
  9         13  
  9         231  
36 9     9   28 use warnings;
  9         13  
  9         366  
37              
38 9     9   38 use base qw{ PPIx::Regexp::Token };
  9         11  
  9         786  
39              
40 9         1069 use PPIx::Regexp::Constant qw{
41             COOKIE_CLASS
42             COOKIE_QUANT
43             COOKIE_REGEX_SET
44             MINIMUM_PERL
45             TOKEN_LITERAL
46             @CARP_NOT
47 9     9   38 };
  9         10  
48              
49             # Tokens we are responsible for making, under at least some
50             # circumstances.
51 9     9   48 use PPIx::Regexp::Token::Comment ();
  9         13  
  9         132  
52 9     9   4146 use PPIx::Regexp::Token::Modifier ();
  9         21  
  9         200  
53 9     9   74 use PPIx::Regexp::Token::Backreference ();
  9         12  
  9         99  
54 9     9   28 use PPIx::Regexp::Token::Backtrack ();
  9         13  
  9         77  
55 9     9   3410 use PPIx::Regexp::Token::Recursion ();
  9         21  
  9         9026  
56              
57             our $VERSION = '0.091_01';
58              
59             # Return true if the token can be quantified, and false otherwise
60              
61             my %quant = map { $_ => 1 } ')', ']';
62             sub can_be_quantified {
63 524     524 1 750 my ( $self ) = @_;
64 524 50       964 ref $self or return;
65 524         1300 return $quant{ $self->content() };
66             };
67              
68             {
69              
70             my %explanation = (
71             '' => 'Match regexp',
72             '(' => 'Capture or grouping',
73             '(?[' => 'Extended character class',
74             ')' => 'End capture or grouping',
75             '[' => 'Character class',
76             ']' => 'End character class',
77             '])' => 'End extended character class',
78             'm' => 'Match regexp',
79             'qr' => 'Regexp object definition',
80             's' => 'Replace regexp with string or expression',
81             '{' => 'Explicit quantifier',
82             '}' => 'End explicit quantifier',
83             );
84              
85             sub __explanation {
86 7     7   15 return \%explanation;
87             }
88              
89             }
90              
91             sub is_quantifier {
92 540     540 1 849 my ( $self ) = @_;
93 540 50       901 ref $self or return;
94 540         1535 return $self->{is_quantifier};
95             }
96              
97             {
98              
99             # Note that the implementation equivocates on the ::Token::Structure
100             # class, using it both for the initial token that determines the
101             # type of the regex and things like parentheses internal to the
102             # regex. Rather than sort out this equivocation, I have relied on
103             # the currently-true assumption that 'qr' will not satisfy the
104             # ::Token::Structure recognition logic, and the only way this class
105             # can acquire this content is by the brute-force approach used to
106             # generate the initial token object.
107              
108             my %perl_version_introduced = (
109             qr => '5.005',
110             '(?[' => '5.017008',
111             );
112              
113             sub perl_version_introduced {
114 222     222 1 3379 my ( $self ) = @_;
115             return $self->{perl_version_introduced} ||
116 222   100     783 $perl_version_introduced{ $self->content() } ||
117             MINIMUM_PERL;
118             }
119             }
120            
121             {
122              
123             my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /;
124              
125             # Regular expressions to match various parenthesized tokens, and the
126             # classes to make them into.
127              
128             my @paren_token = map {
129             [ $_ => $_->__PPIX_TOKEN__recognize() ]
130             }
131             'PPIx::Regexp::Token::Comment',
132             'PPIx::Regexp::Token::Modifier',
133             'PPIx::Regexp::Token::Backreference',
134             'PPIx::Regexp::Token::Backtrack',
135             'PPIx::Regexp::Token::Recursion',
136             ;
137              
138             sub __PPIX_TOKENIZER__regexp {
139 1046     1046   1509 my ( undef, $tokenizer, $character ) = @_;
140              
141             # We are not interested in anything but delimiters.
142 1046 100       2146 $delim{$character} or return;
143              
144             # Inside a character class, all the delimiters are normal characters
145             # except for the close square bracket.
146 893 100       1472 if ( $tokenizer->cookie( COOKIE_CLASS ) ) {
147 59 100       144 $character eq ']'
148             or return $tokenizer->make_token( 1, TOKEN_LITERAL );
149 53         130 $tokenizer->cookie( COOKIE_CLASS, undef );
150 53         221 return 1;
151             }
152              
153             # Open parentheses have various interesting possibilities ...
154 834 100       1512 if ( $character eq '(' ) {
155              
156             # Sometimes the whole bunch of parenthesized characters seems
157             # naturally to be a token.
158 347         643 foreach ( @paren_token ) {
159 1659         1769 my ( $class, @recognize ) = @{ $_ };
  1659         2824  
160 1659         1990 foreach ( @recognize ) {
161 2595         2610 my ( $regexp, $arg ) = @{ $_ };
  2595         3256  
162 2595 100       3758 my $accept = $tokenizer->find_regexp( $regexp ) or next;
163 55         159 return $tokenizer->make_token( $accept, $class, $arg );
164             }
165             }
166              
167             # Modifier changes are local to this parenthesis group
168 292         810 $tokenizer->modifier_duplicate();
169              
170             # The regex-set functionality introduced with 5.17.8 is most
171             # conveniently handled by treating the initial '(?[' and
172             # final '])' as ::Structure tokens. Fortunately for us,
173             # perl5178delta documents that these may not have interior
174             # spaces.
175              
176 292 100       776 if ( my $accept = $tokenizer->find_regexp(
177             qr{ \A [(] [?] [[] }smx # ] ) - help for vim
178             )
179             ) {
180 8     105   38 $tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
  105         206  
181 8         26 $tokenizer->modifier_modify( x => 1 ); # Implicitly /x
182 8         20 return $accept;
183             }
184              
185             # We expect certain tokens only after a left paren.
186             $tokenizer->expect(
187 284         950 'PPIx::Regexp::Token::GroupType::Modifier',
188             'PPIx::Regexp::Token::GroupType::NamedCapture',
189             'PPIx::Regexp::Token::GroupType::Assertion',
190             'PPIx::Regexp::Token::GroupType::Code',
191             'PPIx::Regexp::Token::GroupType::BranchReset',
192             'PPIx::Regexp::Token::GroupType::Subexpression',
193             'PPIx::Regexp::Token::GroupType::Switch',
194             'PPIx::Regexp::Token::GroupType::Script_Run',
195             'PPIx::Regexp::Token::GroupType::Atomic_Script_Run',
196             );
197              
198             # Accept the parenthesis.
199 284         587 return 1;
200             }
201              
202             # Close parentheses end modifier localization
203 487 100       920 if ( $character eq ')' ) {
204 288         830 $tokenizer->modifier_pop();
205 288         492 return 1;
206             }
207              
208             # Open curlys are complicated because they may or may not represent
209             # the beginning of a quantifier, depending on what comes before the
210             # close curly. So we set a cookie to monitor the token stream for
211             # interlopers. If all goes well, the right curly will find the
212             # cookie and know it is supposed to be a quantifier.
213 199 100       662 if ( $character eq '{' ) {
214              
215             # If the prior token can not be quantified, all this is
216             # unnecessary.
217 71 100       138 $tokenizer->prior_significant_token( 'can_be_quantified' )
218             or return 1;
219              
220             # We make our token now, before setting the cookie. Otherwise
221             # the cookie has to deal with this token.
222 61         128 my $token = $tokenizer->make_token( 1 );
223              
224             # A cookie for the next '}'.
225 61         80 my $commas = 0;
226 61         76 my $allow_digit = 1;
227             $tokenizer->cookie( COOKIE_QUANT, sub {
228 115     115   200 my ( $tokenizer, $token ) = @_;
229 115 50       204 $token or return 1;
230              
231             # Code for 5.33.6 and after.
232             # We allow {,...}, and we allow space inside and
233             # adjacent to the curlys, and around the comma if
234             # any. But not interior to the numbers.
235 115 100       268 if ( $token->isa( TOKEN_LITERAL ) ) {
236 105         197 my $character = $token->content();
237 105 100       253 if ( $character =~ m/ \A \s \z /smx ) {
238             # Digits only allowed if the prior
239             # significant was an open curly or a comma.
240 4         8 $allow_digit = $tokenizer->prior_significant_token(
241             'content' ) =~ m/ \A [{,] \z /smx; # }
242 4         11 return 1;
243             }
244 101 100       240 $character eq ','
245             and return( ! $commas++ );
246 68 50       201 $allow_digit
247             or return;
248 68         267 return $character =~ m/ \A [0-9] \z /smx;
249             }
250              
251             # Since we do not know what is in an interpolation, we
252             # trustingly accept it.
253 10 100       95 if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' )
254             ) {
255 5         12 return 1;
256             }
257              
258 5         49 return;
259             },
260 61         379 );
261              
262 61         157 return $token;
263             }
264              
265             # The close curly bracket is a little complicated because if the
266             # cookie posted by the left curly bracket is still around, we are a
267             # quantifier, otherwise not.
268 128 100       290 if ( $character eq '}' ) {
269 63 100       164 $tokenizer->cookie( COOKIE_QUANT, undef )
270             or return 1;
271 50 100       357 $tokenizer->prior_significant_token( 'class' )->isa( __PACKAGE__ )
272             and return 1;
273 47         116 my $token = $tokenizer->make_token( 1 );
274 47         86 $token->{is_quantifier} = 1;
275 47         95 return $token;
276             }
277              
278             # The parse rules are different inside a character class, so we set
279             # another cookie. Sigh. If your tool is a hammer ...
280 65 100       158 if ( $character eq '[' ) {
281              
282             # Set our cookie. Since it always returns 1, it does not matter
283             # where in the following mess we set it.
284 53     226   238 $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );
  226         443  
285              
286             # Make our token now, since the easiest place to deal with the
287             # beginning-of-character-class strangeness seems to be right
288             # here.
289 53         128 my @tokens = $tokenizer->make_token( 1 );
290              
291             # Get the next character, returning tokens if there is none.
292 53 50       161 defined ( $character = $tokenizer->peek() )
293             or return @tokens;
294              
295             # If we have a caret, it is a negation operator. Make its token
296             # and fetch the next character, returning if none.
297 53 100       122 if ( $character eq '^' ) {
298 5         14 push @tokens, $tokenizer->make_token(
299             1, 'PPIx::Regexp::Token::Operator' );
300 5 50       14 defined ( $character = $tokenizer->peek() )
301             or return @tokens;
302             }
303              
304             # If we have a close square at this point, it is not the end of
305             # the class, but just a literal. Make its token.
306 53 100       138 $character eq ']'
307             and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL );
308              
309             # Return all tokens made.
310 53         144 return @tokens;
311             }
312             # per perlop, the metas inside a [] are -]\^$.
313             # per perlop, the metas outside a [] are {}[]()^$.|*+?\
314             # The difference is that {}[().|*+? are not metas in [], but - is.
315              
316             # Close bracket is complicated by the addition of regex sets.
317             # And more complicated by the fact that you can have an
318             # old-style character class inside a regex set. Fortunately they
319             # have not (yet!) permitted nested regex sets.
320 12 50       34 if ( $character eq ']' ) {
321              
322             # If we find '])' and COOKIE_REGEX_SET is present, we have a
323             # regex set. We need to delete the cookie and accept both
324             # characters.
325 12 100 100     32 if ( ( my $accept = $tokenizer->find_regexp(
326             # help vim - ( [
327             qr{ \A []] [)] }smx
328             ) )
329             && $tokenizer->cookie( COOKIE_REGEX_SET )
330              
331             ) {
332 8         30 $tokenizer->cookie( COOKIE_REGEX_SET, undef );
333 8         33 return $accept;
334             }
335              
336             # Otherwise we assume we're in a bracketed character class,
337             # delete the cookie, and accept the close bracket.
338 4         11 $tokenizer->cookie( COOKIE_CLASS, undef );
339 4         8 return 1;
340             }
341              
342 0         0 return 1;
343             }
344              
345             }
346              
347             # Called by the lexer once it has done its worst to all the tokens.
348             # Called as a method with no arguments. The return is the number of
349             # parse failures discovered when finalizing.
350             sub __PPIX_LEXER__finalize {
351 1460     1460   1823 my ( $self ) = @_;
352 1460         1666 delete $self->{is_quantifier};
353 1460         2171 return 0;
354             }
355              
356             1;
357              
358             __END__