line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
=head1 NAME |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
PPIx::Regexp::Lexer - Assemble tokenizer output. |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 SYNOPSIS |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use PPIx::Regexp::Lexer; |
8
|
|
|
|
|
|
|
use PPIx::Regexp::Dumper; |
9
|
|
|
|
|
|
|
my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx'); |
10
|
|
|
|
|
|
|
my $dmp = PPIx::Regexp::Dumper->new( $lex ); |
11
|
|
|
|
|
|
|
$dmp->print(); |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 INHERITANCE |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
C<PPIx::Regexp::Lexer> is a |
16
|
|
|
|
|
|
|
L<PPIx::Regexp::Support|PPIx::Regexp::Support>. |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
C<PPIx::Regexp::Lexer> has no descendants. |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 DESCRIPTION |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
This class takes the token stream generated by |
23
|
|
|
|
|
|
|
L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> and generates the |
24
|
|
|
|
|
|
|
parse tree. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 METHODS |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This class provides the following public methods. Methods not documented |
29
|
|
|
|
|
|
|
here are private, and unsupported in the sense that the author reserves |
30
|
|
|
|
|
|
|
the right to change or remove them without notice. |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=cut |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
package PPIx::Regexp::Lexer; |
35
|
|
|
|
|
|
|
|
36
|
9
|
|
|
9
|
|
63
|
use strict; |
|
9
|
|
|
|
|
18
|
|
|
9
|
|
|
|
|
264
|
|
37
|
9
|
|
|
9
|
|
45
|
use warnings; |
|
9
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
283
|
|
38
|
|
|
|
|
|
|
|
39
|
9
|
|
|
9
|
|
49
|
use base qw{ PPIx::Regexp::Support }; |
|
9
|
|
|
|
|
17
|
|
|
9
|
|
|
|
|
3973
|
|
40
|
|
|
|
|
|
|
|
41
|
9
|
|
|
9
|
|
64
|
use Carp qw{ confess }; |
|
9
|
|
|
|
|
17
|
|
|
9
|
|
|
|
|
442
|
|
42
|
9
|
|
|
|
|
780
|
use PPIx::Regexp::Constant qw{ |
43
|
|
|
|
|
|
|
ARRAY_REF |
44
|
|
|
|
|
|
|
TOKEN_LITERAL |
45
|
|
|
|
|
|
|
TOKEN_UNKNOWN |
46
|
|
|
|
|
|
|
@CARP_NOT |
47
|
9
|
|
|
9
|
|
59
|
}; |
|
9
|
|
|
|
|
19
|
|
48
|
9
|
|
|
9
|
|
3933
|
use PPIx::Regexp::Node::Range (); |
|
9
|
|
|
|
|
25
|
|
|
9
|
|
|
|
|
176
|
|
49
|
9
|
|
|
9
|
|
3425
|
use PPIx::Regexp::Node::Unknown (); |
|
9
|
|
|
|
|
31
|
|
|
9
|
|
|
|
|
195
|
|
50
|
9
|
|
|
9
|
|
4150
|
use PPIx::Regexp::Structure (); |
|
9
|
|
|
|
|
29
|
|
|
9
|
|
|
|
|
192
|
|
51
|
9
|
|
|
9
|
|
3933
|
use PPIx::Regexp::Structure::Assertion (); |
|
9
|
|
|
|
|
23
|
|
|
9
|
|
|
|
|
182
|
|
52
|
9
|
|
|
9
|
|
3732
|
use PPIx::Regexp::Structure::Atomic_Script_Run (); |
|
9
|
|
|
|
|
28
|
|
|
9
|
|
|
|
|
184
|
|
53
|
9
|
|
|
9
|
|
3765
|
use PPIx::Regexp::Structure::BranchReset (); |
|
9
|
|
|
|
|
35
|
|
|
9
|
|
|
|
|
196
|
|
54
|
9
|
|
|
9
|
|
3787
|
use PPIx::Regexp::Structure::Code (); |
|
9
|
|
|
|
|
22
|
|
|
9
|
|
|
|
|
209
|
|
55
|
9
|
|
|
9
|
|
3718
|
use PPIx::Regexp::Structure::Capture (); |
|
9
|
|
|
|
|
137
|
|
|
9
|
|
|
|
|
191
|
|
56
|
9
|
|
|
9
|
|
3820
|
use PPIx::Regexp::Structure::CharClass (); |
|
9
|
|
|
|
|
26
|
|
|
9
|
|
|
|
|
167
|
|
57
|
9
|
|
|
9
|
|
3853
|
use PPIx::Regexp::Structure::Subexpression (); |
|
9
|
|
|
|
|
27
|
|
|
9
|
|
|
|
|
189
|
|
58
|
9
|
|
|
9
|
|
3823
|
use PPIx::Regexp::Structure::Main (); |
|
9
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
172
|
|
59
|
9
|
|
|
9
|
|
3728
|
use PPIx::Regexp::Structure::Modifier (); |
|
9
|
|
|
|
|
29
|
|
|
9
|
|
|
|
|
176
|
|
60
|
9
|
|
|
9
|
|
3802
|
use PPIx::Regexp::Structure::NamedCapture (); |
|
9
|
|
|
|
|
23
|
|
|
9
|
|
|
|
|
172
|
|
61
|
9
|
|
|
9
|
|
3823
|
use PPIx::Regexp::Structure::Quantifier (); |
|
9
|
|
|
|
|
27
|
|
|
9
|
|
|
|
|
181
|
|
62
|
9
|
|
|
9
|
|
3817
|
use PPIx::Regexp::Structure::Regexp (); |
|
9
|
|
|
|
|
27
|
|
|
9
|
|
|
|
|
168
|
|
63
|
9
|
|
|
9
|
|
3718
|
use PPIx::Regexp::Structure::RegexSet (); |
|
9
|
|
|
|
|
23
|
|
|
9
|
|
|
|
|
175
|
|
64
|
9
|
|
|
9
|
|
3776
|
use PPIx::Regexp::Structure::Replacement (); |
|
9
|
|
|
|
|
22
|
|
|
9
|
|
|
|
|
183
|
|
65
|
9
|
|
|
9
|
|
3818
|
use PPIx::Regexp::Structure::Script_Run (); |
|
9
|
|
|
|
|
24
|
|
|
9
|
|
|
|
|
173
|
|
66
|
9
|
|
|
9
|
|
3787
|
use PPIx::Regexp::Structure::Switch (); |
|
9
|
|
|
|
|
24
|
|
|
9
|
|
|
|
|
179
|
|
67
|
9
|
|
|
9
|
|
3636
|
use PPIx::Regexp::Structure::Unknown (); |
|
9
|
|
|
|
|
26
|
|
|
9
|
|
|
|
|
170
|
|
68
|
9
|
|
|
9
|
|
3786
|
use PPIx::Regexp::Token::Unmatched (); |
|
9
|
|
|
|
|
22
|
|
|
9
|
|
|
|
|
187
|
|
69
|
9
|
|
|
9
|
|
5310
|
use PPIx::Regexp::Tokenizer (); |
|
9
|
|
|
|
|
44
|
|
|
9
|
|
|
|
|
325
|
|
70
|
9
|
|
|
9
|
|
67
|
use PPIx::Regexp::Util qw{ __choose_tokenizer_class __instance }; |
|
9
|
|
|
|
|
23
|
|
|
9
|
|
|
|
|
19324
|
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
our $VERSION = '0.088'; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head2 new |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
This method instantiates the lexer. It takes as its argument either a |
77
|
|
|
|
|
|
|
L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> or the text to be |
78
|
|
|
|
|
|
|
parsed. In the latter case the tokenizer is instantiated from the text. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Any optional name/value pairs after the first argument are passed to the |
81
|
|
|
|
|
|
|
tokenizer, which interprets them or not as the case may be. |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=cut |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
{

    # Reason the most recent call to new() failed. It is shared between
    # new(), which sets it, and errstr(), which reports it.
    my $errstr;

    # Instantiate the lexer.
    #
    # Arguments:
    #   $class     - class name, or an object whose class is to be used.
    #   $tokenizer - a PPIx::Regexp::Tokenizer object, or the data from
    #                which a tokenizer is to be built.
    #   %args      - optional name/value pairs; these are handed on when
    #                a tokenizer has to be built, and 'strict' is kept
    #                for the strict() method.
    #
    # Returns the new lexer. On failure nothing is returned, and the
    # reason is available from errstr().
    sub new {
        my ( $class, $tokenizer, %args ) = @_;
        ref $class and $class = ref $class;

        # Forget any failure left over from an earlier call, so that
        # errstr() is undef after a successful instantiation rather
        # than reporting a stale error.
        $errstr = undef;

        unless ( __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) ) {

            # We were not handed a tokenizer, so we have to build one
            # out of whatever we were handed.
            my $tokenizer_class = __choose_tokenizer_class(
                $tokenizer, \%args )
                or do {
                    $errstr = 'Data not supported';
                    return;
                };
            $tokenizer = $tokenizer_class->new( $tokenizer, %args )
                or do {
                    $errstr = $tokenizer_class->errstr();
                    return;
                };
        }

        my $self = {
            deferred  => [],    # Tokens pushed back by _unget_token()
            failures  => 0,     # Number of parse failures found
            strict    => $args{strict},
            tokenizer => $tokenizer,
        };

        bless $self, $class;
        return $self;
    }

    # Return the reason the most recent call to new() failed, or undef
    # if that call succeeded.
    sub errstr {
        return $errstr;
    }

}
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=head2 errstr |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
This method returns the error string from the last attempt to |
127
|
|
|
|
|
|
|
instantiate a C<PPIx::Regexp::Lexer>. If the last attempt succeeded, the |
128
|
|
|
|
|
|
|
error will be C<undef>. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=cut |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Defined above |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 failures |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
print $lexer->failures(), " parse failures\n"; |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
This method returns the number of parse failures encountered. A |
139
|
|
|
|
|
|
|
parse failure is either a tokenization failure (see |
140
|
|
|
|
|
|
|
L<< PPIx::Regexp::Tokenizer->failures()|PPIx::Regexp::Tokenizer/failures >>) |
141
|
|
|
|
|
|
|
or a structural error. |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=cut |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Return the number of parse failures recorded in this object.
sub failures {
    my $self = shift;
    return $self->{failures};
}
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head2 lex |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
This method lexes the tokens in the text, and returns the lexed list of |
153
|
|
|
|
|
|
|
elements. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=cut |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Lex the token stream, and return the resulting list of elements.
# The failure count is reset to zero before lexing starts, and is
# updated as problems are found.
sub lex {
    my ( $self ) = @_;

    $self->{failures} = 0;
    my @content;

    # Collect everything that precedes the first delimiter. If the
    # tokens run out before a delimiter turns up, what we have so far is
    # all there is.
    my $kind;   # First PPIx::Regexp::Token::Structure encountered
    while ( 1 ) {
        my $token = $self->_get_token()
            or return $self->_finalize( @content );

        if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
            # The delimiter belongs to the structure that follows, so
            # it goes back where we found it.
            $self->_unget_token( $token );
            last;
        }

        if ( ! $kind && $token->isa( 'PPIx::Regexp::Token::Structure' ) ) {
            $kind = $token;
        }
        push @content, $token;
    }

    my ( $part_0_class, $part_1_class ) =
        $self->{tokenizer}->__part_classes();

    # The first delimited structure.
    my $part_0 = $self->_get_delimited( $part_0_class );
    push @content, $part_0;

    # A second class means there is a second delimited structure to
    # collect (that is, we are a substitution).
    if ( defined $part_1_class ) {

        # Anything insignificant between the two structures is simply
        # accepted.
        while ( my $token = $self->_get_token() ) {
            unless ( $token->significant() ) {
                push @content, $token;
                next;
            }
            $self->_unget_token( $token );
            last;
        }

        # Work out whether the second structure gets an opening
        # bracket of its own.
        my $expect_open_bracket = $self->close_bracket(
            $part_0->start( 0 ) ) || 0;

        push @content, $self->_get_delimited(
            $part_1_class, $expect_open_bracket );
    }

    # Whatever is left ought to be the modifiers, plus any trailing
    # white space.
    while ( my $token = $self->_get_token() ) {
        push @content, $token;
    }

    # Every element gets to finalize itself; any errors found while
    # doing so are added to the failure count.
    $self->_finalize( @content );

    # TODO this test is ugly. Alternatives considered: have the
    # tokenizer return an undef $part_0_class (but that complicates
    # lexing the structure itself); hang this logic on the tokenizer
    # (where it seems out of place); or hang it on
    # PPIx::Regexp::Structure::Regexp and ::Replacement. It also remains
    # to be worked out how to make \n backreferences come out as
    # literals; that may be a job best done by the tokenizer.
    if ( $part_0 && $part_0->can( 'max_capture_number' ) ) {

        # What the backreferences are allowed to refer to: the highest
        # capture group number, and the set of capture names.
        my $max_capture = $part_0->max_capture_number();
        my %known_name  = map { ( $_ => 1 ) } $part_0->capture_names();

        # Rebless each backreference as needed, counting any errors
        # reported in the process.
        my $backrefs = $part_0->find(
            'PPIx::Regexp::Token::Backreference' ) || [];
        foreach my $backref ( @{ $backrefs } ) {
            $self->{failures} += $backref->__PPIX_LEXER__rebless(
                capture_name => \%known_name,
                max_capture  => $max_capture,
            );
        }
    }

    return @content;
}
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=head2 strict |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
This method returns true or false based on the value of the C<'strict'> |
259
|
|
|
|
|
|
|
argument to C<new()>. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# Return the value of the 'strict' attribute stored in this object.
sub strict {
    my $self = shift;
    return $self->{strict};
}
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# Finalize the content array, updating the parse failures count as we |
269
|
|
|
|
|
|
|
# go. |
270
|
|
|
|
|
|
|
# Ask each element to finalize itself, adding whatever number each one
# returns to the parse failure count. The elements are returned, unless
# we were called in void context, in which case nothing is returned.
sub _finalize {
    my $self    = shift;
    my @content = @_;

    foreach my $element ( @content ) {
        my $errors = $element->__PPIX_LEXER__finalize( $self );
        $self->{failures} += $errors;
    }

    return unless defined wantarray;
    return @content;
}
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
{

    # The closing bracket that goes with each opening bracket.
    my %bracket = (
        '{'   => '}',
        '('   => ')',
        '['   => ']',
        '(?[' => '])',
    ##  '<' => '>',
    );

    # The method that recovers from a given opening bracket never
    # having been closed.
    my %unclosed = (
        '{' => '_recover_curly',
    );

    # Collect one delimited structure, and return the object made by
    # calling __new() on $class with the collected pieces.
    #
    # Arguments:
    #   $class               - class on which __new() is called.
    #   $expect_open_bracket - false if no opening delimiter is to be
    #                          read; defaults to true.
    #
    # Work in progress is kept on a stack, which is also stored in
    # $self->{_rslt} for the benefit of the node-building and recovery
    # methods. Each stack entry is an array reference holding the
    # closing bracket being waited for, then the token that opened the
    # entry, then the entry's contents. The bottom entry represents the
    # structure itself, and waits for ''.
    #
    # Nothing is returned if an opening delimiter was wanted but no
    # tokens remain.
    sub _get_delimited {
        my ( $self, $class, $expect_open_bracket ) = @_;
        defined $expect_open_bracket
            or $expect_open_bracket = 1;

        my @rslt;
        $self->{_rslt} = \@rslt;

        # Start the bottom stack entry, with the opening delimiter if
        # we are to have one and it is actually there.
        if ( $expect_open_bracket ) {
            my $token = $self->_get_token()
                or return;
            if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
                push @rslt, [ '', $token ];
            } else {
                push @rslt, [ '', undef ];
                $self->_unget_token( $token );
            }
        } else {
            push @rslt, [ '', undef ];
        }

        while ( my $token = $self->_get_token() ) {

            # The next delimiter ends the structure. It is left for
            # the code after the loop to pick up.
            if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) {
                $self->_unget_token( $token );
                last;
            }

            if ( ! $token->isa( 'PPIx::Regexp::Token::Structure' ) ) {

                # Anything but a bracket is just content.
                push @{ $rslt[-1] }, $token;

            } else {

                my $content = $token->content();

                if ( my $finish = $bracket{$content} ) {

                    # Open bracket: start a new stack entry.
                    push @rslt, [ $finish, $token ];

                } elsif ( $content eq $rslt[-1][0] ) {

                    # The close bracket we were waiting for.
                    $self->_make_node( $token );

                } elsif ( $content ne ')' ) {

                    # An unexpected close bracket other than a
                    # parenthesis becomes a literal.
                    TOKEN_LITERAL->__PPIX_ELEM__rebless( $token );
                    push @{ $rslt[-1] }, $token;

                } else {

                    # An unexpected close parenthesis. Look for a way
                    # to recover, but never on behalf of the bottom
                    # (enclosing delimiter) entry.
                    my $recover = @rslt > 1
                        ? $unclosed{ $rslt[-1][1]->content() }
                        : undef;

                    if ( $recover ) {
                        $self->$recover( $token );
                    } else {
                        # Unmatched close with no recovery.
                        $self->{failures}++;
                        PPIx::Regexp::Token::Unmatched->
                            __PPIX_ELEM__rebless( $token );
                        push @{ $rslt[-1] }, $token;
                    }
                }
            }

            # We have to hand-roll the Range object: inside a character
            # class, a '-' operator as next-to-last item means the last
            # three items form a range.
            if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' )
                && $rslt[-1][-2]->content() eq '-'
                && $rslt[-1][0] eq ']'
            ) {
                my @range = splice @{ $rslt[-1] }, -3;
                push @{ $rslt[-1] },
                    PPIx::Regexp::Node::Range->__new( @range );
            }
        }

        # Deal with anything that is still open, recovering where we
        # know how and otherwise making a node with no closing bracket.
        while ( @rslt > 1 ) {
            my $recover = $unclosed{ $rslt[-1][1]->content() };
            if ( $recover ) {
                $self->$recover();
            } else {
                $self->{failures}++;
                $self->_make_node( undef );
            }
        }

        @rslt == 1
            or confess "Missing data";

        # Drop the '' the bottom entry was waiting for, append the next
        # token (if there is one), and build the structure.
        my ( undef, @last ) = @{ pop @rslt };
        push @last, $self->_get_token();
        return $class->__new( @last );
    }

}
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
# $token = $self->_get_token(); |
394
|
|
|
|
|
|
|
# |
395
|
|
|
|
|
|
|
# This method returns the next token from the tokenizer. |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
# Return the next token. Tokens that have been pushed back come first,
# in order; after that we ask the tokenizer. Nothing is returned if the
# tokenizer has nothing (more) to give.
sub _get_token {
    my $self = shift;

    my $deferred = $self->{deferred};
    @{ $deferred }
        and return shift @{ $deferred };

    my $token = $self->{tokenizer}->next_token();
    return unless $token;
    return $token;
}
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
{

    # The method that builds the node for each kind of opening bracket.
    my %handler = (
        '('   => '_round',
        '['   => '_square',
        '{'   => '_curly',
        '(?[' => '_regex_set',
    );

    # Turn the top entry of the $self->{_rslt} stack into one or more
    # nodes, and append them to the entry beneath it.
    #
    # $token is the closing bracket; it may be undef. The first item of
    # the popped entry is discarded, and the rest, followed by $token,
    # is what the node is built from. If no handler is known for the
    # opening bracket, or the handler returns nothing, a generic
    # PPIx::Regexp::Structure is made.
    sub _make_node {
        my ( $self, $token ) = @_;

        my ( undef, @args ) = @{ pop @{ $self->{_rslt} } };
        push @args, $token;

        my $method = $handler{ $args[0]->content() };
        my @node   = $method ? $self->$method( \@args ) : ();
        @node
            or @node = PPIx::Regexp::Structure->__new( @args );

        push @{ $self->{_rslt}[-1] }, @node;
        return;
    }

}
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
# Called as $self->$method( ... ) in _make_node(), above |
435
|
|
|
|
|
|
|
# Build whatever a pair of curly brackets turns out to be. $args is a
# reference to an array holding the opening bracket, the contents, and
# finally the closing bracket, which may be undef. This is invoked by
# name from _make_node().
sub _curly {	## no critic (ProhibitUnusedPrivateSubroutines)
    my ( $self, $args ) = @_;

    my $close = $args->[-1];

    # With no right curly all we can make is a generic structure.
    # TODO maybe this should be something else?
    $close
        or return PPIx::Regexp::Structure->__new( @{ $args } );

    # If the tokenizer marked the right curly as a quantifier, the
    # whole thing is a quantifier structure.
    $close->is_quantifier()
        and return PPIx::Regexp::Structure::Quantifier->__new(
            @{ $args } );

    # Otherwise there is no structure here at all: both curlys become
    # literals, and the pieces are returned as they stand.
    foreach my $position ( 0, -1 ) {
        TOKEN_LITERAL->__PPIX_ELEM__rebless( $args->[$position] );
    }

    # Quantifiers may have gone unrecognized because we thought this
    # was a structure; try to get them back.
    $self->_recover_curly_quantifiers( $args );

    return @{ $args };
}
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# Recover from an unclosed left curly. |
467
|
|
|
|
|
|
|
# Called as $self->$recover( ... ) in _get_delimited, above |
468
|
|
|
|
|
|
|
# Handle a left curly bracket whose right curly never turned up.
#
# The top entry of the $self->{_rslt} stack (the one the curly opened)
# is removed, the curly is reblessed into something that is not a
# bracket, and the curly and what followed it are appended to the entry
# beneath.
#
# $token is the token that failed to close the curly; it is pushed back
# so it can be looked at again. No token is passed when the curly is
# still open once the enclosing structure has been fully read.
#
# Returns nothing. Dies with a stack trace if $self->{_rslt} is not in
# a usable state.
sub _recover_curly {	## no critic (ProhibitUnusedPrivateSubroutines)
    my ( $self, $token ) = @_;

    # Where we were when things went wrong, for the diagnostics below.
    # This is only evaluated on failure. $token is not always supplied,
    # so we must not assume we can ask it for its content.
    my $context = sub {
        return "parsing '", $self->{tokenizer}->content(), "' at ",
            defined $token ? $token->content() : 'end of structure';
    };

    # These checks are to try to trap things like RT 56864. They are
    # made before the stack is relied on, so that a bad stack produces
    # the diagnostic rather than an obscure dereferencing error.
    ARRAY_REF eq ref $self->{_rslt}
        or confess 'Programming error - $self->{_rslt} not array ref, ',
            $context->();

    # Get all the stuff we have accumulated for this curly, and lose
    # the right bracket we were waiting for, which we have already
    # failed to match.
    my @content = @{ pop @{ $self->{_rslt} } };
    shift @content;

    # There must be something left to receive the curly and its
    # contents. It turned out you could get here with nothing left on
    # things like 'm{)}', which was fixed by not calling the recovery
    # code if $self->{_rslt} contained only the enclosing delimiters.
    @{ $self->{_rslt} }
        or confess 'Programming error - $self->{_rslt} empty, ',
            $context->();

    # Rebless the left curly appropriately.
    if ( $self->{_rslt}[0][-1]->isa( 'PPIx::Regexp::Token::Assertion' )
        && q<\b> eq $self->{_rslt}[0][-1]->content() ) {
        # If following \b, it becomes an unknown.
        TOKEN_UNKNOWN->__PPIX_ELEM__rebless( $content[0],
            error => 'Unterminated bound type',
        );
    } else {
        # Otherwise it becomes a literal.
        TOKEN_LITERAL->__PPIX_ELEM__rebless( $content[0] );
    }

    # Try to recover possible quantifiers not recognized because we
    # thought this was a structure.
    $self->_recover_curly_quantifiers( \@content );

    # Shove the curly and its putative contents into whatever structure
    # we have going.
    push @{ $self->{_rslt}[-1] }, @content;

    # Shove the mismatched token back into the input so we can have
    # another crack at it.
    $token and $self->_unget_token( $token );

    return;
}
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# Given a reference to an array of elements that starts with a literal,
# rebless the element after it to a quantifier if it is an unknown
# token whose content could be a quantifier. If that was done, and the
# element after that is an unknown token whose content could be a
# greediness indicator, rebless that one as well. Returns nothing.
sub _recover_curly_quantifiers {
    my ( undef, $args ) = @_;	# Invocant unused

    # No literal followed by a plausible quantifier, no recovery.
    __instance( $args->[0], TOKEN_LITERAL )
        or return;
    __instance( $args->[1], TOKEN_UNKNOWN )
        or return;
    PPIx::Regexp::Token::Quantifier->could_be_quantifier(
        $args->[1]->content() )
        or return;

    PPIx::Regexp::Token::Quantifier->__PPIX_ELEM__rebless( $args->[1] );

    # Greediness is only looked for once we have a quantifier for it to
    # apply to.
    __instance( $args->[2], TOKEN_UNKNOWN )
        or return;
    PPIx::Regexp::Token::Greediness->could_be_greediness(
        $args->[2]->content() )
        or return;

    PPIx::Regexp::Token::Greediness->__PPIX_ELEM__rebless( $args->[2] );

    return;
}
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# Return 1 if any entry on the $self->{_rslt} stack is waiting for the
# '])' that closes an extended bracketed character class, and 0 if not.
# The stack is examined from the top down.
sub _in_regex_set {
    my $self = shift;

    my @stack = reverse @{ $self->{_rslt} };
    foreach my $frame ( @stack ) {
        return 1 if $frame->[0] eq '])';
    }

    return 0;
}
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
# Called as $self->$method( ... ) in _make_node(), above |
552
|
|
|
|
|
|
|
# Build the node for a pair of parentheses from the elements in the
# array that $args refers to. This is invoked by name from
# _make_node().
sub _round {	## no critic (ProhibitUnusedPrivateSubroutines)
    my ( $self, $args ) = @_;

    # Parentheses do not capture inside a regex set, nor when /n is
    # asserted; in either case we want a plain structure. Otherwise we
    # start with a capture, and the instantiator will rebless based on
    # the first token if need be.
    my $node_class =
        ( $self->_in_regex_set() || $self->{tokenizer}->modifier( 'n' ) )
        ? 'PPIx::Regexp::Structure'
        : 'PPIx::Regexp::Structure::Capture';

    return $node_class->__new( @{ $args } );
}
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
# Called as $self->$method( ... ) in _make_node(), above |
568
|
|
|
|
|
|
|
# Build a character class node from the elements in the array that the
# second argument refers to. The invocant is not used. This is invoked
# by name from _make_node().
sub _square {	## no critic (ProhibitUnusedPrivateSubroutines)
    my $args = $_[1];
    return PPIx::Regexp::Structure::CharClass->__new( @$args );
}
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# Called as $self->$method( ... ) in _make_node(), above |
574
|
|
|
|
|
|
|
# Build a regex set node from the elements in the array that the second
# argument refers to. The invocant is not used. This is invoked by name
# from _make_node().
sub _regex_set {	## no critic (ProhibitUnusedPrivateSubroutines)
    my $args = $_[1];
    return PPIx::Regexp::Structure::RegexSet->__new( @$args );
}
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
# $self->_unget_token( $token ); |
580
|
|
|
|
|
|
|
# |
581
|
|
|
|
|
|
|
# This method caches its argument so that it will be returned by |
582
|
|
|
|
|
|
|
# the next call to C<_get_token()>. If more than one argument is |
583
|
|
|
|
|
|
|
# passed, they will be returned in the order given; that is, |
584
|
|
|
|
|
|
|
# _unget_token/_get_token work like unshift/shift. |
585
|
|
|
|
|
|
|
|
586
|
|
|
|
|
|
|
# Push the given tokens back, so that subsequent calls to _get_token()
# return them, in the order given, before anything else. The invocant
# is returned.
sub _unget_token {
    my $self = shift;
    unshift @{ $self->{deferred} }, @_;
    return $self;
}
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
1; |
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
__END__ |