| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package PPIx::Regexp::Tokenizer; |
|
2
|
|
|
|
|
|
|
|
|
3
|
9
|
|
|
9
|
|
47
|
use strict; |
|
|
9
|
|
|
|
|
12
|
|
|
|
9
|
|
|
|
|
282
|
|
|
4
|
9
|
|
|
9
|
|
30
|
use warnings; |
|
|
9
|
|
|
|
|
10
|
|
|
|
9
|
|
|
|
|
345
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
9
|
|
|
9
|
|
33
|
use base qw{ PPIx::Regexp::Support }; |
|
|
9
|
|
|
|
|
12
|
|
|
|
9
|
|
|
|
|
2105
|
|
|
7
|
|
|
|
|
|
|
|
|
8
|
9
|
|
|
9
|
|
50
|
use Carp qw{ carp croak confess }; |
|
|
9
|
|
|
|
|
12
|
|
|
|
9
|
|
|
|
|
485
|
|
|
9
|
9
|
|
|
9
|
|
2079
|
use PPI::Document; |
|
|
9
|
|
|
|
|
789231
|
|
|
|
9
|
|
|
|
|
399
|
|
|
10
|
9
|
|
|
|
|
1224
|
use PPIx::Regexp::Constant qw{ |
|
11
|
|
|
|
|
|
|
ARRAY_REF |
|
12
|
|
|
|
|
|
|
CODE_REF |
|
13
|
|
|
|
|
|
|
HASH_REF |
|
14
|
|
|
|
|
|
|
LOCATION_LINE |
|
15
|
|
|
|
|
|
|
LOCATION_CHARACTER |
|
16
|
|
|
|
|
|
|
LOCATION_COLUMN |
|
17
|
|
|
|
|
|
|
LOCATION_LOGICAL_LINE |
|
18
|
|
|
|
|
|
|
MINIMUM_PERL |
|
19
|
|
|
|
|
|
|
REGEXP_REF |
|
20
|
|
|
|
|
|
|
TOKEN_LITERAL |
|
21
|
|
|
|
|
|
|
TOKEN_UNKNOWN |
|
22
|
|
|
|
|
|
|
@CARP_NOT |
|
23
|
9
|
|
|
9
|
|
82
|
}; |
|
|
9
|
|
|
|
|
12
|
|
|
24
|
9
|
|
|
9
|
|
4160
|
use PPIx::Regexp::Token::Assertion (); |
|
|
9
|
|
|
|
|
27
|
|
|
|
9
|
|
|
|
|
207
|
|
|
25
|
9
|
|
|
9
|
|
3849
|
use PPIx::Regexp::Token::Backreference (); |
|
|
9
|
|
|
|
|
35
|
|
|
|
9
|
|
|
|
|
265
|
|
|
26
|
9
|
|
|
9
|
|
3636
|
use PPIx::Regexp::Token::Backtrack (); |
|
|
9
|
|
|
|
|
25
|
|
|
|
9
|
|
|
|
|
188
|
|
|
27
|
9
|
|
|
9
|
|
3486
|
use PPIx::Regexp::Token::CharClass::POSIX (); |
|
|
9
|
|
|
|
|
23
|
|
|
|
9
|
|
|
|
|
175
|
|
|
28
|
9
|
|
|
9
|
|
3564
|
use PPIx::Regexp::Token::CharClass::POSIX::Unknown (); |
|
|
9
|
|
|
|
|
26
|
|
|
|
9
|
|
|
|
|
186
|
|
|
29
|
9
|
|
|
9
|
|
3827
|
use PPIx::Regexp::Token::CharClass::Simple (); |
|
|
9
|
|
|
|
|
26
|
|
|
|
9
|
|
|
|
|
185
|
|
|
30
|
9
|
|
|
9
|
|
3537
|
use PPIx::Regexp::Token::Code (); |
|
|
9
|
|
|
|
|
26
|
|
|
|
9
|
|
|
|
|
165
|
|
|
31
|
9
|
|
|
9
|
|
3766
|
use PPIx::Regexp::Token::Comment (); |
|
|
9
|
|
|
|
|
28
|
|
|
|
9
|
|
|
|
|
171
|
|
|
32
|
9
|
|
|
9
|
|
3898
|
use PPIx::Regexp::Token::Condition (); |
|
|
9
|
|
|
|
|
25
|
|
|
|
9
|
|
|
|
|
260
|
|
|
33
|
9
|
|
|
9
|
|
3489
|
use PPIx::Regexp::Token::Control (); |
|
|
9
|
|
|
|
|
24
|
|
|
|
9
|
|
|
|
|
204
|
|
|
34
|
9
|
|
|
9
|
|
3403
|
use PPIx::Regexp::Token::Delimiter (); |
|
|
9
|
|
|
|
|
26
|
|
|
|
9
|
|
|
|
|
196
|
|
|
35
|
9
|
|
|
9
|
|
3168
|
use PPIx::Regexp::Token::Greediness (); |
|
|
9
|
|
|
|
|
24
|
|
|
|
9
|
|
|
|
|
161
|
|
|
36
|
9
|
|
|
9
|
|
3427
|
use PPIx::Regexp::Token::GroupType::Assertion (); |
|
|
9
|
|
|
|
|
27
|
|
|
|
9
|
|
|
|
|
225
|
|
|
37
|
9
|
|
|
9
|
|
3569
|
use PPIx::Regexp::Token::GroupType::Atomic_Script_Run (); |
|
|
9
|
|
|
|
|
24
|
|
|
|
9
|
|
|
|
|
193
|
|
|
38
|
9
|
|
|
9
|
|
3421
|
use PPIx::Regexp::Token::GroupType::BranchReset (); |
|
|
9
|
|
|
|
|
23
|
|
|
|
9
|
|
|
|
|
190
|
|
|
39
|
9
|
|
|
9
|
|
3587
|
use PPIx::Regexp::Token::GroupType::Code (); |
|
|
9
|
|
|
|
|
22
|
|
|
|
9
|
|
|
|
|
191
|
|
|
40
|
9
|
|
|
9
|
|
3626
|
use PPIx::Regexp::Token::GroupType::Modifier (); |
|
|
9
|
|
|
|
|
22
|
|
|
|
9
|
|
|
|
|
182
|
|
|
41
|
9
|
|
|
9
|
|
3290
|
use PPIx::Regexp::Token::GroupType::NamedCapture (); |
|
|
9
|
|
|
|
|
25
|
|
|
|
9
|
|
|
|
|
167
|
|
|
42
|
9
|
|
|
9
|
|
3169
|
use PPIx::Regexp::Token::GroupType::Script_Run (); |
|
|
9
|
|
|
|
|
21
|
|
|
|
9
|
|
|
|
|
182
|
|
|
43
|
9
|
|
|
9
|
|
3493
|
use PPIx::Regexp::Token::GroupType::Subexpression (); |
|
|
9
|
|
|
|
|
20
|
|
|
|
9
|
|
|
|
|
179
|
|
|
44
|
9
|
|
|
9
|
|
3507
|
use PPIx::Regexp::Token::GroupType::Switch (); |
|
|
9
|
|
|
|
|
25
|
|
|
|
9
|
|
|
|
|
270
|
|
|
45
|
9
|
|
|
9
|
|
3817
|
use PPIx::Regexp::Token::Interpolation (); |
|
|
9
|
|
|
|
|
26
|
|
|
|
9
|
|
|
|
|
215
|
|
|
46
|
9
|
|
|
9
|
|
4159
|
use PPIx::Regexp::Token::Literal (); |
|
|
9
|
|
|
|
|
25
|
|
|
|
9
|
|
|
|
|
238
|
|
|
47
|
9
|
|
|
9
|
|
54
|
use PPIx::Regexp::Token::Modifier (); |
|
|
9
|
|
|
|
|
14
|
|
|
|
9
|
|
|
|
|
102
|
|
|
48
|
9
|
|
|
9
|
|
3765
|
use PPIx::Regexp::Token::Operator (); |
|
|
9
|
|
|
|
|
20
|
|
|
|
9
|
|
|
|
|
176
|
|
|
49
|
9
|
|
|
9
|
|
3524
|
use PPIx::Regexp::Token::Quantifier (); |
|
|
9
|
|
|
|
|
24
|
|
|
|
9
|
|
|
|
|
183
|
|
|
50
|
9
|
|
|
9
|
|
44
|
use PPIx::Regexp::Token::Recursion (); |
|
|
9
|
|
|
|
|
13
|
|
|
|
9
|
|
|
|
|
94
|
|
|
51
|
9
|
|
|
9
|
|
25
|
use PPIx::Regexp::Token::Structure (); |
|
|
9
|
|
|
|
|
11
|
|
|
|
9
|
|
|
|
|
85
|
|
|
52
|
9
|
|
|
9
|
|
3550
|
use PPIx::Regexp::Token::Unknown (); |
|
|
9
|
|
|
|
|
25
|
|
|
|
9
|
|
|
|
|
153
|
|
|
53
|
9
|
|
|
9
|
|
3390
|
use PPIx::Regexp::Token::Whitespace (); |
|
|
9
|
|
|
|
|
27
|
|
|
|
9
|
|
|
|
|
208
|
|
|
54
|
9
|
|
|
|
|
438
|
use PPIx::Regexp::Util qw{ |
|
55
|
|
|
|
|
|
|
is_ppi_regexp_element |
|
56
|
|
|
|
|
|
|
__instance |
|
57
|
9
|
|
|
9
|
|
40
|
}; |
|
|
9
|
|
|
|
|
15
|
|
|
58
|
|
|
|
|
|
|
|
|
59
|
9
|
|
|
9
|
|
38
|
use Scalar::Util qw{ looks_like_number }; |
|
|
9
|
|
|
|
|
13
|
|
|
|
9
|
|
|
|
|
48528
|
|
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
our $VERSION = '0.091'; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
our $DEFAULT_POSTDEREF; |
|
64
|
|
|
|
|
|
|
defined $DEFAULT_POSTDEREF |
|
65
|
|
|
|
|
|
|
or $DEFAULT_POSTDEREF = 1; |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
{ |
|
68
|
|
|
|
|
|
|
# Names of classes containing tokenization machinery. There are few |
|
69
|
|
|
|
|
|
|
# known ordering requirements, since each class recognizes its own, |
|
70
|
|
|
|
|
|
|
# and I have tried to prevent overlap. Absent such constraints, the |
|
71
|
|
|
|
|
|
|
# order is in perceived frequency of acceptance, to keep the search |
|
72
|
|
|
|
|
|
|
# as short as possible. If I were conscientious I would gather |
|
73
|
|
|
|
|
|
|
# statistics on this. |
|
74
|
|
|
|
|
|
|
my @classes = ( # TODO make readonly when acceptable way appears |
|
75
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Literal', |
|
76
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Interpolation', |
|
77
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Control', # Note 1 |
|
78
|
|
|
|
|
|
|
'PPIx::Regexp::Token::CharClass::Simple', # Note 2 |
|
79
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Quantifier', |
|
80
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Greediness', |
|
81
|
|
|
|
|
|
|
'PPIx::Regexp::Token::CharClass::POSIX', # Note 3 |
|
82
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Structure', |
|
83
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Assertion', |
|
84
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Backreference', |
|
85
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Operator', # Note 4 |
|
86
|
|
|
|
|
|
|
); |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# Note 1: If we are in quote mode ( \Q ... \E ), Control makes a |
|
89
|
|
|
|
|
|
|
# literal out of anything it sees other than \E. So it |
|
90
|
|
|
|
|
|
|
# needs to come before almost all other tokenizers. Not |
|
91
|
|
|
|
|
|
|
# Literal, which already makes literals, and not |
|
92
|
|
|
|
|
|
|
# Interpolation, which is legal in quote mode, but |
|
93
|
|
|
|
|
|
|
# everything else. |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# Note 2: CharClass::Simple must come after Literal, because it |
|
96
|
|
|
|
|
|
|
# relies on Literal to recognize a Unicode named character |
|
97
|
|
|
|
|
|
|
# ( \N{something} ), so any \N that comes through to it |
|
98
|
|
|
|
|
|
|
# must be the \N simple character class (which represents |
|
99
|
|
|
|
|
|
|
# anything but a newline, and was introduced in Perl |
|
100
|
|
|
|
|
|
|
# 5.11.0. |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
# Note 3: CharClass::POSIX has to come before Structure, since both |
|
103
|
|
|
|
|
|
|
# look for square brackets, and CharClass::POSIX is the |
|
104
|
|
|
|
|
|
|
# more particular. |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Note 4: Operator relies on Literal making the characters literal |
|
107
|
|
|
|
|
|
|
# if they appear in a context where they can not be |
|
108
|
|
|
|
|
|
|
# operators, and Control making them literals if quoting, |
|
109
|
|
|
|
|
|
|
# so it must come after both. |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# Return the declared tokenizer classes. |
|
112
|
|
|
|
|
|
|
sub __tokenizer_classes { |
|
113
|
543
|
|
|
543
|
|
2217
|
return @classes; |
|
114
|
|
|
|
|
|
|
} |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
} |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
{ |
|
119
|
|
|
|
|
|
|
my $errstr; |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub new { |
|
122
|
742
|
|
|
742
|
1
|
429994
|
my ( $class, $re, %args ) = @_; |
|
123
|
742
|
50
|
|
|
|
2056
|
ref $class and $class = ref $class; |
|
124
|
|
|
|
|
|
|
|
|
125
|
742
|
|
|
|
|
1281
|
$errstr = undef; |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
exists $args{default_modifiers} |
|
128
|
|
|
|
|
|
|
and ARRAY_REF ne ref $args{default_modifiers} |
|
129
|
742
|
50
|
66
|
|
|
2230
|
and do { |
|
130
|
0
|
|
|
|
|
0
|
$errstr = 'default_modifiers must be an array reference'; |
|
131
|
0
|
|
|
|
|
0
|
return; |
|
132
|
|
|
|
|
|
|
}; |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
my $self = { |
|
135
|
|
|
|
|
|
|
index_locations => $args{index_locations}, # Index locations |
|
136
|
|
|
|
|
|
|
capture => undef, # Captures from find_regexp. |
|
137
|
|
|
|
|
|
|
content => undef, # The string we are tokenizing. |
|
138
|
|
|
|
|
|
|
cookie => {}, # Cookies |
|
139
|
|
|
|
|
|
|
cursor_curr => 0, # The current position in the string. |
|
140
|
|
|
|
|
|
|
cursor_limit => undef, # The end of the portion of the |
|
141
|
|
|
|
|
|
|
# string being tokenized. |
|
142
|
|
|
|
|
|
|
cursor_orig => undef, # Position of cursor when tokenizer |
|
143
|
|
|
|
|
|
|
# called. Used by get_token to prevent |
|
144
|
|
|
|
|
|
|
# recursion. |
|
145
|
|
|
|
|
|
|
cursor_modifiers => undef, # Position of modifiers. |
|
146
|
|
|
|
|
|
|
default_modifiers => $args{default_modifiers} || [], |
|
147
|
|
|
|
|
|
|
delimiter_finish => undef, # Finishing delimiter of regexp. |
|
148
|
|
|
|
|
|
|
delimiter_start => undef, # Starting delimiter of regexp. |
|
149
|
|
|
|
|
|
|
encoding => $args{encoding}, # Character encoding. |
|
150
|
|
|
|
|
|
|
expect => undef, # Extra classes to expect. |
|
151
|
|
|
|
|
|
|
expect_next => undef, # Extra classes as of next parse cycle |
|
152
|
|
|
|
|
|
|
failures => 0, # Number of parse failures. |
|
153
|
|
|
|
|
|
|
find => undef, # String for find_regexp |
|
154
|
|
|
|
|
|
|
known => {}, # Known tokenizers, by mode. |
|
155
|
|
|
|
|
|
|
location => $args{location}, |
|
156
|
|
|
|
|
|
|
match => undef, # Match from find_regexp. |
|
157
|
|
|
|
|
|
|
mode => 'init', # Initialize |
|
158
|
|
|
|
|
|
|
modifiers => [{}], # Modifier hash. |
|
159
|
|
|
|
|
|
|
pending => [], # Tokens made but not returned. |
|
160
|
|
|
|
|
|
|
prior => TOKEN_UNKNOWN, # Prior significant token. |
|
161
|
|
|
|
|
|
|
source => $re, # The object we were initialized with. |
|
162
|
|
|
|
|
|
|
strict => $args{strict}, # like "use re 'strict';". |
|
163
|
|
|
|
|
|
|
trace => __PACKAGE__->__defined_or( |
|
164
|
742
|
|
100
|
|
|
9082
|
$args{trace}, $ENV{PPIX_REGEXP_TOKENIZER_TRACE}, 0 ), |
|
165
|
|
|
|
|
|
|
}; |
|
166
|
|
|
|
|
|
|
|
|
167
|
742
|
100
|
|
|
|
2696
|
if ( __instance( $re, 'PPI::Element' ) ) { |
|
|
|
100
|
|
|
|
|
|
|
168
|
11
|
50
|
|
|
|
34
|
is_ppi_regexp_element( $re ) |
|
169
|
|
|
|
|
|
|
or return __set_errstr( ref $re, 'not supported by', $class ); |
|
170
|
|
|
|
|
|
|
# TODO conditionalizstion on PPI class does not really |
|
171
|
|
|
|
|
|
|
# belong here, but at the moment I have no other idea of |
|
172
|
|
|
|
|
|
|
# where to put it. |
|
173
|
11
|
50
|
|
|
|
82
|
$self->{content} = $re->isa( 'PPI::Token::HereDoc' ) ? |
|
174
|
|
|
|
|
|
|
join( '', $re->content(), "\n", $re->heredoc(), |
|
175
|
|
|
|
|
|
|
$re->terminator(), "\n" ) : |
|
176
|
|
|
|
|
|
|
$re->content(); |
|
177
|
|
|
|
|
|
|
} elsif ( ref $re ) { |
|
178
|
2
|
|
|
|
|
5
|
return __set_errstr( ref $re, 'not supported' ); |
|
179
|
|
|
|
|
|
|
} else { |
|
180
|
729
|
|
|
|
|
1245
|
$self->{content} = $re; |
|
181
|
|
|
|
|
|
|
} |
|
182
|
|
|
|
|
|
|
|
|
183
|
740
|
|
|
|
|
1549
|
bless $self, $class; |
|
184
|
|
|
|
|
|
|
|
|
185
|
740
|
|
|
|
|
2271
|
$self->{content} = $self->decode( $self->{content} ); |
|
186
|
|
|
|
|
|
|
|
|
187
|
740
|
|
|
|
|
1556
|
$self->{cursor_limit} = length $self->{content}; |
|
188
|
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
$self->{trace} |
|
190
|
740
|
50
|
|
|
|
1758
|
and warn "\ntokenizing '$self->{content}'\n"; |
|
191
|
|
|
|
|
|
|
|
|
192
|
740
|
|
|
|
|
2132
|
return $self; |
|
193
|
|
|
|
|
|
|
} |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
sub __set_errstr { |
|
196
|
2
|
|
|
2
|
|
6
|
$errstr = join ' ', @_; |
|
197
|
2
|
|
|
|
|
12
|
return; |
|
198
|
|
|
|
|
|
|
} |
|
199
|
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
sub errstr { |
|
201
|
2
|
|
|
2
|
1
|
5
|
return $errstr; |
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
} |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub capture { |
|
207
|
715
|
|
|
715
|
1
|
1181
|
my ( $self ) = @_; |
|
208
|
715
|
100
|
|
|
|
1433
|
$self->{capture} or return; |
|
209
|
694
|
50
|
|
|
|
1360
|
defined wantarray or return; |
|
210
|
694
|
50
|
|
|
|
1200
|
return wantarray ? @{ $self->{capture} } : $self->{capture}; |
|
|
694
|
|
|
|
|
2681
|
|
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub content { |
|
214
|
1
|
|
|
1
|
1
|
2
|
my ( $self ) = @_; |
|
215
|
1
|
|
|
|
|
4
|
return $self->{content}; |
|
216
|
|
|
|
|
|
|
} |
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
sub cookie { |
|
219
|
10182
|
|
|
10182
|
1
|
13695
|
my ( $self, $name, @args ) = @_; |
|
220
|
10182
|
50
|
|
|
|
13627
|
defined $name |
|
221
|
|
|
|
|
|
|
or confess "Programming error - undefined cookie name"; |
|
222
|
10182
|
50
|
|
|
|
13755
|
if ( $self->{trace} ) { |
|
223
|
0
|
|
|
|
|
0
|
local $" = ', '; |
|
224
|
0
|
|
|
|
|
0
|
warn "cookie( '$name', @args )\n"; |
|
225
|
|
|
|
|
|
|
} |
|
226
|
10182
|
100
|
|
|
|
25865
|
@args or return $self->{cookie}{$name}; |
|
227
|
721
|
|
|
|
|
1207
|
my $cookie = shift @args; |
|
228
|
721
|
100
|
|
|
|
1764
|
if ( CODE_REF eq ref $cookie ) { |
|
|
|
50
|
|
|
|
|
|
|
229
|
593
|
|
|
|
|
2246
|
return ( $self->{cookie}{$name} = $cookie ); |
|
230
|
|
|
|
|
|
|
} elsif ( defined $cookie ) { |
|
231
|
0
|
|
|
|
|
0
|
confess "Programming error - cookie must be CODE ref or undef"; |
|
232
|
|
|
|
|
|
|
} else { |
|
233
|
128
|
|
|
|
|
410
|
return delete $self->{cookie}{$name}; |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# NOTE: Currently this is called only against |
|
238
|
|
|
|
|
|
|
# COOKIE_LOOKAROUND_ASSERTION, once in PPIx::Token::GroupType::Assertion |
|
239
|
|
|
|
|
|
|
# to prevent the cookie from being remade if it already exists, and once |
|
240
|
|
|
|
|
|
|
# in PPIx::Regexp::Token::Assertion to determine if \K is inside a |
|
241
|
|
|
|
|
|
|
# lookaround assertion. If it gets used other places, or if there is |
|
242
|
|
|
|
|
|
|
# call for it, I should consider removing the underscores and |
|
243
|
|
|
|
|
|
|
# documenting it as public. |
|
244
|
|
|
|
|
|
|
sub __cookie_exists { |
|
245
|
57
|
|
|
57
|
|
109
|
my ( $self, $name ) = @_; |
|
246
|
57
|
50
|
|
|
|
121
|
defined $name |
|
247
|
|
|
|
|
|
|
or confess "Programming error - undefined cookie name"; |
|
248
|
57
|
|
|
|
|
189
|
return $self->{cookie}{$name}; |
|
249
|
|
|
|
|
|
|
} |
|
250
|
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
sub default_modifiers { |
|
252
|
0
|
|
|
0
|
1
|
0
|
my ( $self ) = @_; |
|
253
|
0
|
|
|
|
|
0
|
return [ @{ $self->{default_modifiers} } ]; |
|
|
0
|
|
|
|
|
0
|
|
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
sub __effective_modifiers { |
|
257
|
334
|
|
|
334
|
|
597
|
my ( $self ) = @_; |
|
258
|
|
|
|
|
|
|
HASH_REF eq ref $self->{effective_modifiers} |
|
259
|
334
|
100
|
|
|
|
966
|
or return {}; |
|
260
|
326
|
|
|
|
|
508
|
return { %{ $self->{effective_modifiers} } }; |
|
|
326
|
|
|
|
|
1101
|
|
|
261
|
|
|
|
|
|
|
} |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
sub encoding { |
|
264
|
0
|
|
|
0
|
1
|
0
|
my ( $self ) = @_; |
|
265
|
0
|
|
|
|
|
0
|
return $self->{encoding}; |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
sub expect { |
|
269
|
330
|
|
|
330
|
1
|
1024
|
my ( $self, @args ) = @_; |
|
270
|
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
@args |
|
272
|
330
|
50
|
|
|
|
624
|
or return; |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
$self->{expect_next} = [ |
|
275
|
330
|
50
|
|
|
|
603
|
map { m/ \A PPIx::Regexp:: /smx ? $_ : 'PPIx::Regexp::' . $_ } |
|
|
2602
|
|
|
|
|
5136
|
|
|
276
|
|
|
|
|
|
|
@args |
|
277
|
|
|
|
|
|
|
]; |
|
278
|
330
|
|
|
|
|
680
|
$self->{expect} = undef; |
|
279
|
330
|
|
|
|
|
618
|
return; |
|
280
|
|
|
|
|
|
|
} |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
sub failures { |
|
283
|
8
|
|
|
8
|
1
|
16
|
my ( $self ) = @_; |
|
284
|
8
|
|
|
|
|
16
|
return $self->{failures}; |
|
285
|
|
|
|
|
|
|
} |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
sub find_matching_delimiter { |
|
288
|
589
|
|
|
589
|
1
|
937
|
my ( $self ) = @_; |
|
289
|
589
|
|
100
|
|
|
1832
|
$self->{cursor_curr} ||= 0; |
|
290
|
|
|
|
|
|
|
my $start = substr |
|
291
|
|
|
|
|
|
|
$self->{content}, |
|
292
|
|
|
|
|
|
|
$self->{cursor_curr}, |
|
293
|
589
|
|
|
|
|
1276
|
1; |
|
294
|
|
|
|
|
|
|
|
|
295
|
589
|
|
|
|
|
858
|
my $inx = $self->{cursor_curr}; |
|
296
|
589
|
|
66
|
|
|
1623
|
my $finish = ( |
|
297
|
|
|
|
|
|
|
my $bracketed = $self->close_bracket( $start ) ) || $start; |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=begin comment |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
$self->{trace} |
|
302
|
|
|
|
|
|
|
and warn "Find matching delimiter: Start with '$start' at $self->{cursor_curr}, end with '$finish' at or before $self->{cursor_limit}\n"; |
|
303
|
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=end comment |
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=cut |
|
307
|
|
|
|
|
|
|
|
|
308
|
589
|
|
|
|
|
900
|
my $nest = 0; |
|
309
|
|
|
|
|
|
|
|
|
310
|
589
|
|
|
|
|
1599
|
while ( ++$inx < $self->{cursor_limit} ) { |
|
311
|
6144
|
|
|
|
|
7257
|
my $char = substr $self->{content}, $inx, 1; |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
=begin comment |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
$self->{trace} |
|
316
|
|
|
|
|
|
|
and warn " looking at '$char' at $inx, nest level $nest\n"; |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=end comment |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=cut |
|
321
|
|
|
|
|
|
|
|
|
322
|
6144
|
100
|
100
|
|
|
16155
|
if ( $char eq '\\' && $finish ne '\\' ) { |
|
|
|
100
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
323
|
317
|
|
|
|
|
522
|
++$inx; |
|
324
|
|
|
|
|
|
|
} elsif ( $bracketed && $char eq $start ) { |
|
325
|
1
|
|
|
|
|
2
|
++$nest; |
|
326
|
|
|
|
|
|
|
} elsif ( $char eq $finish ) { |
|
327
|
|
|
|
|
|
|
--$nest < 0 |
|
328
|
588
|
100
|
|
|
|
2294
|
and return $inx - $self->{cursor_curr}; |
|
329
|
|
|
|
|
|
|
} |
|
330
|
|
|
|
|
|
|
} |
|
331
|
|
|
|
|
|
|
|
|
332
|
2
|
|
|
|
|
6
|
return; |
|
333
|
|
|
|
|
|
|
} |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
sub find_regexp { |
|
336
|
16627
|
|
|
16627
|
1
|
20345
|
my ( $self, $regexp ) = @_; |
|
337
|
|
|
|
|
|
|
|
|
338
|
16627
|
50
|
0
|
|
|
25574
|
REGEXP_REF eq ref $regexp |
|
339
|
|
|
|
|
|
|
or confess |
|
340
|
|
|
|
|
|
|
'Argument is a ', ( ref $regexp || 'scalar' ), ' not a Regexp'; |
|
341
|
|
|
|
|
|
|
|
|
342
|
16627
|
100
|
|
|
|
27118
|
defined $self->{find} or $self->_remainder(); |
|
343
|
|
|
|
|
|
|
|
|
344
|
16627
|
100
|
|
|
|
66695
|
$self->{find} =~ $regexp |
|
345
|
|
|
|
|
|
|
or return; |
|
346
|
|
|
|
|
|
|
|
|
347
|
1848
|
|
|
|
|
2332
|
my @capture; |
|
348
|
1848
|
|
|
|
|
5190
|
foreach my $inx ( 0 .. $#+ ) { |
|
349
|
4267
|
100
|
66
|
|
|
13690
|
if ( defined $-[$inx] && defined $+[$inx] ) { |
|
350
|
|
|
|
|
|
|
push @capture, $self->{capture} = substr |
|
351
|
|
|
|
|
|
|
$self->{find}, |
|
352
|
3778
|
|
|
|
|
15986
|
$-[$inx], |
|
353
|
|
|
|
|
|
|
$+[$inx] - $-[$inx]; |
|
354
|
|
|
|
|
|
|
} else { |
|
355
|
489
|
|
|
|
|
783
|
push @capture, undef; |
|
356
|
|
|
|
|
|
|
} |
|
357
|
|
|
|
|
|
|
} |
|
358
|
1848
|
|
|
|
|
3405
|
$self->{match} = shift @capture; |
|
359
|
1848
|
|
|
|
|
2871
|
$self->{capture} = \@capture; |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# The following circumlocution seems to be needed under Perl 5.13.0 |
|
362
|
|
|
|
|
|
|
# for reasons I do not fathom -- at least in the case where |
|
363
|
|
|
|
|
|
|
# wantarray is false. RT 56864 details the symptoms, which I was |
|
364
|
|
|
|
|
|
|
# never able to reproduce outside Perl::Critic. But returning $+[0] |
|
365
|
|
|
|
|
|
|
# directly, the value could transmogrify between here and the |
|
366
|
|
|
|
|
|
|
# calling module. |
|
367
|
|
|
|
|
|
|
## my @data = ( $-[0], $+[0] ); |
|
368
|
|
|
|
|
|
|
## return wantarray ? @data : $data[1]; |
|
369
|
1848
|
50
|
|
|
|
6379
|
return wantarray ? ( $-[0] + 0, $+[0] + 0 ) : $+[0] + 0; |
|
370
|
|
|
|
|
|
|
} |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
sub get_mode { |
|
373
|
2598
|
|
|
2598
|
1
|
3260
|
my ( $self ) = @_; |
|
374
|
2598
|
|
|
|
|
5077
|
return $self->{mode}; |
|
375
|
|
|
|
|
|
|
} |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
sub get_start_delimiter { |
|
378
|
1794
|
|
|
1794
|
1
|
2036
|
my ( $self ) = @_; |
|
379
|
1794
|
|
|
|
|
4983
|
return $self->{delimiter_start}; |
|
380
|
|
|
|
|
|
|
} |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
sub get_token { |
|
383
|
4133
|
|
|
4133
|
1
|
5170
|
my ( $self ) = @_; |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
caller eq __PACKAGE__ or $self->{cursor_curr} > $self->{cursor_orig} |
|
386
|
4133
|
50
|
66
|
|
|
10247
|
or confess 'Programming error - get_token() called without ', |
|
387
|
|
|
|
|
|
|
'first calling make_token()'; |
|
388
|
|
|
|
|
|
|
|
|
389
|
4133
|
|
|
|
|
6316
|
my $handler = '__PPIX_TOKENIZER__' . $self->{mode}; |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
my $code = $self->can( $handler ) |
|
392
|
|
|
|
|
|
|
or confess 'Programming error - ', |
|
393
|
|
|
|
|
|
|
"Getting token in mode '$self->{mode}'. ", |
|
394
|
|
|
|
|
|
|
"cursor_curr = $self->{cursor_curr}; ", |
|
395
|
|
|
|
|
|
|
"cursor_limit = $self->{cursor_limit}; ", |
|
396
|
|
|
|
|
|
|
"length( content ) = ", length $self->{content}, |
|
397
|
4133
|
50
|
|
|
|
11779
|
"; content = '$self->{content}'"; |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
my $character = substr( |
|
400
|
|
|
|
|
|
|
$self->{content}, |
|
401
|
|
|
|
|
|
|
$self->{cursor_curr}, |
|
402
|
4133
|
|
|
|
|
7640
|
1 |
|
403
|
|
|
|
|
|
|
); |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
$self->{trace} |
|
406
|
4133
|
50
|
|
|
|
7265
|
and warn "get_token() got '$character' from $self->{cursor_curr}\n"; |
|
407
|
|
|
|
|
|
|
|
|
408
|
4133
|
|
|
|
|
7034
|
return ( $code->( $self, $character ) ); |
|
409
|
|
|
|
|
|
|
} |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
sub interpolates { |
|
412
|
141
|
|
|
141
|
1
|
230
|
my ( $self ) = @_; |
|
413
|
141
|
|
|
|
|
474
|
return $self->{delimiter_start} ne q{'}; |
|
414
|
|
|
|
|
|
|
} |
|
415
|
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
sub make_token { |
|
417
|
5243
|
|
|
5243
|
1
|
8672
|
my ( $self, $length, $class, $arg ) = @_; |
|
418
|
5243
|
100
|
|
|
|
7935
|
defined $class or $class = caller; |
|
419
|
|
|
|
|
|
|
|
|
420
|
5243
|
50
|
|
|
|
9691
|
if ( $length + $self->{cursor_curr} > $self->{cursor_limit} ) { |
|
421
|
|
|
|
|
|
|
$length = $self->{cursor_limit} - $self->{cursor_curr} |
|
422
|
0
|
0
|
|
|
|
0
|
or return; |
|
423
|
|
|
|
|
|
|
} |
|
424
|
|
|
|
|
|
|
|
|
425
|
5243
|
50
|
|
|
|
12819
|
$class =~ m/ \A PPIx::Regexp:: /smx |
|
426
|
|
|
|
|
|
|
or $class = 'PPIx::Regexp::' . $class; |
|
427
|
|
|
|
|
|
|
my $content = substr |
|
428
|
|
|
|
|
|
|
$self->{content}, |
|
429
|
|
|
|
|
|
|
$self->{cursor_curr}, |
|
430
|
5243
|
|
|
|
|
8622
|
$length; |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
$self->{trace} |
|
433
|
5243
|
50
|
|
|
|
7863
|
and warn "make_token( $length, '$class' ) => '$content'\n"; |
|
434
|
5243
|
50
|
|
|
|
7894
|
$self->{trace} > 1 |
|
435
|
|
|
|
|
|
|
and warn " make_token: cursor_curr = $self->{cursor_curr}; ", |
|
436
|
|
|
|
|
|
|
"cursor_limit = $self->{cursor_limit}\n"; |
|
437
|
|
|
|
|
|
|
my $token = $class->__new( $content, |
|
438
|
|
|
|
|
|
|
tokenizer => $self, |
|
439
|
5243
|
100
|
|
|
|
6327
|
%{ $arg || {} } ) |
|
|
5243
|
50
|
|
|
|
23979
|
|
|
440
|
|
|
|
|
|
|
or return; |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
$self->{index_locations} |
|
443
|
5243
|
100
|
|
|
|
11023
|
and $self->_update_location( $token ); |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
$token->significant() |
|
446
|
5243
|
100
|
|
|
|
11087
|
and $self->{expect} = undef; |
|
447
|
|
|
|
|
|
|
|
|
448
|
5243
|
100
|
|
|
|
18333
|
$token->isa( TOKEN_UNKNOWN ) and $self->{failures}++; |
|
449
|
|
|
|
|
|
|
|
|
450
|
5243
|
|
|
|
|
6786
|
$self->{cursor_curr} += $length; |
|
451
|
5243
|
|
|
|
|
6281
|
$self->{find} = undef; |
|
452
|
5243
|
|
|
|
|
5979
|
$self->{match} = undef; |
|
453
|
5243
|
|
|
|
|
6285
|
$self->{capture} = undef; |
|
454
|
|
|
|
|
|
|
|
|
455
|
5243
|
|
|
|
|
6260
|
foreach my $name ( keys %{ $self->{cookie} } ) { |
|
|
5243
|
|
|
|
|
9742
|
|
|
456
|
3615
|
|
|
|
|
4418
|
my $cookie = $self->{cookie}{$name}; |
|
457
|
|
|
|
|
|
|
$cookie->( $self, $token ) |
|
458
|
3615
|
100
|
|
|
|
6492
|
or delete $self->{cookie}{$name}; |
|
459
|
|
|
|
|
|
|
} |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# Record this token as the prior token if it is significant. We must |
|
462
|
|
|
|
|
|
|
# do this after processing cookies, so that the cookies have access |
|
463
|
|
|
|
|
|
|
# to the old token if they want. |
|
464
|
|
|
|
|
|
|
$token->significant() |
|
465
|
5243
|
100
|
|
|
|
8390
|
and $self->{prior_significant_token} = $token; |
|
466
|
|
|
|
|
|
|
|
|
467
|
5243
|
|
|
|
|
15786
|
return $token; |
|
468
|
|
|
|
|
|
|
} |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
sub match { |
|
471
|
86
|
|
|
86
|
1
|
150
|
my ( $self ) = @_; |
|
472
|
86
|
|
|
|
|
175
|
return $self->{match}; |
|
473
|
|
|
|
|
|
|
} |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
sub modifier { |
|
476
|
4932
|
|
|
4932
|
1
|
6663
|
my ( $self, $modifier ) = @_; |
|
477
|
|
|
|
|
|
|
return PPIx::Regexp::Token::Modifier::__asserts( |
|
478
|
4932
|
|
|
|
|
9618
|
$self->{modifiers}[-1], $modifier ); |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
sub modifier_duplicate { |
|
482
|
292
|
|
|
292
|
1
|
442
|
my ( $self ) = @_; |
|
483
|
292
|
|
|
|
|
573
|
push @{ $self->{modifiers} }, |
|
484
|
292
|
|
|
|
|
397
|
{ %{ $self->{modifiers}[-1] } }; |
|
|
292
|
|
|
|
|
855
|
|
|
485
|
292
|
|
|
|
|
477
|
return; |
|
486
|
|
|
|
|
|
|
} |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
sub modifier_modify { |
|
489
|
595
|
|
|
595
|
1
|
1322
|
my ( $self, %args ) = @_; |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
# Modifier code is centralized in PPIx::Regexp::Token::Modifier |
|
492
|
|
|
|
|
|
|
$self->{modifiers}[-1] = |
|
493
|
|
|
|
|
|
|
PPIx::Regexp::Token::Modifier::__PPIX_TOKENIZER__modifier_modify( |
|
494
|
595
|
|
|
|
|
1915
|
$self->{modifiers}[-1], \%args ); |
|
495
|
|
|
|
|
|
|
|
|
496
|
595
|
|
|
|
|
1015
|
return; |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
} |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
sub modifier_pop { |
|
501
|
288
|
|
|
288
|
1
|
492
|
my ( $self ) = @_; |
|
502
|
288
|
|
|
|
|
747
|
@{ $self->{modifiers} } > 1 |
|
503
|
288
|
100
|
|
|
|
328
|
and pop @{ $self->{modifiers} }; |
|
|
282
|
|
|
|
|
603
|
|
|
504
|
288
|
|
|
|
|
666
|
return; |
|
505
|
|
|
|
|
|
|
} |
|
506
|
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
sub modifier_seen { |
|
508
|
8
|
|
|
8
|
1
|
25
|
my ( $self, $modifier ) = @_; |
|
509
|
8
|
|
|
|
|
15
|
foreach my $mod ( reverse @{ $self->{modifiers} } ) { |
|
|
8
|
|
|
|
|
24
|
|
|
510
|
10
|
100
|
|
|
|
35
|
exists $mod->{$modifier} |
|
511
|
|
|
|
|
|
|
and return 1; |
|
512
|
|
|
|
|
|
|
} |
|
513
|
5
|
|
|
|
|
23
|
return; |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub next_token { |
|
517
|
5780
|
|
|
5780
|
1
|
7011
|
my ( $self ) = @_; |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
{ |
|
520
|
|
|
|
|
|
|
|
|
521
|
5780
|
100
|
|
|
|
6357
|
if ( @{ $self->{pending} } ) { |
|
|
9896
|
|
|
|
|
9693
|
|
|
|
9896
|
|
|
|
|
15389
|
|
|
522
|
5241
|
|
|
|
|
5305
|
return shift @{ $self->{pending} }; |
|
|
5241
|
|
|
|
|
12984
|
|
|
523
|
|
|
|
|
|
|
} |
|
524
|
|
|
|
|
|
|
|
|
525
|
4655
|
100
|
|
|
|
8421
|
if ( $self->{cursor_curr} >= $self->{cursor_limit} ) { |
|
526
|
|
|
|
|
|
|
$self->{cursor_limit} >= length $self->{content} |
|
527
|
1099
|
100
|
|
|
|
3059
|
and return; |
|
528
|
560
|
50
|
|
|
|
1310
|
$self->{mode} eq 'finish' and return; |
|
529
|
560
|
|
|
|
|
1442
|
$self->_set_mode( 'finish' ); |
|
530
|
560
|
|
|
|
|
899
|
$self->{cursor_limit} += length $self->{delimiter_finish}; |
|
531
|
|
|
|
|
|
|
} |
|
532
|
|
|
|
|
|
|
|
|
533
|
4116
|
50
|
|
|
|
7397
|
if ( my @tokens = $self->get_token() ) { |
|
534
|
4116
|
|
|
|
|
4544
|
push @{ $self->{pending} }, @tokens; |
|
|
4116
|
|
|
|
|
6564
|
|
|
535
|
4116
|
|
|
|
|
6007
|
redo; |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
} |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
} |
|
540
|
|
|
|
|
|
|
|
|
541
|
0
|
|
|
|
|
0
|
return; |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
} |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
sub peek { |
|
546
|
379
|
|
|
379
|
1
|
606
|
my ( $self, $offset ) = @_; |
|
547
|
379
|
100
|
|
|
|
712
|
defined $offset or $offset = 0; |
|
548
|
379
|
50
|
|
|
|
684
|
$offset < 0 and return; |
|
549
|
379
|
|
|
|
|
544
|
$offset += $self->{cursor_curr}; |
|
550
|
379
|
50
|
|
|
|
644
|
$offset >= $self->{cursor_limit} and return; |
|
551
|
379
|
|
|
|
|
1184
|
return substr $self->{content}, $offset, 1; |
|
552
|
|
|
|
|
|
|
} |
|
553
|
|
|
|
|
|
|
|
|
554
|
|
|
|
|
|
|
sub ppi_document { |
|
555
|
83
|
|
|
83
|
1
|
156
|
my ( $self ) = @_; |
|
556
|
|
|
|
|
|
|
|
|
557
|
83
|
50
|
|
|
|
156
|
defined $self->{find} or $self->_remainder(); |
|
558
|
|
|
|
|
|
|
|
|
559
|
83
|
|
|
|
|
467
|
return PPI::Document->new( \"$self->{find}" ); |
|
560
|
|
|
|
|
|
|
} |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
sub prior_significant_token { |
|
563
|
2413
|
|
|
2413
|
1
|
3302
|
my ( $self, $method, @args ) = @_; |
|
564
|
2413
|
100
|
|
|
|
3260
|
defined $method or return $self->{prior_significant_token}; |
|
565
|
|
|
|
|
|
|
$self->{prior_significant_token}->can( $method ) |
|
566
|
|
|
|
|
|
|
or confess 'Programming error - ', |
|
567
|
|
|
|
|
|
|
( ref $self->{prior_significant_token} || |
|
568
|
2394
|
50
|
0
|
|
|
6943
|
$self->{prior_significant_token} ), |
|
569
|
|
|
|
|
|
|
' does not support method ', $method; |
|
570
|
2394
|
|
|
|
|
5897
|
return $self->{prior_significant_token}->$method( @args ); |
|
571
|
|
|
|
|
|
|
} |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# my $length = $token->__recognize_postderef( $tokenizer, $iterator ). |
|
574
|
|
|
|
|
|
|
# |
|
575
|
|
|
|
|
|
|
# This method is private to the PPIx-Regexp package, and may be changed |
|
576
|
|
|
|
|
|
|
# or retracted without warning. What it does is to recognize postfix |
|
577
|
|
|
|
|
|
|
# dereferences. It returns the length in characters of the first postfix |
|
578
|
|
|
|
|
|
|
# dereference found, or a false value if none is found. |
|
579
|
|
|
|
|
|
|
# |
|
580
|
|
|
|
|
|
|
# The optional $iterator argument can be one of the following: |
|
581
|
|
|
|
|
|
|
# - A code reference, which will be called to provide PPI::Element |
|
582
|
|
|
|
|
|
|
# objects to be checked to see if they represent a postfix |
|
583
|
|
|
|
|
|
|
# dereference. |
|
584
|
|
|
|
|
|
|
# - A PPI::Element, which is checked to see if it is a postfix |
|
585
|
|
|
|
|
|
|
# dereference. |
|
586
|
|
|
|
|
|
|
# - Undef, or omitted, in which case ppi() is called on the invocant, |
|
587
|
|
|
|
|
|
|
# and everything that follows the '->' operator is checked to see if |
|
588
|
|
|
|
|
|
|
# it is a postfix dereference. |
|
589
|
|
|
|
|
|
|
# - Anything else results in an exception and stack trace. |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
{ |
|
592
|
|
|
|
|
|
|
sub __recognize_postderef { |
|
593
|
149
|
|
|
149
|
|
251
|
my ( $self, $token, $iterator ) = @_; |
|
594
|
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
# Note that if ppi() gets called I have to hold a reference to |
|
596
|
|
|
|
|
|
|
# the returned object until I am done with all its children. |
|
597
|
149
|
|
|
|
|
191
|
my $ppi; |
|
598
|
149
|
100
|
|
|
|
305
|
if ( ! defined $iterator ) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
# This MUST be done before ppi() is called. |
|
601
|
|
|
|
|
|
|
$self->{index_locations} |
|
602
|
145
|
100
|
|
|
|
297
|
and $self->_update_location( $token ); |
|
603
|
|
|
|
|
|
|
|
|
604
|
145
|
|
|
|
|
410
|
$ppi = $token->ppi(); |
|
605
|
29
|
|
|
|
|
6362
|
my @ops = grep { '->' eq $_->content() } @{ |
|
606
|
145
|
100
|
|
|
|
256
|
$ppi->find( 'PPI::Token::Operator' ) || [] }; |
|
|
145
|
|
|
|
|
458
|
|
|
607
|
|
|
|
|
|
|
$iterator = sub { |
|
608
|
151
|
100
|
|
151
|
|
625
|
my $op = shift @ops |
|
609
|
|
|
|
|
|
|
or return; |
|
610
|
15
|
|
|
|
|
55
|
return $op->snext_sibling(); |
|
611
|
145
|
|
|
|
|
31538
|
}; |
|
612
|
|
|
|
|
|
|
} elsif ( $iterator->isa( 'PPI::Element' ) ) { |
|
613
|
4
|
|
|
|
|
9
|
my @eles = ( $iterator ); |
|
614
|
|
|
|
|
|
|
$iterator = sub { |
|
615
|
4
|
|
|
4
|
|
10
|
return shift @eles; |
|
616
|
4
|
|
|
|
|
13
|
}; |
|
617
|
|
|
|
|
|
|
} elsif ( CODE_REF ne ref $iterator ) { |
|
618
|
0
|
|
|
|
|
0
|
confess 'Programming error - Iterator not understood'; |
|
619
|
|
|
|
|
|
|
} |
|
620
|
|
|
|
|
|
|
|
|
621
|
149
|
|
|
|
|
675
|
my $accept = $token->__postderef_accept_cast(); |
|
622
|
|
|
|
|
|
|
|
|
623
|
149
|
|
|
|
|
296
|
while ( my $elem = $iterator->() ) { |
|
624
|
|
|
|
|
|
|
|
|
625
|
19
|
|
|
|
|
407
|
my $content = $elem->content(); |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# As of PPI 1.238, all postfix dereferences are parsed as |
|
628
|
|
|
|
|
|
|
# casts. So if we find a cast of the correct content we have |
|
629
|
|
|
|
|
|
|
# a postfix deref. |
|
630
|
19
|
100
|
|
|
|
168
|
$elem->isa( 'PPI::Token::Cast' ) |
|
631
|
|
|
|
|
|
|
or next; |
|
632
|
|
|
|
|
|
|
|
|
633
|
15
|
100
|
|
|
|
85
|
if ( $content =~ m/ ( .* ) \* \z /smx ) { |
|
|
|
50
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# If we're an acceptable cast ending in a glob, accept |
|
635
|
|
|
|
|
|
|
# it. |
|
636
|
10
|
100
|
|
|
|
90
|
$accept->{$1} |
|
637
|
|
|
|
|
|
|
and return length $content; |
|
638
|
|
|
|
|
|
|
} elsif ( $accept->{$content} ) { |
|
639
|
|
|
|
|
|
|
# If we're an acceptable cast followed by a subscript, |
|
640
|
|
|
|
|
|
|
# we're a slice -- accept both cast and subscript. |
|
641
|
5
|
50
|
|
|
|
16
|
my $next = $elem->snext_sibling() |
|
642
|
|
|
|
|
|
|
or next; |
|
643
|
5
|
50
|
|
|
|
84
|
$next->isa( 'PPI::Structure::Subscript' ) |
|
644
|
|
|
|
|
|
|
or next; |
|
645
|
5
|
|
|
|
|
17
|
return length( $content ) + length( $next->content() ); |
|
646
|
|
|
|
|
|
|
} |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
# Otherwise, we're not a postfix dereference; try the next |
|
649
|
|
|
|
|
|
|
# iteration. |
|
650
|
|
|
|
|
|
|
} |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
# No postfix dereference found. |
|
653
|
136
|
|
|
|
|
681
|
return; |
|
654
|
|
|
|
|
|
|
} |
|
655
|
|
|
|
|
|
|
} |
|
656
|
|
|
|
|
|
|
|
|
657
|
|
|
|
|
|
|
sub significant { |
|
658
|
0
|
|
|
0
|
1
|
0
|
return 1; |
|
659
|
|
|
|
|
|
|
} |
|
660
|
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
sub strict { |
|
662
|
4
|
|
|
4
|
1
|
7
|
my ( $self ) = @_; |
|
663
|
4
|
|
|
|
|
20
|
return $self->{strict}; |
|
664
|
|
|
|
|
|
|
} |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
sub _known_tokenizers { |
|
667
|
3036
|
|
|
3036
|
|
3743
|
my ( $self ) = @_; |
|
668
|
|
|
|
|
|
|
|
|
669
|
3036
|
|
|
|
|
3546
|
my $mode = $self->{mode}; |
|
670
|
|
|
|
|
|
|
|
|
671
|
3036
|
|
|
|
|
3394
|
my @expect; |
|
672
|
3036
|
100
|
|
|
|
5051
|
if ( $self->{expect_next} ) { |
|
673
|
328
|
|
|
|
|
537
|
$self->{expect} = $self->{expect_next}; |
|
674
|
328
|
|
|
|
|
534
|
$self->{expect_next} = undef; |
|
675
|
|
|
|
|
|
|
} |
|
676
|
3036
|
100
|
|
|
|
4903
|
if ( $self->{expect} ) { |
|
677
|
|
|
|
|
|
|
@expect = $self->_known_tokenizer_check( |
|
678
|
334
|
|
|
|
|
470
|
@{ $self->{expect} } ); |
|
|
334
|
|
|
|
|
809
|
|
|
679
|
|
|
|
|
|
|
} |
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
exists $self->{known}{$mode} and return ( |
|
682
|
3036
|
100
|
|
|
|
6371
|
@expect, @{ $self->{known}{$mode} } ); |
|
|
2493
|
|
|
|
|
7574
|
|
|
683
|
|
|
|
|
|
|
|
|
684
|
543
|
|
|
|
|
1346
|
my @found = $self->_known_tokenizer_check( |
|
685
|
|
|
|
|
|
|
$self->__tokenizer_classes() ); |
|
686
|
|
|
|
|
|
|
|
|
687
|
543
|
|
|
|
|
1809
|
$self->{known}{$mode} = \@found; |
|
688
|
543
|
|
|
|
|
1660
|
return (@expect, @found); |
|
689
|
|
|
|
|
|
|
} |
|
690
|
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
sub _known_tokenizer_check { |
|
692
|
877
|
|
|
877
|
|
2566
|
my ( $self, @args ) = @_; |
|
693
|
|
|
|
|
|
|
|
|
694
|
877
|
|
|
|
|
1190
|
my $handler = '__PPIX_TOKENIZER__' . $self->{mode}; |
|
695
|
877
|
|
|
|
|
1085
|
my @found; |
|
696
|
|
|
|
|
|
|
|
|
697
|
877
|
|
|
|
|
1221
|
foreach my $class ( @args ) { |
|
698
|
|
|
|
|
|
|
|
|
699
|
8611
|
100
|
|
|
|
33493
|
$class->can( $handler ) or next; |
|
700
|
8408
|
|
|
|
|
10970
|
push @found, $class; |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
} |
|
703
|
|
|
|
|
|
|
|
|
704
|
877
|
|
|
|
|
3385
|
return @found; |
|
705
|
|
|
|
|
|
|
} |
|
706
|
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
sub tokens { |
|
708
|
205
|
|
|
205
|
1
|
450
|
my ( $self ) = @_; |
|
709
|
|
|
|
|
|
|
|
|
710
|
205
|
|
|
|
|
300
|
my @rslt; |
|
711
|
205
|
|
|
|
|
738
|
while ( my $token = $self->next_token() ) { |
|
712
|
1933
|
|
|
|
|
3631
|
push @rslt, $token; |
|
713
|
|
|
|
|
|
|
} |
|
714
|
|
|
|
|
|
|
|
|
715
|
205
|
|
|
|
|
1206
|
return @rslt; |
|
716
|
|
|
|
|
|
|
} |
|
717
|
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
# $self->_deprecation_notice( $type, $name ); |
|
719
|
|
|
|
|
|
|
# |
|
720
|
|
|
|
|
|
|
# This method centralizes deprecation. Type is 'attribute' or |
|
721
|
|
|
|
|
|
|
# 'method'. Deprecation is driven of the %deprecate hash. Values |
|
722
|
|
|
|
|
|
|
# are: |
|
723
|
|
|
|
|
|
|
# false - no warning |
|
724
|
|
|
|
|
|
|
# 1 - warn on first use |
|
725
|
|
|
|
|
|
|
# 2 - warn on each use |
|
726
|
|
|
|
|
|
|
# 3 - die on each use. |
|
727
|
|
|
|
|
|
|
# |
|
728
|
|
|
|
|
|
|
# $self->_deprecation_in_progress( $type, $name ) |
|
729
|
|
|
|
|
|
|
# |
|
730
|
|
|
|
|
|
|
# This method returns true if the deprecation is in progress. In |
|
731
|
|
|
|
|
|
|
# fact it returns the deprecation level. |
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
=begin comment |
|
734
|
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
{ |
|
736
|
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
my %deprecate = ( |
|
738
|
|
|
|
|
|
|
attribute => { |
|
739
|
|
|
|
|
|
|
postderef => 3, |
|
740
|
|
|
|
|
|
|
}, |
|
741
|
|
|
|
|
|
|
); |
|
742
|
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
sub _deprecation_notice { |
|
744
|
|
|
|
|
|
|
my ( undef, $type, $name, $repl ) = @_; # Invocant unused |
|
745
|
|
|
|
|
|
|
$deprecate{$type} or return; |
|
746
|
|
|
|
|
|
|
$deprecate{$type}{$name} or return; |
|
747
|
|
|
|
|
|
|
my $msg = sprintf 'The %s %s is %s', $name, $type, |
|
748
|
|
|
|
|
|
|
$deprecate{$type}{$name} > 2 ? 'removed' : 'deprecated'; |
|
749
|
|
|
|
|
|
|
defined $repl |
|
750
|
|
|
|
|
|
|
and $msg .= "; use $repl instead"; |
|
751
|
|
|
|
|
|
|
$deprecate{$type}{$name} >= 3 |
|
752
|
|
|
|
|
|
|
and croak $msg; |
|
753
|
|
|
|
|
|
|
warnings::enabled( 'deprecated' ) |
|
754
|
|
|
|
|
|
|
and carp $msg; |
|
755
|
|
|
|
|
|
|
$deprecate{$type}{$name} == 1 |
|
756
|
|
|
|
|
|
|
and $deprecate{$type}{$name} = 0; |
|
757
|
|
|
|
|
|
|
return; |
|
758
|
|
|
|
|
|
|
} |
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
sub _deprecation_in_progress { |
|
761
|
|
|
|
|
|
|
my ( $self, $type, $name ) = @_; |
|
762
|
|
|
|
|
|
|
$deprecate{$type} or return; |
|
763
|
|
|
|
|
|
|
return $deprecate{$type}{$name}; |
|
764
|
|
|
|
|
|
|
} |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
} |
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
=end comment |
|
769
|
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
=cut |
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
sub _remainder { |
|
773
|
3623
|
|
|
3623
|
|
4530
|
my ( $self ) = @_; |
|
774
|
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
$self->{cursor_curr} > $self->{cursor_limit} |
|
776
|
3623
|
50
|
|
|
|
6404
|
and confess "Programming error - Trying to find past end of string"; |
|
777
|
|
|
|
|
|
|
$self->{find} = substr( |
|
778
|
|
|
|
|
|
|
$self->{content}, |
|
779
|
|
|
|
|
|
|
$self->{cursor_curr}, |
|
780
|
|
|
|
|
|
|
$self->{cursor_limit} - $self->{cursor_curr} |
|
781
|
3623
|
|
|
|
|
7473
|
); |
|
782
|
|
|
|
|
|
|
|
|
783
|
3623
|
|
|
|
|
4716
|
return; |
|
784
|
|
|
|
|
|
|
} |
|
785
|
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
sub _make_final_token { |
|
787
|
10
|
|
|
10
|
|
22
|
my ( $self, $len, $class, $arg ) = @_; |
|
788
|
10
|
|
|
|
|
26
|
my $token = $self->make_token( $len, $class, $arg ); |
|
789
|
10
|
|
|
|
|
45
|
$self->_set_mode( 'kaput' ); |
|
790
|
10
|
|
|
|
|
57
|
return $token; |
|
791
|
|
|
|
|
|
|
} |
|
792
|
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
sub _set_mode { |
|
794
|
1657
|
|
|
1657
|
|
2699
|
my ( $self, $mode ) = @_; |
|
795
|
|
|
|
|
|
|
$self->{trace} |
|
796
|
1657
|
50
|
|
|
|
2946
|
and warn "Tokenizer going from mode $self->{mode} to $mode\n"; |
|
797
|
1657
|
|
|
|
|
2316
|
$self->{mode} = $mode; |
|
798
|
1657
|
100
|
|
|
|
2889
|
if ( 'kaput' eq $mode ) { |
|
799
|
|
|
|
|
|
|
$self->{cursor_curr} = $self->{cursor_limit} = |
|
800
|
537
|
|
|
|
|
1061
|
length $self->{content}; |
|
801
|
|
|
|
|
|
|
} |
|
802
|
1657
|
|
|
|
|
2259
|
return; |
|
803
|
|
|
|
|
|
|
} |
|
804
|
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
sub __init_error { |
|
806
|
10
|
|
|
10
|
|
16
|
my ( $self , $err ) = @_; |
|
807
|
10
|
100
|
|
|
|
26
|
defined $err |
|
808
|
|
|
|
|
|
|
or $err = 'Tokenizer found illegal first characters'; |
|
809
|
|
|
|
|
|
|
return $self->_make_final_token( |
|
810
|
10
|
|
|
|
|
55
|
length $self->{content}, TOKEN_UNKNOWN, { |
|
811
|
|
|
|
|
|
|
error => $err, |
|
812
|
|
|
|
|
|
|
}, |
|
813
|
|
|
|
|
|
|
); |
|
814
|
|
|
|
|
|
|
} |
|
815
|
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
sub _update_location { |
|
817
|
107
|
|
|
107
|
|
156
|
my ( $self, $token ) = @_; |
|
818
|
|
|
|
|
|
|
$token->{location} # Idempotent |
|
819
|
107
|
100
|
|
|
|
187
|
and return; |
|
820
|
105
|
|
66
|
|
|
213
|
my $loc = $self->{_location} ||= do { |
|
821
|
|
|
|
|
|
|
my %loc = ( |
|
822
|
|
|
|
|
|
|
location => $self->{location}, |
|
823
|
12
|
|
|
|
|
34
|
); |
|
824
|
12
|
100
|
|
|
|
40
|
if ( __instance( $self->{source}, 'PPI::Element' ) ) { |
|
825
|
11
|
|
33
|
|
|
77
|
$loc{location} ||= $self->{source}->location(); |
|
826
|
11
|
50
|
|
|
|
1395
|
if ( my $doc = $self->{source}->document() ) { |
|
827
|
11
|
|
|
|
|
250
|
$loc{tab_width} = $doc->tab_width(); |
|
828
|
|
|
|
|
|
|
} |
|
829
|
|
|
|
|
|
|
} |
|
830
|
12
|
|
100
|
|
|
70
|
$loc{tab_width} ||= 1; |
|
831
|
12
|
|
|
|
|
30
|
\%loc; |
|
832
|
|
|
|
|
|
|
}; |
|
833
|
|
|
|
|
|
|
$loc->{location} |
|
834
|
105
|
50
|
|
|
|
197
|
or return; |
|
835
|
105
|
|
|
|
|
104
|
$token->{location} = [ @{ $loc->{location} } ]; |
|
|
105
|
|
|
|
|
220
|
|
|
836
|
105
|
50
|
|
|
|
208
|
if ( defined( my $content = $token->content() ) ) { |
|
837
|
|
|
|
|
|
|
|
|
838
|
105
|
|
|
|
|
124
|
my $lines; |
|
839
|
105
|
|
|
|
|
214
|
pos( $content ) = 0; |
|
840
|
105
|
|
|
|
|
238
|
$lines++ while $content =~ m/ \n /smxgc; |
|
841
|
105
|
100
|
|
|
|
160
|
if ( pos $content ) { |
|
842
|
2
|
|
|
|
|
3
|
$loc->{location}[LOCATION_LINE] += $lines; |
|
843
|
2
|
|
|
|
|
4
|
$loc->{location}[LOCATION_LOGICAL_LINE] += $lines; |
|
844
|
|
|
|
|
|
|
$loc->{location}[LOCATION_CHARACTER] = |
|
845
|
2
|
|
|
|
|
3
|
$loc->{location}[LOCATION_COLUMN] = 1; |
|
846
|
|
|
|
|
|
|
} |
|
847
|
|
|
|
|
|
|
|
|
848
|
105
|
100
|
|
|
|
205
|
if ( my $chars = length( $content ) - pos( $content ) ) { |
|
849
|
102
|
|
|
|
|
120
|
$loc->{location}[LOCATION_CHARACTER] += $chars; |
|
850
|
102
|
100
|
100
|
|
|
197
|
if ( $loc->{tab_width} > 1 && $content =~ m/ \t /smx ) { |
|
851
|
5
|
|
|
|
|
8
|
my $pos = $loc->{location}[LOCATION_COLUMN]; |
|
852
|
5
|
|
|
|
|
6
|
my $tab_width = $loc->{tab_width}; |
|
853
|
|
|
|
|
|
|
# Stolen shamelessly from PPI::Document::_visual_length |
|
854
|
5
|
|
|
|
|
6
|
my ( $vis_inc ); |
|
855
|
5
|
|
|
|
|
14
|
foreach my $part ( split /(\t)/, $content ) { |
|
856
|
10
|
100
|
|
|
|
14
|
if ($part eq "\t") { |
|
857
|
5
|
|
|
|
|
6
|
$vis_inc = $tab_width - ($pos-1) % $tab_width; |
|
858
|
|
|
|
|
|
|
} else { |
|
859
|
5
|
|
|
|
|
6
|
$vis_inc = length $part; |
|
860
|
|
|
|
|
|
|
} |
|
861
|
10
|
|
|
|
|
10
|
$pos += $vis_inc; |
|
862
|
|
|
|
|
|
|
} |
|
863
|
5
|
|
|
|
|
10
|
$loc->{location}[LOCATION_COLUMN] = $pos; |
|
864
|
|
|
|
|
|
|
} else { |
|
865
|
97
|
|
|
|
|
133
|
$loc->{location}[LOCATION_COLUMN] += $chars; |
|
866
|
|
|
|
|
|
|
} |
|
867
|
|
|
|
|
|
|
} |
|
868
|
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
} |
|
870
|
105
|
|
|
|
|
175
|
return; |
|
871
|
|
|
|
|
|
|
} |
|
872
|
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
sub __PPIX_TOKENIZER__init { |
|
874
|
537
|
|
|
537
|
|
1062
|
my ( $self ) = @_; |
|
875
|
|
|
|
|
|
|
|
|
876
|
537
|
50
|
|
|
|
2879
|
$self->find_regexp( |
|
877
|
|
|
|
|
|
|
qr{ \A ( \s* ) ( qr | m | s )? ( \s* ) ( . ) }smx ) |
|
878
|
|
|
|
|
|
|
or return $self->__init_error(); |
|
879
|
|
|
|
|
|
|
|
|
880
|
537
|
|
|
|
|
2114
|
my ( $leading_white, $type, $next_white, $delim_start ) = $self->capture(); |
|
881
|
|
|
|
|
|
|
|
|
882
|
537
|
100
|
|
|
|
1577
|
defined $type |
|
883
|
|
|
|
|
|
|
or $type = ''; |
|
884
|
|
|
|
|
|
|
|
|
885
|
537
|
100
|
100
|
|
|
2443
|
$type |
|
886
|
|
|
|
|
|
|
or $delim_start =~ m< \A [/?] \z >smx |
|
887
|
|
|
|
|
|
|
or return $self->__init_error(); |
|
888
|
531
|
100
|
100
|
|
|
2135
|
$type |
|
|
|
|
100
|
|
|
|
|
|
889
|
|
|
|
|
|
|
and not $next_white |
|
890
|
|
|
|
|
|
|
and $delim_start =~ m< \A \w \z >smx |
|
891
|
|
|
|
|
|
|
and return $self->__init_error(); |
|
892
|
|
|
|
|
|
|
|
|
893
|
529
|
|
|
|
|
1274
|
$self->{type} = $type; |
|
894
|
|
|
|
|
|
|
|
|
895
|
529
|
|
|
|
|
830
|
my @tokens; |
|
896
|
|
|
|
|
|
|
|
|
897
|
529
|
100
|
|
|
|
1568
|
'' ne $leading_white |
|
898
|
|
|
|
|
|
|
and push @tokens, $self->make_token( length $leading_white, |
|
899
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Whitespace' ); |
|
900
|
529
|
|
|
|
|
1830
|
push @tokens, $self->make_token( length $type, |
|
901
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Structure' ); |
|
902
|
529
|
100
|
|
|
|
1207
|
'' ne $next_white |
|
903
|
|
|
|
|
|
|
and push @tokens, $self->make_token( length $next_white, |
|
904
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Whitespace' ); |
|
905
|
|
|
|
|
|
|
|
|
906
|
529
|
|
|
|
|
1144
|
$self->{delimiter_start} = $delim_start; |
|
907
|
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
$self->{trace} |
|
909
|
529
|
50
|
|
|
|
1357
|
and warn "Tokenizer found regexp start delimiter '$delim_start' at $self->{cursor_curr}\n"; |
|
910
|
|
|
|
|
|
|
|
|
911
|
529
|
50
|
|
|
|
1366
|
if ( my $offset = $self->find_matching_delimiter() ) { |
|
912
|
529
|
|
|
|
|
1047
|
my $cursor_limit = $self->{cursor_curr} + $offset; |
|
913
|
|
|
|
|
|
|
$self->{trace} |
|
914
|
529
|
50
|
|
|
|
1127
|
and warn "Tokenizer found regexp end delimiter at $cursor_limit\n"; |
|
915
|
529
|
100
|
|
|
|
1562
|
if ( $self->__number_of_extra_parts() ) { |
|
916
|
|
|
|
|
|
|
### my $found_embedded_comments; |
|
917
|
46
|
100
|
|
|
|
152
|
if ( $self->close_bracket( |
|
918
|
|
|
|
|
|
|
$self->{delimiter_start} ) ) { |
|
919
|
|
|
|
|
|
|
pos $self->{content} = $self->{cursor_curr} + |
|
920
|
7
|
|
|
|
|
39
|
$offset + 1; |
|
921
|
|
|
|
|
|
|
# If we're bracketed, there may be Perl comments between |
|
922
|
|
|
|
|
|
|
# the regex and the replacement. PPI gets the parse |
|
923
|
|
|
|
|
|
|
# wrong as of 1.220, but if we get the handling of the |
|
924
|
|
|
|
|
|
|
# underlying string right, we will Just Work when PPI |
|
925
|
|
|
|
|
|
|
# gets it right. |
|
926
|
7
|
|
|
|
|
54
|
while ( $self->{content} =~ |
|
927
|
|
|
|
|
|
|
m/ \G \s* \n \s* \# [^\n]* /smxgc ) { |
|
928
|
|
|
|
|
|
|
## $found_embedded_comments = 1; |
|
929
|
|
|
|
|
|
|
} |
|
930
|
7
|
|
|
|
|
27
|
$self->{content} =~ m/ \s* /smxgc; |
|
931
|
|
|
|
|
|
|
} else { |
|
932
|
|
|
|
|
|
|
pos $self->{content} = $self->{cursor_curr} + |
|
933
|
39
|
|
|
|
|
199
|
$offset; |
|
934
|
|
|
|
|
|
|
} |
|
935
|
|
|
|
|
|
|
# Localizing cursor_curr and delimiter_start would be |
|
936
|
|
|
|
|
|
|
# cleaner, but I don't want the old values restored if a |
|
937
|
|
|
|
|
|
|
# parse error occurs. |
|
938
|
46
|
|
|
|
|
109
|
my $cursor_curr = $self->{cursor_curr}; |
|
939
|
46
|
|
|
|
|
132
|
my $delimiter_start = $self->{delimiter_start}; |
|
940
|
46
|
|
|
|
|
114
|
$self->{cursor_curr} = pos $self->{content}; |
|
941
|
|
|
|
|
|
|
$self->{delimiter_start} = substr |
|
942
|
|
|
|
|
|
|
$self->{content}, |
|
943
|
|
|
|
|
|
|
$self->{cursor_curr}, |
|
944
|
46
|
|
|
|
|
112
|
1; |
|
945
|
|
|
|
|
|
|
$self->{trace} |
|
946
|
46
|
50
|
|
|
|
126
|
and warn "Tokenizer found replacement start delimiter '$self->{delimiter_start}' at $self->{cursor_curr}\n"; |
|
947
|
46
|
100
|
|
|
|
101
|
if ( my $s_off = $self->find_matching_delimiter() ) { |
|
948
|
|
|
|
|
|
|
$self->{cursor_modifiers} = |
|
949
|
44
|
|
|
|
|
112
|
$self->{cursor_curr} + $s_off + 1; |
|
950
|
|
|
|
|
|
|
$self->{trace} |
|
951
|
44
|
50
|
|
|
|
143
|
and warn "Tokenizer found replacement end delimiter at @{[ |
|
952
|
0
|
|
|
|
|
0
|
$self->{cursor_curr} + $s_off ]}\n"; |
|
953
|
44
|
|
|
|
|
78
|
$self->{cursor_curr} = $cursor_curr; |
|
954
|
44
|
|
|
|
|
93
|
$self->{delimiter_start} = $delimiter_start; |
|
955
|
|
|
|
|
|
|
} else { |
|
956
|
|
|
|
|
|
|
$self->{trace} |
|
957
|
2
|
50
|
|
|
|
6
|
and warn 'Tokenizer failed to find replacement', |
|
958
|
|
|
|
|
|
|
"end delimiter starting at $self->{cursor_curr}\n"; |
|
959
|
2
|
|
|
|
|
3
|
$self->{cursor_curr} = 0; |
|
960
|
|
|
|
|
|
|
# TODO If I were smart enough here I could check for |
|
961
|
|
|
|
|
|
|
# PPI mis-parses like s{foo} |
|
962
|
|
|
|
|
|
|
# #{bar} |
|
963
|
|
|
|
|
|
|
# {baz} |
|
964
|
|
|
|
|
|
|
# here, doing so if $found_embedded_comments (commented |
|
965
|
|
|
|
|
|
|
# out above) is true. The problem is that there seem to |
|
966
|
|
|
|
|
|
|
# as many mis-parses as there are possible delimiters. |
|
967
|
2
|
|
|
|
|
6
|
return $self->__init_error( |
|
968
|
|
|
|
|
|
|
'Tokenizer found mismatched replacement delimiters', |
|
969
|
|
|
|
|
|
|
); |
|
970
|
|
|
|
|
|
|
} |
|
971
|
|
|
|
|
|
|
} else { |
|
972
|
483
|
|
|
|
|
888
|
$self->{cursor_modifiers} = $cursor_limit + 1; |
|
973
|
|
|
|
|
|
|
} |
|
974
|
527
|
|
|
|
|
1079
|
$self->{cursor_limit} = $cursor_limit; |
|
975
|
|
|
|
|
|
|
} else { |
|
976
|
0
|
|
|
|
|
0
|
$self->{cursor_curr} = 0; |
|
977
|
|
|
|
|
|
|
return $self->_make_final_token( |
|
978
|
0
|
|
|
|
|
0
|
length( $self->{content} ), TOKEN_UNKNOWN, { |
|
979
|
|
|
|
|
|
|
error => 'Tokenizer found mismatched regexp delimiters', |
|
980
|
|
|
|
|
|
|
}, |
|
981
|
|
|
|
|
|
|
); |
|
982
|
|
|
|
|
|
|
} |
|
983
|
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
{ |
|
985
|
|
|
|
|
|
|
# We have to instantiate the trailing tokens now so we can |
|
986
|
|
|
|
|
|
|
# figure out what modifiers are in effect. But we can't |
|
987
|
|
|
|
|
|
|
# index their locations (if desired) because they are being |
|
988
|
|
|
|
|
|
|
# instantiated out of order |
|
989
|
|
|
|
|
|
|
|
|
990
|
527
|
|
|
|
|
697
|
local $self->{index_locations} = 0; |
|
|
527
|
|
|
|
|
1136
|
|
|
991
|
|
|
|
|
|
|
|
|
992
|
527
|
|
|
|
|
723
|
my @mods = @{ $self->{default_modifiers} }; |
|
|
527
|
|
|
|
|
1050
|
|
|
993
|
527
|
|
|
|
|
1751
|
pos $self->{content} = $self->{cursor_modifiers}; |
|
994
|
527
|
|
|
|
|
1149
|
local $self->{cursor_curr} = $self->{cursor_modifiers}; |
|
995
|
527
|
|
|
|
|
1261
|
local $self->{cursor_limit} = length $self->{content}; |
|
996
|
527
|
|
|
|
|
827
|
my @trailing; |
|
997
|
|
|
|
|
|
|
{ |
|
998
|
527
|
|
|
|
|
655
|
my $len = $self->find_regexp( qr{ \A [[:lower:]]* }smx ); |
|
|
527
|
|
|
|
|
1881
|
|
|
999
|
527
|
|
|
|
|
1469
|
push @trailing, $self->make_token( $len, |
|
1000
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Modifier' ); |
|
1001
|
|
|
|
|
|
|
} |
|
1002
|
527
|
100
|
|
|
|
1597
|
if ( my $len = $self->find_regexp( qr{ \A \s+ }smx ) ) { |
|
1003
|
1
|
|
|
|
|
4
|
push @trailing, $self->make_token( $len, |
|
1004
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Whitespace' ); |
|
1005
|
|
|
|
|
|
|
} |
|
1006
|
527
|
100
|
|
|
|
1741
|
if ( my $len = $self->find_regexp( qr{ \A .+ }smx ) ) { |
|
1007
|
1
|
|
|
|
|
5
|
push @trailing, $self->make_token( $len, TOKEN_UNKNOWN, { |
|
1008
|
|
|
|
|
|
|
error => 'Trailing characters after expression', |
|
1009
|
|
|
|
|
|
|
} ); |
|
1010
|
|
|
|
|
|
|
} |
|
1011
|
527
|
|
|
|
|
1508
|
$self->{trailing_tokens} = \@trailing; |
|
1012
|
527
|
|
|
|
|
1359
|
push @mods, $trailing[0]->content(); |
|
1013
|
|
|
|
|
|
|
$self->{effective_modifiers} = |
|
1014
|
527
|
|
|
|
|
1235
|
PPIx::Regexp::Token::Modifier::__aggregate_modifiers ( |
|
1015
|
|
|
|
|
|
|
@mods ); |
|
1016
|
|
|
|
|
|
|
$self->{modifiers} = [ |
|
1017
|
527
|
|
|
|
|
1042
|
{ %{ $self->{effective_modifiers} } }, |
|
|
527
|
|
|
|
|
2481
|
|
|
1018
|
|
|
|
|
|
|
]; |
|
1019
|
|
|
|
|
|
|
} |
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
$self->{delimiter_finish} = substr |
|
1022
|
|
|
|
|
|
|
$self->{content}, |
|
1023
|
|
|
|
|
|
|
$self->{cursor_limit}, |
|
1024
|
527
|
|
|
|
|
1478
|
1; |
|
1025
|
|
|
|
|
|
|
|
|
1026
|
527
|
|
|
|
|
1193
|
push @tokens, $self->make_token( 1, |
|
1027
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Delimiter' ); |
|
1028
|
|
|
|
|
|
|
|
|
1029
|
527
|
|
|
|
|
1738
|
$self->_set_mode( 'regexp' ); |
|
1030
|
|
|
|
|
|
|
|
|
1031
|
527
|
|
|
|
|
803
|
$self->{find} = undef; |
|
1032
|
|
|
|
|
|
|
|
|
1033
|
527
|
|
|
|
|
2072
|
return @tokens; |
|
1034
|
|
|
|
|
|
|
} |
|
1035
|
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
# Match the initial part of the regexp including any leading white |
|
1037
|
|
|
|
|
|
|
# space. The initial delimiter is the first thing not consumed, though |
|
1038
|
|
|
|
|
|
|
# we check it for sanity. |
|
1039
|
|
|
|
|
|
|
sub __initial_match { |
|
1040
|
0
|
|
|
0
|
|
0
|
my ( $self ) = @_; |
|
1041
|
|
|
|
|
|
|
|
|
1042
|
0
|
0
|
|
|
|
0
|
$self->find_regexp( |
|
1043
|
|
|
|
|
|
|
qr{ \A ( \s* ) ( qr | m | s )? ( \s* ) (?: [^\w\s] ) }smx ) |
|
1044
|
|
|
|
|
|
|
or return; |
|
1045
|
|
|
|
|
|
|
|
|
1046
|
0
|
|
|
|
|
0
|
my ( $leading_white, $type, $next_white ) = $self->capture(); |
|
1047
|
|
|
|
|
|
|
|
|
1048
|
0
|
0
|
|
|
|
0
|
defined $type |
|
1049
|
|
|
|
|
|
|
or $type = ''; |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
0
|
|
|
|
|
0
|
$self->{type} = $type; |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
0
|
|
|
|
|
0
|
my @tokens; |
|
1054
|
|
|
|
|
|
|
|
|
1055
|
0
|
0
|
|
|
|
0
|
'' ne $leading_white |
|
1056
|
|
|
|
|
|
|
and push @tokens, $self->make_token( length $leading_white, |
|
1057
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Whitespace' ); |
|
1058
|
0
|
|
|
|
|
0
|
push @tokens, $self->make_token( length $type, |
|
1059
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Structure' ); |
|
1060
|
0
|
0
|
|
|
|
0
|
'' ne $next_white |
|
1061
|
|
|
|
|
|
|
and push @tokens, $self->make_token( length $next_white, |
|
1062
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Whitespace' ); |
|
1063
|
|
|
|
|
|
|
|
|
1064
|
0
|
|
|
|
|
0
|
return @tokens; |
|
1065
|
|
|
|
|
|
|
} |
|
1066
|
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
{ |
|
1068
|
|
|
|
|
|
|
my %extra_parts = ( |
|
1069
|
|
|
|
|
|
|
s => 1, |
|
1070
|
|
|
|
|
|
|
); |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
# Return the number of extra delimited parts. This will be 0 except |
|
1073
|
|
|
|
|
|
|
# for s///, which will be 1. |
|
1074
|
|
|
|
|
|
|
sub __number_of_extra_parts { |
|
1075
|
855
|
|
|
855
|
|
1332
|
my ( $self ) = @_; |
|
1076
|
855
|
|
100
|
|
|
3328
|
return $extra_parts{$self->{type}} || 0; |
|
1077
|
|
|
|
|
|
|
} |
|
1078
|
|
|
|
|
|
|
} |
|
1079
|
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
{ |
|
1081
|
|
|
|
|
|
|
my @part_class = qw{ |
|
1082
|
|
|
|
|
|
|
PPIx::Regexp::Structure::Regexp |
|
1083
|
|
|
|
|
|
|
PPIx::Regexp::Structure::Replacement |
|
1084
|
|
|
|
|
|
|
}; |
|
1085
|
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
# Return the classes for the parts of the expression. |
|
1087
|
|
|
|
|
|
|
sub __part_classes { |
|
1088
|
326
|
|
|
326
|
|
678
|
my ( $self ) = @_; |
|
1089
|
326
|
|
|
|
|
756
|
my $max = $self->__number_of_extra_parts(); |
|
1090
|
326
|
|
|
|
|
1429
|
return @part_class[ 0 .. $max ]; |
|
1091
|
|
|
|
|
|
|
} |
|
1092
|
|
|
|
|
|
|
} |
|
1093
|
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
sub __PPIX_TOKENIZER__regexp { |
|
1095
|
3036
|
|
|
3036
|
|
4516
|
my ( $self, $character ) = @_; |
|
1096
|
|
|
|
|
|
|
|
|
1097
|
3036
|
|
|
|
|
4039
|
my $mode = $self->{mode}; |
|
1098
|
3036
|
|
|
|
|
3828
|
my $handler = '__PPIX_TOKENIZER__' . $mode; |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
3036
|
|
|
|
|
4244
|
$self->{cursor_orig} = $self->{cursor_curr}; |
|
1101
|
3036
|
|
|
|
|
5333
|
foreach my $class ( $self->_known_tokenizers() ) { |
|
1102
|
13478
|
|
|
|
|
38113
|
my @tokens = grep { $_ } $class->$handler( $self, $character ); |
|
|
3867
|
|
|
|
|
7054
|
|
|
1103
|
|
|
|
|
|
|
$self->{trace} |
|
1104
|
13478
|
50
|
|
|
|
19899
|
and warn $class, "->$handler( \$self, '$character' )", |
|
1105
|
|
|
|
|
|
|
" => (@tokens)\n"; |
|
1106
|
|
|
|
|
|
|
@tokens |
|
1107
|
|
|
|
|
|
|
and return ( map { |
|
1108
|
13478
|
100
|
|
|
|
19996
|
ref $_ ? $_ : $self->make_token( $_, |
|
|
3033
|
100
|
|
|
|
8408
|
|
|
1109
|
|
|
|
|
|
|
$class ) } @tokens ); |
|
1110
|
|
|
|
|
|
|
} |
|
1111
|
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
# Find a fallback processor for the character. |
|
1113
|
27
|
|
33
|
|
|
204
|
my $fallback = __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__' . $mode ) |
|
1114
|
|
|
|
|
|
|
|| __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__regexp' ) |
|
1115
|
|
|
|
|
|
|
|| confess "Programming error - unable to find fallback for $mode"; |
|
1116
|
27
|
|
|
|
|
67
|
return $fallback->( $self, $character ); |
|
1117
|
|
|
|
|
|
|
} |
|
1118
|
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
*__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp; |
|
1120
|
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
sub __PPIX_TOKEN_FALLBACK__regexp { |
|
1122
|
18
|
|
|
18
|
|
39
|
my ( $self, $character ) = @_; |
|
1123
|
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
# As a fallback in regexp mode, any escaped character is a literal. |
|
1125
|
18
|
100
|
66
|
|
|
62
|
if ( $character eq '\\' |
|
1126
|
|
|
|
|
|
|
&& $self->{cursor_limit} - $self->{cursor_curr} > 1 |
|
1127
|
|
|
|
|
|
|
) { |
|
1128
|
2
|
|
|
|
|
6
|
return $self->make_token( 2, TOKEN_LITERAL ); |
|
1129
|
|
|
|
|
|
|
} |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
# Any normal character is unknown. |
|
1132
|
16
|
|
|
|
|
78
|
return $self->make_token( 1, TOKEN_UNKNOWN, { |
|
1133
|
|
|
|
|
|
|
error => 'Tokenizer found unexpected literal', |
|
1134
|
|
|
|
|
|
|
}, |
|
1135
|
|
|
|
|
|
|
); |
|
1136
|
|
|
|
|
|
|
} |
|
1137
|
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
sub __PPIX_TOKEN_FALLBACK__repl { |
|
1139
|
9
|
|
|
9
|
|
15
|
my ( $self, $character ) = @_; |
|
1140
|
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
# As a fallback in replacement mode, any escaped character is a literal. |
|
1142
|
9
|
100
|
66
|
|
|
28
|
if ( $character eq '\\' |
|
1143
|
|
|
|
|
|
|
&& defined ( my $next = $self->peek( 1 ) ) ) { |
|
1144
|
|
|
|
|
|
|
|
|
1145
|
5
|
0
|
33
|
|
|
12
|
if ( $self->interpolates() || $next eq q<'> || $next eq '\\' ) { |
|
|
|
|
33
|
|
|
|
|
|
1146
|
5
|
|
|
|
|
12
|
return $self->make_token( 2, TOKEN_LITERAL ); |
|
1147
|
|
|
|
|
|
|
} |
|
1148
|
0
|
|
|
|
|
0
|
return $self->make_token( 1, TOKEN_LITERAL ); |
|
1149
|
|
|
|
|
|
|
} |
|
1150
|
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
# So is any normal character. |
|
1152
|
4
|
|
|
|
|
9
|
return $self->make_token( 1, TOKEN_LITERAL ); |
|
1153
|
|
|
|
|
|
|
} |
|
1154
|
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
sub __PPIX_TOKENIZER__finish { |
|
1156
|
560
|
|
|
560
|
|
987
|
my ( $self ) = @_; # $character unused |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
$self->{cursor_limit} > length $self->{content} |
|
1159
|
560
|
50
|
|
|
|
1439
|
and confess "Programming error - ran off string"; |
|
1160
|
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
my @tokens = $self->make_token( length $self->{delimiter_finish}, |
|
1162
|
560
|
|
|
|
|
1510
|
'PPIx::Regexp::Token::Delimiter' ); |
|
1163
|
|
|
|
|
|
|
|
|
1164
|
560
|
100
|
|
|
|
1415
|
if ( $self->{cursor_curr} == $self->{cursor_modifiers} ) { |
|
1165
|
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
# We are out of string. Add the trailing tokens (created when we |
|
1167
|
|
|
|
|
|
|
# did the initial bracket scan) and close up shop. |
|
1168
|
|
|
|
|
|
|
|
|
1169
|
516
|
|
|
|
|
1416
|
push @tokens, $self->_get_trailing_tokens(); |
|
1170
|
|
|
|
|
|
|
|
|
1171
|
516
|
|
|
|
|
996
|
$self->_set_mode( 'kaput' ); |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
} else { |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
# Clear the cookies, because we are going around again. |
|
1176
|
44
|
|
|
|
|
167
|
$self->{cookie} = {}; |
|
1177
|
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
# Move the cursor limit to just before the modifiers. |
|
1179
|
44
|
|
|
|
|
117
|
$self->{cursor_limit} = $self->{cursor_modifiers} - 1; |
|
1180
|
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
# If the preceding regular expression was bracketed, we need to |
|
1182
|
|
|
|
|
|
|
# consume possible whitespace and find another delimiter. |
|
1183
|
|
|
|
|
|
|
|
|
1184
|
44
|
100
|
|
|
|
172
|
if ( $self->close_bracket( $self->{delimiter_start} ) ) { |
|
1185
|
7
|
|
|
|
|
11
|
my $accept; |
|
1186
|
|
|
|
|
|
|
# If we are bracketed, there can be honest-to-God Perl |
|
1187
|
|
|
|
|
|
|
# comments between the regexp and the replacement, not just |
|
1188
|
|
|
|
|
|
|
# regexp comments. As of version 1.220, PPI does not get |
|
1189
|
|
|
|
|
|
|
# this parse right, but if we can handle this is a string, |
|
1190
|
|
|
|
|
|
|
# then we will Just Work when PPI gets itself straight. |
|
1191
|
7
|
|
|
|
|
31
|
while ( $self->find_regexp( |
|
1192
|
|
|
|
|
|
|
qr{ \A ( \s* \n \s* ) ( \# [^\n]* \n ) }smx ) ) { |
|
1193
|
2
|
|
|
|
|
7
|
my ( $white_space, $comment ) = $self->capture(); |
|
1194
|
2
|
|
|
|
|
6
|
push @tokens, $self->make_token( |
|
1195
|
|
|
|
|
|
|
length $white_space, |
|
1196
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Whitespace', |
|
1197
|
|
|
|
|
|
|
), $self->make_token( |
|
1198
|
|
|
|
|
|
|
length $comment, |
|
1199
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Comment', |
|
1200
|
|
|
|
|
|
|
); |
|
1201
|
|
|
|
|
|
|
} |
|
1202
|
7
|
100
|
|
|
|
26
|
$accept = $self->find_regexp( qr{ \A \s+ }smx ) |
|
1203
|
|
|
|
|
|
|
and push @tokens, $self->make_token( |
|
1204
|
|
|
|
|
|
|
$accept, 'PPIx::Regexp::Token::Whitespace' ); |
|
1205
|
7
|
|
|
|
|
26
|
my $character = $self->peek(); |
|
1206
|
7
|
|
|
|
|
17
|
$self->{delimiter_start} = $character; |
|
1207
|
7
|
|
|
|
|
19
|
push @tokens, $self->make_token( |
|
1208
|
|
|
|
|
|
|
1, 'PPIx::Regexp::Token::Delimiter' ); |
|
1209
|
|
|
|
|
|
|
$self->{delimiter_finish} = substr |
|
1210
|
|
|
|
|
|
|
$self->{content}, |
|
1211
|
7
|
|
|
|
|
19
|
$self->{cursor_limit} - 1, |
|
1212
|
|
|
|
|
|
|
1; |
|
1213
|
|
|
|
|
|
|
} |
|
1214
|
|
|
|
|
|
|
|
|
1215
|
44
|
100
|
|
|
|
105
|
if ( $self->modifier( 'e*' ) ) { |
|
1216
|
|
|
|
|
|
|
# With /e or /ee, the replacement portion is code. We make |
|
1217
|
|
|
|
|
|
|
# it all into one big PPIx::Regexp::Token::Code, slap on the |
|
1218
|
|
|
|
|
|
|
# trailing delimiter and modifiers, and return it all. |
|
1219
|
|
|
|
|
|
|
push @tokens, $self->make_token( |
|
1220
|
|
|
|
|
|
|
$self->{cursor_limit} - $self->{cursor_curr}, |
|
1221
|
11
|
|
|
|
|
54
|
'PPIx::Regexp::Token::Code', |
|
1222
|
|
|
|
|
|
|
{ perl_version_introduced => MINIMUM_PERL }, |
|
1223
|
|
|
|
|
|
|
); |
|
1224
|
11
|
|
|
|
|
34
|
$self->{cursor_limit} = length $self->{content}; |
|
1225
|
11
|
|
|
|
|
32
|
push @tokens, $self->make_token( 1, |
|
1226
|
|
|
|
|
|
|
'PPIx::Regexp::Token::Delimiter' ), |
|
1227
|
|
|
|
|
|
|
$self->_get_trailing_tokens(); |
|
1228
|
11
|
|
|
|
|
29
|
$self->_set_mode( 'kaput' ); |
|
1229
|
|
|
|
|
|
|
} else { |
|
1230
|
|
|
|
|
|
|
# Put our mode to replacement. |
|
1231
|
33
|
|
|
|
|
70
|
$self->_set_mode( 'repl' ); |
|
1232
|
|
|
|
|
|
|
} |
|
1233
|
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
} |
|
1235
|
|
|
|
|
|
|
|
|
1236
|
560
|
|
|
|
|
1507
|
return @tokens; |
|
1237
|
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
} |
|
1239
|
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
# To common processing on trailing tokens. |
|
1241
|
|
|
|
|
|
|
sub _get_trailing_tokens { |
|
1242
|
527
|
|
|
527
|
|
874
|
my ( $self ) = @_; |
|
1243
|
527
|
100
|
|
|
|
1074
|
if ( $self->{index_locations} ) { |
|
1244
|
|
|
|
|
|
|
# We turned off index_locations when these were created, because |
|
1245
|
|
|
|
|
|
|
# they were done out of order. Fix that now. |
|
1246
|
11
|
|
|
|
|
16
|
foreach my $token ( @{ $self->{trailing_tokens} } ) { |
|
|
11
|
|
|
|
|
21
|
|
|
1247
|
11
|
|
|
|
|
24
|
$self->_update_location( $token ); |
|
1248
|
|
|
|
|
|
|
} |
|
1249
|
|
|
|
|
|
|
} |
|
1250
|
527
|
|
|
|
|
648
|
return @{ delete $self->{trailing_tokens} }; |
|
|
527
|
|
|
|
|
1341
|
|
|
1251
|
|
|
|
|
|
|
} |
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
1; |
|
1254
|
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
__END__ |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
=head1 NAME |
|
1258
|
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
PPIx::Regexp::Tokenizer - Tokenize a regular expression |
|
1260
|
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
1262
|
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
use PPIx::Regexp::Dumper; |
|
1264
|
|
|
|
|
|
|
PPIx::Regexp::Dumper->new( 'qr{foo}smx' ) |
|
1265
|
|
|
|
|
|
|
->print(); |
|
1266
|
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
=head1 INHERITANCE |
|
1268
|
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
C<PPIx::Regexp::Tokenizer> is a |
|
1270
|
|
|
|
|
|
|
L<PPIx::Regexp::Support|PPIx::Regexp::Support>. |
|
1271
|
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
C<PPIx::Regexp::Tokenizer> has no descendants. |
|
1273
|
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
1275
|
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
This class provides tokenization of the regular expression. |
|
1277
|
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
=head1 METHODS |
|
1279
|
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
This class provides the following public methods. Methods not documented |
|
1281
|
|
|
|
|
|
|
here (or documented below under L</EXTERNAL TOKENIZERS>) are private, |
|
1282
|
|
|
|
|
|
|
and unsupported in the sense that the author reserves the right to |
|
1283
|
|
|
|
|
|
|
change or remove them without notice. |
|
1284
|
|
|
|
|
|
|
|
|
1285
|
|
|
|
|
|
|
=head2 new |
|
1286
|
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
my $tokenizer = PPIx::Regexp::Tokenizer->new( 'xyzzy' ); |
|
1288
|
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
This static method instantiates the tokenizer. You must pass it the |
|
1290
|
|
|
|
|
|
|
regular expression to be parsed, either as a string or as a |
|
1291
|
|
|
|
|
|
|
L<PPI::Element|PPI::Element> of some sort. You can also pass optional |
|
1292
|
|
|
|
|
|
|
name/value pairs of arguments. The option names are specified B<without> |
|
1293
|
|
|
|
|
|
|
a leading dash. Supported options are: |
|
1294
|
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
=over |
|
1296
|
|
|
|
|
|
|
|
|
1297
|
|
|
|
|
|
|
=item default_modifiers array_reference |
|
1298
|
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
This argument specifies default statement modifiers. It is optional, but |
|
1300
|
|
|
|
|
|
|
if specified must be an array reference. See the |
|
1301
|
|
|
|
|
|
|
L<PPIx::Regexp|PPIx::Regexp> L<new()|PPIx::Regexp/new> documentation for |
|
1302
|
|
|
|
|
|
|
the details. |
|
1303
|
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
=item encoding name |
|
1305
|
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
This option specifies the encoding of the string to be tokenized. If |
|
1307
|
|
|
|
|
|
|
specified, an C<Encode::decode> is done on the string (or the C<content> |
|
1308
|
|
|
|
|
|
|
of the PPI class) before it is tokenized. |
|
1309
|
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
=item index_locations |
|
1311
|
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
This Boolean option specifies that the locations of the generated tokens |
|
1313
|
|
|
|
|
|
|
are to be computed. |
|
1314
|
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
=item strict boolean |
|
1316
|
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
This option specifies whether tokenization should assume |
|
1318
|
|
|
|
|
|
|
C<use re 'strict';> is in effect. |
|
1319
|
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
The C<'strict'> pragma was introduced in Perl 5.22, and its |
|
1321
|
|
|
|
|
|
|
documentation says that it is experimental, and that there is no |
|
1322
|
|
|
|
|
|
|
commitment to backward compatibility. The same applies to the |
|
1323
|
|
|
|
|
|
|
tokenization produced when this option is asserted. |
|
1324
|
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
=item trace number |
|
1326
|
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
Specifying a positive value for this option causes a trace of the |
|
1328
|
|
|
|
|
|
|
tokenization. This option is unsupported in the sense that the author |
|
1329
|
|
|
|
|
|
|
reserves the right to alter it without notice. |
|
1330
|
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
If this option is unspecified, the value comes from environment variable |
|
1332
|
|
|
|
|
|
|
C<PPIX_REGEXP_TOKENIZER_TRACE> (see L</ENVIRONMENT VARIABLES>). If this |
|
1333
|
|
|
|
|
|
|
environment variable does not exist, the default is 0. |
|
1334
|
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
=back |
|
1336
|
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
Undocumented options are unsupported. |
|
1338
|
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
The returned value is the instantiated tokenizer, or C<undef> if |
|
1340
|
|
|
|
|
|
|
instantiation failed. In the latter case a call to L</errstr> will |
|
1341
|
|
|
|
|
|
|
return the reason. |
|
1342
|
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
=head2 content |
|
1344
|
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
print $tokenizer->content(); |
|
1346
|
|
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
|
This method returns the string being tokenized. This will be the result |
|
1348
|
|
|
|
|
|
|
of the L<< PPI::Element->content()|PPI::Element/content >> method if the |
|
1349
|
|
|
|
|
|
|
object was instantiated with a L<PPI::Element|PPI::Element>. |
|
1350
|
|
|
|
|
|
|
|
|
1351
|
|
|
|
|
|
|
=head2 default_modifiers |
|
1352
|
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
print join ', ', @{ $tokenizer->default_modifiers() }; |
|
1354
|
|
|
|
|
|
|
|
|
1355
|
|
|
|
|
|
|
This method returns a reference to a copy of the array passed to the |
|
1356
|
|
|
|
|
|
|
C<default_modifiers> argument to L<new()|/new>. If this argument was not |
|
1357
|
|
|
|
|
|
|
used to instantiate the object, the return is a reference to an empty |
|
1358
|
|
|
|
|
|
|
array. |
|
1359
|
|
|
|
|
|
|
|
|
1360
|
|
|
|
|
|
|
=head2 encoding |
|
1361
|
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
This method returns the encoding of the data being parsed, if one was |
|
1363
|
|
|
|
|
|
|
set when the class was instantiated; otherwise it simply returns undef. |
|
1364
|
|
|
|
|
|
|
|
|
1365
|
|
|
|
|
|
|
=head2 errstr |
|
1366
|
|
|
|
|
|
|
|
|
1367
|
|
|
|
|
|
|
my $tokenizer = PPIx::Regexp::Tokenizer->new( 'xyzzy' ) |
|
1368
|
|
|
|
|
|
|
or die PPIx::Regexp::Tokenizer->errstr(); |
|
1369
|
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
This static method returns an error description if tokenizer |
|
1371
|
|
|
|
|
|
|
instantiation failed. |
|
1372
|
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
=head2 failures |
|
1374
|
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
print $tokenizer->failures(), " tokenization failures\n"; |
|
1376
|
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
This method returns the number of tokenization failures encountered. A |
|
1378
|
|
|
|
|
|
|
tokenization failure is represented in the output token stream by a |
|
1379
|
|
|
|
|
|
|
L<PPIx::Regexp::Token::Unknown|PPIx::Regexp::Token::Unknown>. |
|
1380
|
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
=head2 modifier |
|
1382
|
|
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
$tokenizer->modifier( 'x' ) |
|
1384
|
|
|
|
|
|
|
and print "Tokenizing an extended regular expression\n"; |
|
1385
|
|
|
|
|
|
|
|
|
1386
|
|
|
|
|
|
|
This method returns true if the given modifier character was found on |
|
1387
|
|
|
|
|
|
|
the end of the regular expression, and false otherwise. |
|
1388
|
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
Starting with version 0.036_01, if the argument is a |
|
1390
|
|
|
|
|
|
|
single-character modifier followed by an asterisk (intended as a wild |
|
1391
|
|
|
|
|
|
|
card character), the return is the number of times that modifier |
|
1392
|
|
|
|
|
|
|
appears. In this case an exception will be thrown if you specify a |
|
1393
|
|
|
|
|
|
|
multi-character modifier (e.g. C<'ee*'>), or if you specify one of the |
|
1394
|
|
|
|
|
|
|
match semantics modifiers (e.g. C<'a*'>). |
|
1395
|
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
If called by an external tokenizer, this method returns true if if the |
|
1397
|
|
|
|
|
|
|
given modifier was true at the current point in the tokenization. |
|
1398
|
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
=head2 next_token |
|
1400
|
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
my $token = $tokenizer->next_token(); |
|
1402
|
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
This method returns the next token in the token stream, or nothing if |
|
1404
|
|
|
|
|
|
|
there are no more tokens. |
|
1405
|
|
|
|
|
|
|
|
|
1406
|
|
|
|
|
|
|
=head2 significant |
|
1407
|
|
|
|
|
|
|
|
|
1408
|
|
|
|
|
|
|
This method exists simply for the convenience of |
|
1409
|
|
|
|
|
|
|
L<PPIx::Regexp::Dumper|PPIx::Regexp::Dumper>. It always returns true. |
|
1410
|
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
=head2 tokens |
|
1412
|
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
my @tokens = $tokenizer->tokens(); |
|
1414
|
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
This method returns all remaining tokens in the token stream. |
|
1416
|
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
=head1 EXTERNAL TOKENIZERS |
|
1418
|
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
This class does very little of its own tokenization. Instead the token |
|
1420
|
|
|
|
|
|
|
classes contain external tokenization routines, whose name is |
|
1421
|
|
|
|
|
|
|
'__PPIX_TOKENIZER__' concatenated with the current mode of the tokenizer |
|
1422
|
|
|
|
|
|
|
('regexp' for regular expressions, 'repl' for the replacement string). |
|
1423
|
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
These external tokenizers are called as static methods, and passed the |
|
1425
|
|
|
|
|
|
|
C<PPIx::Regexp::Tokenizer> object and the current character in the |
|
1426
|
|
|
|
|
|
|
character stream. |
|
1427
|
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
If the external tokenizer wants to make one or more tokens, it returns |
|
1429
|
|
|
|
|
|
|
an array containing either length in characters for tokens of the |
|
1430
|
|
|
|
|
|
|
tokenizer's own class, or the results of one or more L</make_token> |
|
1431
|
|
|
|
|
|
|
calls for tokens of an arbitrary class. |
|
1432
|
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
If the external tokenizer is not interested in the characters starting |
|
1434
|
|
|
|
|
|
|
at the current position it simply returns. |
|
1435
|
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
The following methods are for the use of external tokenizers, and B<are |
|
1437
|
|
|
|
|
|
|
not part of the public interface to this class.> |
|
1438
|
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
=head2 capture |
|
1440
|
|
|
|
|
|
|
|
|
1441
|
|
|
|
|
|
|
if ( $tokenizer->find_regexp( qr{ \A ( foo ) }smx ) ) { |
|
1442
|
|
|
|
|
|
|
foreach ( $tokenizer->capture() ) { |
|
1443
|
|
|
|
|
|
|
print "$_\n"; |
|
1444
|
|
|
|
|
|
|
} |
|
1445
|
|
|
|
|
|
|
} |
|
1446
|
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
This method returns all the contents of any capture buffers from the |
|
1448
|
|
|
|
|
|
|
previous call to L</find_regexp>. The first element of the array (i.e. |
|
1449
|
|
|
|
|
|
|
element 0) corresponds to C<$1>, and so on. |
|
1450
|
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
The captures are cleared by L</make_token>, as well as by another call |
|
1452
|
|
|
|
|
|
|
to L</find_regexp>. |
|
1453
|
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
=head2 cookie |
|
1455
|
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
$tokenizer->cookie( foo => sub { 1 } ); |
|
1457
|
|
|
|
|
|
|
my $cookie = $tokenizer->cookie( 'foo' ); |
|
1458
|
|
|
|
|
|
|
my $old_hint = $tokenizer->cookie( foo => undef ); |
|
1459
|
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
This method either creates, deletes, or accesses a cookie. |
|
1461
|
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
A cookie is a code reference which is called whenever the tokenizer makes |
|
1463
|
|
|
|
|
|
|
a token. If it returns a false value, it is deleted. Explicitly setting |
|
1464
|
|
|
|
|
|
|
the cookie to C<undef> also deletes it. |
|
1465
|
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
When you call C<< $tokenizer->cookie( 'foo' ) >>, the current cookie is |
|
1467
|
|
|
|
|
|
|
returned. If you pass a new value of C<undef> to delete the token, the |
|
1468
|
|
|
|
|
|
|
deleted cookie (if any) is returned. |
|
1469
|
|
|
|
|
|
|
|
|
1470
|
|
|
|
|
|
|
When the L</make_token> method calls a cookie, it passes it the tokenizer |
|
1471
|
|
|
|
|
|
|
and the token just made. If a token calls a cookie, it is recommended that |
|
1472
|
|
|
|
|
|
|
it merely pass the tokenizer, though of course the token can do whatever |
|
1473
|
|
|
|
|
|
|
it wants. |
|
1474
|
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
The cookie mechanism seems to be a bit of a crock, but it appeared to be |
|
1476
|
|
|
|
|
|
|
more work to fix things up in the lexer after the tokenizer got |
|
1477
|
|
|
|
|
|
|
something wrong. |
|
1478
|
|
|
|
|
|
|
|
|
1479
|
|
|
|
|
|
|
The recommended way to write a cookie is to use a closure to store any |
|
1480
|
|
|
|
|
|
|
necessary data, and have a call to the cookie return the data; otherwise |
|
1481
|
|
|
|
|
|
|
the ultimate consumer of the cookie has no way to access the data. Of |
|
1482
|
|
|
|
|
|
|
course, it may be that the presence of the cookie at a certain point in |
|
1483
|
|
|
|
|
|
|
the parse is all that is required. |
|
1484
|
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
=head2 expect |
|
1486
|
|
|
|
|
|
|
|
|
1487
|
|
|
|
|
|
|
$tokenizer->expect( 'PPIx::Regexp::Token::Code' ); |
|
1488
|
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
This method inserts a given class at the head of the token scan, for the |
|
1490
|
|
|
|
|
|
|
next iteration only. More than one class can be specified. Class names |
|
1491
|
|
|
|
|
|
|
can be abbreviated by removing the leading 'PPIx::Regexp::'. |
|
1492
|
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
If no class is specified, this method does nothing. |
|
1494
|
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
The expectation lasts from the next time L</get_token> is called until |
|
1496
|
|
|
|
|
|
|
the next time L</make_token> makes a significant token, or until the |
|
1497
|
|
|
|
|
|
|
next C<expect> call if that is done sooner. |
|
1498
|
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
=head2 find_regexp |
|
1500
|
|
|
|
|
|
|
|
|
1501
|
|
|
|
|
|
|
my $end = $tokenizer->find_regexp( qr{ \A \w+ }smx ); |
|
1502
|
|
|
|
|
|
|
my ( $begin, $end ) = $tokenizer->find_regexp( |
|
1503
|
|
|
|
|
|
|
qr{ \A \w+ }smx ); |
|
1504
|
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
This method finds the given regular expression in the content, starting |
|
1506
|
|
|
|
|
|
|
at the current position. If called in scalar context, the offset from |
|
1507
|
|
|
|
|
|
|
the current position to the end of the matched string is returned. If |
|
1508
|
|
|
|
|
|
|
called in list context, the offsets to both the beginning and the end of |
|
1509
|
|
|
|
|
|
|
the matched string are returned. |
|
1510
|
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
=head2 find_matching_delimiter |
|
1512
|
|
|
|
|
|
|
|
|
1513
|
|
|
|
|
|
|
my $offset = $tokenizer->find_matching_delimiter(); |
|
1514
|
|
|
|
|
|
|
|
|
1515
|
|
|
|
|
|
|
This method is used by tokenizers to find the delimiter matching the |
|
1516
|
|
|
|
|
|
|
character at the current position in the content string. If the |
|
1517
|
|
|
|
|
|
|
delimiter is an opening bracket of some sort, bracket nesting will be |
|
1518
|
|
|
|
|
|
|
taken into account. |
|
1519
|
|
|
|
|
|
|
|
|
1520
|
|
|
|
|
|
|
When searching for the matching delimiter, the back slash character is |
|
1521
|
|
|
|
|
|
|
considered to escape the following character, so back-slashed delimiters |
|
1522
|
|
|
|
|
|
|
will be ignored. No other quoting mechanisms are recognized, though, so |
|
1523
|
|
|
|
|
|
|
delimiters inside quotes still count. This is actually the way Perl |
|
1524
|
|
|
|
|
|
|
works, as |
|
1525
|
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
$ perl -e 'qr<(?{ print "}" })>' |
|
1527
|
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
demonstrates. |
|
1529
|
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
This method returns the offset from the current position in the content |
|
1531
|
|
|
|
|
|
|
string to the matching delimiter (which will always be positive), or |
|
1532
|
|
|
|
|
|
|
undef if no match can be found. |
|
1533
|
|
|
|
|
|
|
|
|
1534
|
|
|
|
|
|
|
=head2 get_mode |
|
1535
|
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
This method returns the name of the current mode of the tokenizer. |
|
1537
|
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
=head2 get_start_delimiter |
|
1539
|
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
my $start_delimiter = $tokenizer->get_start_delimiter(); |
|
1541
|
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
This method is used by tokenizers to access the start delimiter for the |
|
1543
|
|
|
|
|
|
|
regular expression. |
|
1544
|
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
=head2 get_token |
|
1546
|
|
|
|
|
|
|
|
|
1547
|
|
|
|
|
|
|
my $token = $tokenizer->make_token( 3 ); |
|
1548
|
|
|
|
|
|
|
my @tokens = $tokenizer->get_token(); |
|
1549
|
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
This method returns the next token that can be made from the input |
|
1551
|
|
|
|
|
|
|
stream. It is B<not> part of the external interface, but is intended for |
|
1552
|
|
|
|
|
|
|
the use of an external tokenizer which calls it after making and |
|
1553
|
|
|
|
|
|
|
retaining its own token to look at the next token ( if any ) in the |
|
1554
|
|
|
|
|
|
|
input stream. |
|
1555
|
|
|
|
|
|
|
|
|
1556
|
|
|
|
|
|
|
If any external tokenizer calls get_token without first calling |
|
1557
|
|
|
|
|
|
|
make_token, a fatal error occurs; this is better than the infinite |
|
1558
|
|
|
|
|
|
|
recursion which would occur if the condition were not trapped. |
|
1559
|
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
An external tokenizer B<must> return anything returned by get_token; |
|
1561
|
|
|
|
|
|
|
otherwise tokens get lost. |
|
1562
|
|
|
|
|
|
|
|
|
1563
|
|
|
|
|
|
|
=head2 interpolates |
|
1564
|
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
This method returns true if the top-level structure being tokenized |
|
1566
|
|
|
|
|
|
|
interpolates; that is, if the delimiter is not a single quote. |
|
1567
|
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
=head2 make_token |
|
1569
|
|
|
|
|
|
|
|
|
1570
|
|
|
|
|
|
|
return $tokenizer->make_token( 3, 'PPIx::Regexp::Token::Unknown' ); |
|
1571
|
|
|
|
|
|
|
|
|
1572
|
|
|
|
|
|
|
This method is used by this class (and possibly by individual |
|
1573
|
|
|
|
|
|
|
tokenizers) to manufacture a token. Its arguments are the number of |
|
1574
|
|
|
|
|
|
|
characters to include in the token, and optionally the class of the |
|
1575
|
|
|
|
|
|
|
token. If no class name is given, the caller's class is used. Class |
|
1576
|
|
|
|
|
|
|
names may be shortened by removing the initial 'PPIx::Regexp::', which |
|
1577
|
|
|
|
|
|
|
will be restored by this method. |
|
1578
|
|
|
|
|
|
|
|
|
1579
|
|
|
|
|
|
|
The token will be manufactured from the given number of characters |
|
1580
|
|
|
|
|
|
|
starting at the current cursor position, which will be adjusted. |
|
1581
|
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
If the given length would include characters past the end of the string |
|
1583
|
|
|
|
|
|
|
being tokenized, the length is reduced appropriately. If this means a |
|
1584
|
|
|
|
|
|
|
token with no characters, nothing is returned. |
|
1585
|
|
|
|
|
|
|
|
|
1586
|
|
|
|
|
|
|
=head2 match |
|
1587
|
|
|
|
|
|
|
|
|
1588
|
|
|
|
|
|
|
if ( $tokenizer->find_regexp( qr{ \A \w+ }smx ) ) { |
|
1589
|
|
|
|
|
|
|
print $tokenizer->match(), "\n"; |
|
1590
|
|
|
|
|
|
|
} |
|
1591
|
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
This method returns the string matched by the previous call to |
|
1593
|
|
|
|
|
|
|
L</find_regexp>. |
|
1594
|
|
|
|
|
|
|
|
|
1595
|
|
|
|
|
|
|
The match is set to C<undef> by L</make_token>, as well as by another |
|
1596
|
|
|
|
|
|
|
call to L</find_regexp>. |
|
1597
|
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
=head2 modifier_duplicate |
|
1599
|
|
|
|
|
|
|
|
|
1600
|
|
|
|
|
|
|
$tokenizer->modifier_duplicate(); |
|
1601
|
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
This method duplicates the modifiers on the top of the modifier stack, |
|
1603
|
|
|
|
|
|
|
with the intent of creating a locally-scoped copy of the modifiers. This |
|
1604
|
|
|
|
|
|
|
should only be called by an external tokenizer that is actually creating |
|
1605
|
|
|
|
|
|
|
a modifier scope. In other words, only when creating a |
|
1606
|
|
|
|
|
|
|
L<PPIx::Regexp::Token::Structure|PPIx::Regexp::Token::Structure> token |
|
1607
|
|
|
|
|
|
|
whose content is '('. |
|
1608
|
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
=head2 modifier_modify |
|
1610
|
|
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
|
$tokenizer->modifier_modify( name => $value ... ); |
|
1612
|
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
This method sets new values for the modifiers in the local scope. Only |
|
1614
|
|
|
|
|
|
|
the modifiers whose names are actually passed have their values changed. |
|
1615
|
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
This method is intended to be called after manufacturing a |
|
1617
|
|
|
|
|
|
|
L<PPIx::Regexp::Token::Modifier|PPIx::Regexp::Token::Modifier> token, |
|
1618
|
|
|
|
|
|
|
and passed the results of its C<modifiers> method. |
|
1619
|
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
=head2 modifier_pop |
|
1621
|
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
$tokenizer->modifier_pop(); |
|
1623
|
|
|
|
|
|
|
|
|
1624
|
|
|
|
|
|
|
This method removes the modifiers on the top of the modifier stack. This |
|
1625
|
|
|
|
|
|
|
should only be called by an external tokenizer that is ending a modifier |
|
1626
|
|
|
|
|
|
|
scope. In other words, only when creating a |
|
1627
|
|
|
|
|
|
|
L<PPIx::Regexp::Token::Structure|PPIx::Regexp::Token::Structure> token |
|
1628
|
|
|
|
|
|
|
whose content is ')'. |
|
1629
|
|
|
|
|
|
|
|
|
1630
|
|
|
|
|
|
|
Note that this method will never pop the last modifier item off the |
|
1631
|
|
|
|
|
|
|
stack, to guard against unmatched right parentheses. |
|
1632
|
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
=head2 modifier_seen |
|
1634
|
|
|
|
|
|
|
|
|
1635
|
|
|
|
|
|
|
$tokenizer->modifier_seen( 'i' ) |
|
1636
|
|
|
|
|
|
|
and print "/i was seen at some point.\n"; |
|
1637
|
|
|
|
|
|
|
|
|
1638
|
|
|
|
|
|
|
Unlike L<modifier()|/modifier>, this method returns a true value if the |
|
1639
|
|
|
|
|
|
|
given modifier has been seen in any scope visible from the current |
|
1640
|
|
|
|
|
|
|
location in the parse. There is no magic for group match semantics ( |
|
1641
|
|
|
|
|
|
|
/a, /aa, /d, /l, /u) or modifiers that can be repeated, like /x and /xx, |
|
1642
|
|
|
|
|
|
|
or /e and /ee. |
|
1643
|
|
|
|
|
|
|
|
|
1644
|
|
|
|
|
|
|
=head2 peek |
|
1645
|
|
|
|
|
|
|
|
|
1646
|
|
|
|
|
|
|
my $character = $tokenizer->peek(); |
|
1647
|
|
|
|
|
|
|
my $next_char = $tokenizer->peek( 1 ); |
|
1648
|
|
|
|
|
|
|
|
|
1649
|
|
|
|
|
|
|
This method returns the character at the given non-negative offset from |
|
1650
|
|
|
|
|
|
|
the current position. If no offset is given, an offset of 0 is used. |
|
1651
|
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
If you ask for a negative offset or an offset off the end of the sting, |
|
1653
|
|
|
|
|
|
|
C<undef> is returned. |
|
1654
|
|
|
|
|
|
|
|
|
1655
|
|
|
|
|
|
|
=head2 ppi_document |
|
1656
|
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
This method makes a PPI document out of the remainder of the string, and |
|
1658
|
|
|
|
|
|
|
returns it. |
|
1659
|
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
=head2 prior_significant_token |
|
1661
|
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
$tokenizer->prior_significant_token( 'can_be_quantified' ) |
|
1663
|
|
|
|
|
|
|
and print "The prior token can be quantified.\n"; |
|
1664
|
|
|
|
|
|
|
|
|
1665
|
|
|
|
|
|
|
This method calls the named method on the most-recently-instantiated |
|
1666
|
|
|
|
|
|
|
significant token, and returns the result. Any arguments subsequent to |
|
1667
|
|
|
|
|
|
|
the method name will be passed to the method. |
|
1668
|
|
|
|
|
|
|
|
|
1669
|
|
|
|
|
|
|
Because this method is designed to be used within the tokenizing system, |
|
1670
|
|
|
|
|
|
|
it will die horribly if the named method does not exist. |
|
1671
|
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
If called with no arguments at all the most-recently-instantiated |
|
1673
|
|
|
|
|
|
|
significant token is returned. |
|
1674
|
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
=head2 strict |
|
1676
|
|
|
|
|
|
|
|
|
1677
|
|
|
|
|
|
|
say 'Parse is ', $tokenizer->strict() ? 'strict' : 'lenient'; |
|
1678
|
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
This method simply returns true or false, depending on whether the |
|
1680
|
|
|
|
|
|
|
C<'strict'> option to C<new()> was true or false. |
|
1681
|
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
=head1 ENVIRONMENT VARIABLES |
|
1683
|
|
|
|
|
|
|
|
|
1684
|
|
|
|
|
|
|
A tokenizer trace can be requested by setting environment variable |
|
1685
|
|
|
|
|
|
|
PPIX_REGEXP_TOKENIZER_TRACE to a numeric value other than 0. Use of this |
|
1686
|
|
|
|
|
|
|
environment variable is unsupported in the same sense that the C<trace> |
|
1687
|
|
|
|
|
|
|
option of L</new> is unsupported. Explicitly specifying the C<trace> |
|
1688
|
|
|
|
|
|
|
option to L</new> overrides the environment variable. |
|
1689
|
|
|
|
|
|
|
|
|
1690
|
|
|
|
|
|
|
The real reason this is documented is to give the user a way to |
|
1691
|
|
|
|
|
|
|
troubleshoot funny output from the tokenizer. |
|
1692
|
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
=head1 SUPPORT |
|
1694
|
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
Support is by the author. Please file bug reports at |
|
1696
|
|
|
|
|
|
|
L<https://rt.cpan.org/Public/Dist/Display.html?Name=PPIx-Regexp>, |
|
1697
|
|
|
|
|
|
|
L<https://github.com/trwyant/perl-PPIx-Regexp/issues>, or in |
|
1698
|
|
|
|
|
|
|
electronic mail to the author. |
|
1699
|
|
|
|
|
|
|
|
|
1700
|
|
|
|
|
|
|
=head1 AUTHOR |
|
1701
|
|
|
|
|
|
|
|
|
1702
|
|
|
|
|
|
|
Thomas R. Wyant, III F<wyant at cpan dot org> |
|
1703
|
|
|
|
|
|
|
|
|
1704
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
1705
|
|
|
|
|
|
|
|
|
1706
|
|
|
|
|
|
|
Copyright (C) 2009-2023, 2025 by Thomas R. Wyant, III |
|
1707
|
|
|
|
|
|
|
|
|
1708
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
|
1709
|
|
|
|
|
|
|
under the same terms as Perl 5.10.0. For more details, see the full text |
|
1710
|
|
|
|
|
|
|
of the licenses in the directory LICENSES. |
|
1711
|
|
|
|
|
|
|
|
|
1712
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but |
|
1713
|
|
|
|
|
|
|
without any warranty; without even the implied warranty of |
|
1714
|
|
|
|
|
|
|
merchantability or fitness for a particular purpose. |
|
1715
|
|
|
|
|
|
|
|
|
1716
|
|
|
|
|
|
|
=cut |
|
1717
|
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
# ex: set textwidth=72 : |