line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package PPI::Tokenizer; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=pod |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
PPI::Tokenizer - The Perl Document Tokenizer |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Create a tokenizer for a file, array or string |
12
|
|
|
|
|
|
|
$Tokenizer = PPI::Tokenizer->new( 'filename.pl' ); |
13
|
|
|
|
|
|
|
$Tokenizer = PPI::Tokenizer->new( \@lines ); |
14
|
|
|
|
|
|
|
$Tokenizer = PPI::Tokenizer->new( \$source ); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# Return all the tokens for the document |
17
|
|
|
|
|
|
|
my $tokens = $Tokenizer->all_tokens; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Or we can use it as an iterator |
20
|
|
|
|
|
|
|
while ( my $Token = $Tokenizer->get_token ) { |
21
|
|
|
|
|
|
|
print "Found token '$Token'\n"; |
22
|
|
|
|
|
|
|
} |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# If we REALLY need to manually nudge the cursor, you |
25
|
|
|
|
|
|
|
# can do that too (The lexer needs this ability to do rollbacks) |
26
|
|
|
|
|
|
|
$is_incremented = $Tokenizer->increment_cursor; |
27
|
|
|
|
|
|
|
$is_decremented = $Tokenizer->decrement_cursor; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 DESCRIPTION |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
PPI::Tokenizer is the class that provides Tokenizer objects for use in |
32
|
|
|
|
|
|
|
breaking strings of Perl source code into Tokens. |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
By the time you are reading this, you probably need to know a little |
35
|
|
|
|
|
|
|
about the difference between how perl parses Perl "code" and how PPI |
36
|
|
|
|
|
|
|
parses Perl "documents". |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
"perl" itself (the interpreter) uses a heavily modified lex specification |
39
|
|
|
|
|
|
|
to specify its parsing logic, maintains several types of state as it |
40
|
|
|
|
|
|
|
goes, and incrementally tokenizes, lexes AND EXECUTES at the same time. |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
In fact, it is provably impossible to use perl's parsing method without |
43
|
|
|
|
|
|
|
simultaneously executing code. A formal mathematical proof has been |
44
|
|
|
|
|
|
|
published demonstrating the method. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
This is where the truism "Only perl can parse Perl" comes from. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
PPI uses a completely different approach by abandoning the (impossible) |
49
|
|
|
|
|
|
|
ability to parse Perl the same way that the interpreter does, and instead |
50
|
|
|
|
|
|
|
parsing the source as a document, using a document structure independently |
51
|
|
|
|
|
|
|
derived from the Perl documentation and approximating the perl interpreter |
52
|
|
|
|
|
|
|
interpretation as closely as possible. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
It was touch and go for a long time whether we could get it close enough, |
55
|
|
|
|
|
|
|
but in the end it turned out that it could be done. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
In this approach, the tokenizer C<PPI::Tokenizer> is implemented separately |
58
|
|
|
|
|
|
|
from the lexer L<PPI::Lexer>. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
The job of C<PPI::Tokenizer> is to take pure source as a string and break it |
61
|
|
|
|
|
|
|
up into a stream/set of tokens, and contains most of the "black magic" used |
62
|
|
|
|
|
|
|
in PPI. By comparison, the lexer implements a relatively straight forward |
63
|
|
|
|
|
|
|
tree structure, and has an implementation that is uncomplicated (compared |
64
|
|
|
|
|
|
|
to the insanity in the tokenizer at least). |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
The Tokenizer uses an immense amount of heuristics, guessing and cruft, |
67
|
|
|
|
|
|
|
supported by a very B flexible internal API, but fortunately it was |
68
|
|
|
|
|
|
|
possible to largely encapsulate the black magic, so there is not a lot that |
69
|
|
|
|
|
|
|
gets exposed to people using the C<PPI::Tokenizer> itself. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head1 METHODS |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Despite the incredible complexity, the Tokenizer itself only exposes a |
74
|
|
|
|
|
|
|
relatively small number of methods, with most of the complexity implemented |
75
|
|
|
|
|
|
|
in private methods. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=cut |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# Make sure everything we need is loaded so |
80
|
|
|
|
|
|
|
# we don't have to go and load all of PPI. |
81
|
64
|
|
|
64
|
|
375
|
use strict; |
|
64
|
|
|
|
|
119
|
|
|
64
|
|
|
|
|
1820
|
|
82
|
64
|
|
|
64
|
|
290
|
use Params::Util qw{_INSTANCE _SCALAR0 _ARRAY0}; |
|
64
|
|
|
|
|
128
|
|
|
64
|
|
|
|
|
2852
|
|
83
|
64
|
|
|
64
|
|
328
|
use List::Util 1.33 (); |
|
64
|
|
|
|
|
958
|
|
|
64
|
|
|
|
|
1013
|
|
84
|
64
|
|
|
64
|
|
283
|
use PPI::Util (); |
|
64
|
|
|
|
|
112
|
|
|
64
|
|
|
|
|
997
|
|
85
|
64
|
|
|
64
|
|
298
|
use PPI::Element (); |
|
64
|
|
|
|
|
135
|
|
|
64
|
|
|
|
|
1033
|
|
86
|
64
|
|
|
64
|
|
300
|
use PPI::Token (); |
|
64
|
|
|
|
|
142
|
|
|
64
|
|
|
|
|
1214
|
|
87
|
64
|
|
|
64
|
|
310
|
use PPI::Exception (); |
|
64
|
|
|
|
|
125
|
|
|
64
|
|
|
|
|
1042
|
|
88
|
64
|
|
|
64
|
|
21973
|
use PPI::Exception::ParserRejection (); |
|
64
|
|
|
|
|
162
|
|
|
64
|
|
|
|
|
154239
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
our $VERSION = '1.276';

# The x operator cannot follow most Perl operators, implying that
# anything beginning with x following an operator is a word.
# These are the exceptions.
my %X_CAN_FOLLOW_OPERATOR = map { $_ => 1 } qw( -- ++ );

# The x operator cannot follow most structure elements, implying that
# anything beginning with x following a structure element is a word.
# These are the exceptions.
my %X_CAN_FOLLOW_STRUCTURE = map { $_ => 1 } qw( } ] \) );

# Something that looks like the x operator but follows a word
# is usually that word's argument.
# These are the exceptions.
# chop, chomp, dump are ambiguous because they can have either parms
# or no parms.
my %X_CAN_FOLLOW_WORD = map { $_ => 1 } qw(
	endgrent endhostent endnetent endprotoent endpwent endservent
	fork getgrent gethostent getlogin getnetent getppid getprotoent
	getpwent getservent setgrent setpwent
	time times wait wantarray __SUB__
);
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
##################################################################### |
135
|
|
|
|
|
|
|
# Creation and Initialization |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=pod |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=head2 new $file | \@lines | \$source |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
The main C<new> constructor creates a new Tokenizer object. These |
142
|
|
|
|
|
|
|
objects have no configuration parameters, and can only be used once, |
143
|
|
|
|
|
|
|
to tokenize a single perl source file. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
It takes as argument either a normal scalar containing source code, |
146
|
|
|
|
|
|
|
a reference to a scalar containing source code, or a reference to an |
147
|
|
|
|
|
|
|
ARRAY containing newline-terminated lines of source code. |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
Returns a new C<PPI::Tokenizer> object on success, or throws a |
150
|
|
|
|
|
|
|
L<PPI::Exception> exception on error. |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=cut |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub new {
	my $class = ref($_[0]) || $_[0];

	# Create the empty tokenizer struct
	my $self = bless {
		# Source code
		source       => undef,
		source_bytes => undef,

		# Line buffer
		line         => undef,
		line_length  => undef,
		line_cursor  => undef,
		line_count   => 0,

		# Parse state
		token        => undef,
		class        => 'PPI::Token::BOM',
		zone         => 'PPI::Token::Whitespace',

		# Output token buffer
		tokens       => [],
		token_cursor => 0,
		token_eof    => 0,

		# Perl 6 blocks
		perl6        => [],
	}, $class;

	if ( ! defined $_[1] ) {
		# We weren't given anything
		PPI::Exception->throw("No source provided to Tokenizer");

	} elsif ( ! ref $_[1] ) {
		# A file name: slurp its content
		my $source = PPI::Util::_slurp($_[1]);
		if ( ref $source ) {
			# Content returned by reference
			$self->{source} = $$source;
		} else {
			# Errors returned as a string
			return( $source );
		}

	} elsif ( _SCALAR0($_[1]) ) {
		$self->{source} = ${$_[1]};

	} elsif ( _ARRAY0($_[1]) ) {
		# BUG FIX: this previously read  join '', map { "\n" } @{$_[1]}
		# which mapped every supplied line to a bare newline and silently
		# discarded all of the array's content (the branch had zero test
		# coverage). Keep each line, adding a terminating newline only
		# where one is missing, per the documented contract of accepting
		# "newline-terminated lines of source code".
		$self->{source} = join '', map { /\n\z/ ? $_ : "$_\n" } @{$_[1]};

	} else {
		# We don't support whatever this is
		PPI::Exception->throw(ref($_[1]) . " is not supported as a source provider");
	}

	# We can't handle a null string
	$self->{source_bytes} = length $self->{source};
	if ( $self->{source_bytes} ) {
		# Split on local newlines, normalising CR/CRLF/LF first
		$self->{source} =~ s/(?:\015{1,2}\012|\015|\012)/\n/g;
		$self->{source} = [ split /(?<=\n)/, $self->{source} ];
	} else {
		$self->{source} = [ ];
	}

	### EVIL
	# Tokens tend to change classes as each character is added, and only
	# get their definitive class at the "end" of the token. Appending a
	# single space to the source triggers that normal end-of-token logic
	# for the final token of the document. At EOF the tokenizer then
	# either removes the bogus ' ' token or chops the space off the end
	# of a longer token in which a space would be valid. The chop is
	# skipped when the document contains __DATA__/__END__ (the trailing
	# content must stay byte-exact) or already ends in whitespace.
	if ( List::Util::any { /^__(?:DATA|END)__\s*$/ } @{$self->{source}} ) {
		$self->{source_eof_chop} = '';
	} elsif ( ! defined $self->{source}->[0] ) {
		# Empty document: nothing to pad
		$self->{source_eof_chop} = '';
	} elsif ( $self->{source}->[-1] =~ /\s$/ ) {
		# Already ends in whitespace: nothing to pad
		$self->{source_eof_chop} = '';
	} else {
		$self->{source_eof_chop} = 1;
		$self->{source}->[-1] .= ' ';
	}

	$self;
}
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
##################################################################### |
256
|
|
|
|
|
|
|
# Main Public Methods |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=pod |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=head2 get_token |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
When using the PPI::Tokenizer object as an iterator, the C<get_token> |
263
|
|
|
|
|
|
|
method is the primary method that is used. It increments the cursor |
264
|
|
|
|
|
|
|
and returns the next Token in the output array. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
The actual parsing of the file is done only as-needed, and a line at |
267
|
|
|
|
|
|
|
a time. When C<get_token> hits the end of the token array, it will |
268
|
|
|
|
|
|
|
cause the parser to pull in the next line and parse it, continuing |
269
|
|
|
|
|
|
|
as needed until there are more tokens on the output array that |
270
|
|
|
|
|
|
|
get_token can then return. |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
This means that a number of Tokenizer objects can be created, and |
273
|
|
|
|
|
|
|
won't consume significant CPU until you actually begin to pull tokens |
274
|
|
|
|
|
|
|
from it. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
Return a L<PPI::Token> object on success, C<0> if the Tokenizer had |
277
|
|
|
|
|
|
|
reached the end of the file, or C<undef> on error. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=cut |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
sub get_token {
	my $self = shift;

	# Fast path: EOF has been flagged and the cursor has already
	# walked past the end of the token buffer.
	return 0
		if $self->{token_eof}
		and $self->{token_cursor} > scalar @{$self->{tokens}};

	# Serve straight from the buffer when a token is waiting there.
	my $buffered = $self->{tokens}->[ $self->{token_cursor} ];
	if ( $buffered ) {
		$self->{token_cursor}++;
		return $buffered;
	}

	my $line_status;

	# Wrap the incremental parse in an eval so that all failures can
	# be rethrown uniformly as PPI::Exception objects.
	my $found = eval {
		# Keep pulling lines until the buffer yields a token
		while ( $line_status = $self->_process_next_line ) {
			# The defined() avoids a ton of calls to PPI::Util::TRUE
			my $next = $self->{tokens}->[ $self->{token_cursor} ];
			if ( defined $next ) {
				$self->{token_cursor}++;
				return $next;
			}
		}
		return undef;
	};
	if ( $@ ) {
		if ( _INSTANCE($@, 'PPI::Exception') ) {
			$@->throw;
		} else {
			# Strip perl's " at line ..." suffix before rethrowing
			my $message = $@;
			$message =~ s/^(.*) at line .+$/$1/;
			PPI::Exception->throw( $message );
		}
	} elsif ( $found ) {
		return $found;
	}

	if ( defined $line_status ) {
		# Clean end of file: drain anything still in the buffer first
		my $leftover = $self->{tokens}->[ $self->{token_cursor} ];
		if ( $leftover ) {
			$self->{token_cursor}++;
			return $leftover;
		}

		# Nothing left at all; remember that we reached EOF
		$self->{token_eof} = 1;
		return 0;
	}

	# Error, pass it up to our caller
	undef;
}
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
=pod |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 all_tokens |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
When not being used as an iterator, the C<all_tokens> method tells |
346
|
|
|
|
|
|
|
the Tokenizer to parse the entire file and return all of the tokens |
347
|
|
|
|
|
|
|
in a single ARRAY reference. |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
It should be noted that C<all_tokens> does B<not> interfere with the |
350
|
|
|
|
|
|
|
use of the Tokenizer object as an iterator (does not modify the token |
351
|
|
|
|
|
|
|
cursor) and use of the two different mechanisms can be mixed safely. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
Returns a reference to an ARRAY of L<PPI::Token> objects on success |
354
|
|
|
|
|
|
|
or throws an exception on error. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
=cut |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
sub all_tokens {
	my $self = shift;

	# Guard the whole parse with an eval so failures can be rethrown
	# uniformly as PPI::Exception objects.
	my $completed = eval {
		unless ( $self->{token_eof} ) {
			# Consume lines until we hit end of file
			my $status;
			while ( $status = $self->_process_next_line ) {}
			PPI::Exception->throw("Error while processing source")
				unless defined $status;

			# Tidy up the trailing tokenizer state
			$self->_clean_eof;
		}
		1;
	};
	unless ( $completed ) {
		# Strip perl's " at line ..." suffix before rethrowing
		my $message = $@;
		$message =~ s/^(.*) at line .+$/$1/;
		PPI::Exception->throw( $message );
	}

	# End of file: hand back a shallow copy of the token array, so the
	# caller cannot disturb the iterator state.
	return [ @{$self->{tokens}} ];
}
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=pod |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=head2 increment_cursor |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
Although exposed as a public method, C<increment_cursor> is implemented |
392
|
|
|
|
|
|
|
for expert use only, when writing lexers or other components that work |
393
|
|
|
|
|
|
|
directly on token streams. |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
It manually increments the token cursor forward through the file, in effect |
396
|
|
|
|
|
|
|
"skipping" the next token. |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
Return true if the cursor is incremented, C<0> if already at the end of |
399
|
|
|
|
|
|
|
the file, or C<undef> on error. |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=cut |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
sub increment_cursor {
	my $self = shift;

	# Delegate to get_token, which guarantees there is actually a
	# token present to move past. Collapse a real token down to a
	# plain true value; pass 0 (EOF) and undef (error) through.
	my $token = $self->get_token;
	return $token && 1;
}
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=pod |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=head2 decrement_cursor |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
Although exposed as a public method, C<decrement_cursor> is implemented |
414
|
|
|
|
|
|
|
for expert use only, when writing lexers or other components that work |
415
|
|
|
|
|
|
|
directly on token streams. |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
It manually decrements the token cursor backwards through the file, in |
418
|
|
|
|
|
|
|
effect "rolling back" the token stream. And indeed that is what it is |
419
|
|
|
|
|
|
|
primarily intended for, when the component that is consuming the token |
420
|
|
|
|
|
|
|
stream needs to implement some sort of "roll back" feature in its use |
421
|
|
|
|
|
|
|
of the token stream. |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
Return true if the cursor is decremented, C<0> if already at the |
424
|
|
|
|
|
|
|
beginning of the file, or C<undef> on error. |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=cut |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
sub decrement_cursor {
	my $self = shift;

	# Nothing to roll back when already at the start of the stream
	return 0 if !$self->{token_cursor};

	# Moving backwards always takes us away from end-of-file
	$self->{token_eof} = 0;
	return --$self->{token_cursor};
}
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
##################################################################### |
444
|
|
|
|
|
|
|
# Working With Source |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
# Fetches the next line from the input line buffer |
447
|
|
|
|
|
|
|
# Returns undef at EOF. |
448
|
|
|
|
|
|
|
# Fetches the next line from the input line buffer
# Returns undef at EOF.
sub _get_line {
	my $self = shift;

	# A previously-seen EOF leaves {source} undefined
	return undef if !$self->{source};

	# Take the next line off the front of the buffer
	my $next = shift @{$self->{source}};

	# An exhausted buffer means we just hit EOF: flag it
	$self->{source} = undef if !defined $next;

	# Return the line (or the undef EOF marker)
	return $next;
}
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# Fetches the next line, ready to process |
463
|
|
|
|
|
|
|
# Returns 1 on success |
464
|
|
|
|
|
|
|
# Returns 0 on EOF |
465
|
|
|
|
|
|
|
# Fetches the next line, ready to process
# Returns 1 on success
# Returns 0 on EOF
sub _fill_line {
	my ($self, $inscan) = @_;

	# Pull the next raw line from the input buffer
	my $line = $self->_get_line;

	if ( defined $line ) {
		# Populate the per-line parse state
		$self->{line}        = $line;
		$self->{line_cursor} = -1;
		$self->{line_length} = length $line;
		$self->{line_count}++;
		return 1;
	}

	# End of file
	if ( $inscan ) {
		# In the scan version, just park the cursor at the end of
		# the line, and the rest should just cascade out.
		$self->{line_cursor} = $self->{line_length};
		return 0;
	}

	# Tear down the line state entirely
	delete $self->{line};
	delete $self->{line_cursor};
	delete $self->{line_length};
	return 0;
}
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
# Get the current character |
496
|
|
|
|
|
|
|
# Get the character at the current cursor position
sub _char {
	my $self = shift;
	return substr( $self->{line}, $self->{line_cursor}, 1 );
}
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
#################################################################### |
506
|
|
|
|
|
|
|
# Per line processing methods |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
# Processes the next line |
509
|
|
|
|
|
|
|
# Returns 1 on success completion |
510
|
|
|
|
|
|
|
# Returns 0 if EOF |
511
|
|
|
|
|
|
|
# Returns undef on error |
512
|
|
|
|
|
|
|
# Processes the next line
# Returns 1 on successful completion
# Returns 0 if EOF
# Returns undef on error
sub _process_next_line {
	my $self = shift;

	# Load the next line into the buffer
	my $status = $self->_fill_line;
	unless ( $status ) {
		# undef from _fill_line means an error
		return undef unless defined $status;

		# Hit end of file: close off whatever token was in flight
		$self->_finalize_token;
		return 0;
	}

	# Give the current class first crack at the whole line
	$status = $self->{class}->__TOKENIZER__on_line_start( $self );
	unless ( $status ) {
		# If there are no more source lines, then clean up
		if ( ref $self->{source} eq 'ARRAY' and ! @{$self->{source}} ) {
			$self->_clean_eof;
		}

		# Defined-but-false means "skip to the next line"
		return 1 if defined $status;
		PPI::Exception->throw("Error at line $self->{line_count}");
	}

	# The line needs real work: walk it character by character
	while ( $status = $self->_process_next_char ) {}
	PPI::Exception->throw("Error at line $self->{line_count}, character $self->{line_cursor}")
		unless defined $status;

	# Trigger any action that needs to happen at the end of a line
	$self->{class}->__TOKENIZER__on_line_end( $self );

	# If there are no more source lines, then clean up
	unless ( ref($self->{source}) eq 'ARRAY' and @{$self->{source}} ) {
		return $self->_clean_eof;
	}

	return 1;
}
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
##################################################################### |
560
|
|
|
|
|
|
|
# Per-character processing methods |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# Process on a per-character basis. |
563
|
|
|
|
|
|
|
# Note that due the high number of times this gets |
564
|
|
|
|
|
|
|
# called, it has been fairly heavily in-lined, so the code |
565
|
|
|
|
|
|
|
# might look a bit ugly and duplicated. |
566
|
|
|
|
|
|
|
# Process on a per-character basis.
# Note that due the high number of times this gets called, it has been
# fairly heavily in-lined, so the code might look a bit ugly and
# duplicated.
sub _process_next_char {
	my $self = shift;

	### FIXME - This checks for a screwed up condition that triggers
	### several warnings, amongst other things.
	return undef
		if !defined $self->{line_cursor}
		or !defined $self->{line_length};

	# Advance the cursor; signal end-of-line when we run off the end
	return 0 if ++$self->{line_cursor} >= $self->{line_length};

	# Hand control to the current token class
	my $verdict = $self->{class}->__TOKENIZER__on_char( $self );
	unless ( $verdict ) {
		# undef is error. 0 is "did it all myself, nothing more to do"
		return defined $verdict ? 1 : undef;
	}

	# From here on we need the value of the current character
	my $ch = substr( $self->{line}, $self->{line_cursor}, 1 );

	if ( $verdict eq '1' ) {
		# __TOKENIZER__on_char is signaling that it considers the
		# character part of the current token. Append it.
		if ( defined $self->{token} ) {
			$self->{token}->{content} .= $ch;
		} else {
			defined($self->{token} = $self->{class}->new($ch)) or return undef;
		}
		return 1;
	}

	# Otherwise we were handed the name of a token class
	if ( $self->{class} ne "PPI::Token::$verdict" ) {
		# A different class: start a new token
		$self->_new_token( $verdict, $ch );
	} elsif ( defined $self->{token} ) {
		# Same class as the current token
		$self->{token}->{content} .= $ch;
	} else {
		# Same class, but no token currently open
		defined($self->{token} = $self->{class}->new($ch)) or return undef;
	}

	1;
}
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
##################################################################### |
622
|
|
|
|
|
|
|
# Altering Tokens in Tokenizer |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
# Finish the end of a token. |
625
|
|
|
|
|
|
|
# Returns the resulting parse class as a convenience. |
626
|
|
|
|
|
|
|
# Finish the end of a token.
# Returns the resulting parse class as a convenience.
sub _finalize_token {
	my $self = shift;

	# Nothing in flight: report the current class unchanged
	return $self->{class} if !defined $self->{token};

	# Move the completed token onto the output buffer
	push @{ $self->{tokens} }, $self->{token};
	$self->{token} = undef;

	# Drop the parse class back to that of the zone we are in
	$self->{class} = $self->{zone};
}
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
# Creates a new token and sets it in the tokenizer |
639
|
|
|
|
|
|
|
# The defined() in here prevent a ton of calls to PPI::Util::TRUE |
640
|
|
|
|
|
|
|
# Creates a new token and sets it in the tokenizer
# The defined() in here prevents a ton of calls to PPI::Util::TRUE
sub _new_token {
	my $self = shift;
	# throw PPI::Exception() unless @_;

	# Expand a short class name to the full PPI::Token:: form
	my $token_class =
		substr( $_[0], 0, 12 ) eq 'PPI::Token::'
			? shift
			: 'PPI::Token::' . shift;

	# Close off any token currently being built
	$self->_finalize_token if defined $self->{token};

	# Open the new token and track the new parse class
	defined($self->{token} = $token_class->new($_[0])) or PPI::Exception->throw;
	$self->{class} = $token_class;

	1;
}
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# At the end of the file, we need to clean up the results of the erroneous |
657
|
|
|
|
|
|
|
# space that we inserted at the beginning of the process. |
658
|
|
|
|
|
|
|
# At the end of the file, clean up the results of the erroneous space
# that we inserted at the beginning of the process.
sub _clean_eof {
	my $self = shift;

	# Flush anything still being assembled.
	$self->_finalize_token if $self->{token};

	# If the very last token ended up with no content, discard it.
	# There appears to be some evidence that such "null tokens" are
	# somehow getting created accidentally.
	my $tail = $self->{tokens}->[-1];
	pop @{ $self->{tokens} } unless length $tail->{content};

	# If the last character of the last token is the space we added,
	# chop it off, deleting the token if there's nothing else left.
	if ( $self->{source_eof_chop} ) {
		$tail = $self->{tokens}->[-1];
		$tail->{content} =~ s/ $//;
		unless ( length $tail->{content} ) {
			# Nothing left of the last token; drop it entirely.
			pop @{ $self->{tokens} };
		}

		# The hack involving adding an extra space is now reversed, and
		# now nobody will ever know. The perfect crime!
		$self->{source_eof_chop} = '';
	}

	1;
}
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
##################################################################### |
695
|
|
|
|
|
|
|
# Utility Methods |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
# Context |
698
|
|
|
|
|
|
|
# Fetch the most recently completed token, if any.
sub _last_token {
	my $self = $_[0];
	return $self->{tokens}->[-1];
}
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
# Walk backwards through the completed tokens and return the most
# recent significant one, or an empty list/undef if there is none.
sub _last_significant_token {
	my $self   = shift;
	my $tokens = $self->{tokens};

	for my $i ( reverse 0 .. $#$tokens ) {
		my $candidate = $tokens->[$i];
		return $candidate if $candidate->significant;
	}

	return;
}
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
# Get an array ref of previous significant tokens. |
713
|
|
|
|
|
|
|
# Like _last_significant_token except it gets more than just one token |
714
|
|
|
|
|
|
|
# Returns array with 0 to x entries |
715
|
|
|
|
|
|
|
# Get an array of preceding significant tokens, most recent first.
# Like _last_significant_token except it gets more than just one token.
# Returns an array with 0 to $count entries.
sub _previous_significant_tokens {
	my $self   = shift;
	my $count  = shift || 1;
	my $tokens = $self->{tokens};

	my @found;
	for my $i ( reverse 0 .. $#$tokens ) {
		my $candidate = $tokens->[$i];
		next unless $candidate->significant;
		push @found, $candidate;
		last if @found >= $count;
	}

	return @found;
}
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
# Lookup tables used by _opcontext below.
#
# Token classes that complete an operand, so that the position
# immediately after them is "operator"-expected context.
my %OBVIOUS_CLASS = (
	'PPI::Token::Symbol'              => 'operator',
	'PPI::Token::Magic'               => 'operator',
	'PPI::Token::Number'              => 'operator',
	'PPI::Token::ArrayIndex'          => 'operator',
	'PPI::Token::Quote::Double'       => 'operator',
	'PPI::Token::Quote::Interpolate'  => 'operator',
	'PPI::Token::Quote::Literal'      => 'operator',
	'PPI::Token::Quote::Single'       => 'operator',
	'PPI::Token::QuoteLike::Backtick' => 'operator',
	'PPI::Token::QuoteLike::Command'  => 'operator',
	'PPI::Token::QuoteLike::Readline' => 'operator',
	'PPI::Token::QuoteLike::Regexp'   => 'operator',
	'PPI::Token::QuoteLike::Words'    => 'operator',
);

# Literal token content whose following context is unambiguous:
# after an opening bracket or a statement terminator an operand is
# expected; after a closing curly an operator is expected.
my %OBVIOUS_CONTENT = (
	'(' => 'operand',
	'{' => 'operand',
	'[' => 'operand',
	';' => 'operand',
	'}' => 'operator',
);

# Keywords that usually force the next token to be treated as a plain
# word (bareword) rather than an operator or version string; consulted
# by __current_token_is_forced_word below.
my %USUALLY_FORCES = map { $_ => 1 } qw( sub package use no );
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
# Try to determine operator/operand context, if possible. |
759
|
|
|
|
|
|
|
# Returns "operator", "operand", or "" if unknown. |
760
|
|
|
|
|
|
|
# Try to determine operator/operand context, if possible.
# Returns "operator", "operand", or "" if unknown.
sub _opcontext {
	my $self = shift;

	# Only the single nearest significant token matters here.
	my ($last) = $self->_previous_significant_tokens(1);
	return '' unless $last;

	# Map the obvious cases, first by class and then by literal content
	# (the content lookup relies on the token's stringification).
	my $type = ref $last;
	return $OBVIOUS_CLASS{$type}   if defined $OBVIOUS_CLASS{$type};
	return $OBVIOUS_CONTENT{$last} if defined $OBVIOUS_CONTENT{$last};

	# Most of the time after an operator, we are an operand.
	return 'operand' if $last->isa('PPI::Token::Operator');

	# If there's NOTHING, it's operand.
	return 'operand' if $last->content eq '';

	# Otherwise, we don't know.
	return '';
}
780
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
# Assuming we are currently parsing the word 'x', return true |
782
|
|
|
|
|
|
|
# if previous tokens imply the x is an operator, false otherwise. |
783
|
|
|
|
|
|
|
# Assuming we are currently parsing the word 'x', return true if the
# previous tokens imply that this x must be an operator, false otherwise.
sub _current_x_is_operator {
	my ( $self ) = @_;

	# With no tokens at all there is nothing for an x operator to follow.
	return if !@{ $self->{tokens} };

	# Only the nearest significant token decides here. (The original
	# fetched two tokens but never used the second, so only one is
	# requested now.)
	my ( $prev ) = $self->_previous_significant_tokens(1);
	return if !$prev;

	# After a word, x is an operator unless that word itself is being
	# forced to be a bareword (e.g. a hash key or method name).
	return !$self->__current_token_is_forced_word if $prev->isa('PPI::Token::Word');

	# Otherwise x is an operator unless it directly follows another
	# operator, a structure token, or a label -- with exceptions listed
	# in the file-level %X_CAN_FOLLOW_OPERATOR / %X_CAN_FOLLOW_STRUCTURE
	# tables (keyed by token stringification).
	return (!$prev->isa('PPI::Token::Operator') || $X_CAN_FOLLOW_OPERATOR{$prev})
		&& (!$prev->isa('PPI::Token::Structure') || $X_CAN_FOLLOW_STRUCTURE{$prev})
		&& !$prev->isa('PPI::Token::Label')
		;
}
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
# Assuming we are at the end of parsing the current token that could be a word, |
800
|
|
|
|
|
|
|
# a wordlike operator, or a version string, try to determine whether context |
801
|
|
|
|
|
|
|
# before or after it forces it to be a bareword. This method is only useful |
802
|
|
|
|
|
|
|
# during tokenization. |
803
|
|
|
|
|
|
|
sub __current_token_is_forced_word {
	my ( $t, $word ) = @_;
	# $t    - the tokenizer object
	# $word - the content of the current word-like token, when known
	# Returns true if surrounding context forces a bareword, a false
	# value otherwise.

	# Check if forced by preceding tokens.

	my ( $prev, $prevprev ) = $t->_previous_significant_tokens(2);
	if ( !$prev ) {
		# No preceding significant token: only the text that follows can
		# force us, so just position pos() for the look-ahead below.
		pos $t->{line} = $t->{line_cursor};
	}
	else {
		my $content = $prev->{content};

		# We are forced if we are a method name.
		# '->' will always be an operator, so we don't check its type.
		return 1 if $content eq '->';

		# If we are contained in a pair of curly braces, we are probably a
		# forced bareword hash key. '{' is never a word or operator, so we
		# don't check its type.
		pos $t->{line} = $t->{line_cursor};
		return 1 if $content eq '{' and $t->{line} =~ /\G\s*\}/gc;

		# sub, package, use, and no all indicate that what immediately follows
		# is a word not an operator or (in the case of sub and package) a
		# version string. However, we don't want to be fooled by 'package
		# package v10' or 'use no v10'. We're a forced package unless we're
		# preceded by 'package sub', in which case we're a version string.
		# We also have to make sure that the sub/package/etc doing the forcing
		# is not a method call.
		if( $USUALLY_FORCES{$content}) {
			# v-strings after use/no are versions, not barewords.
			return if defined $word and $word =~ /^v[0-9]+$/ and ( $content eq "use" or $content eq "no" );
			return 1 if not $prevprev;
			return 1 if not $USUALLY_FORCES{$prevprev->content} and $prevprev->content ne '->';
			return;
		}
	}
	# pos on $t->{line} is guaranteed to be set at this point.

	# Check if forced by following tokens.

	# If the word is followed by => it is probably a word, not a regex.
	return 1 if $t->{line} =~ /\G\s*=>/gc;

	# Otherwise we probably aren't forced
	return '';
}
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
1; |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=pod |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
=head1 NOTES |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
=head2 How the Tokenizer Works |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
Understanding the Tokenizer is not for the faint-hearted. It is by far |
859
|
|
|
|
|
|
|
the most complex and twisty piece of perl I've ever written that is actually |
860
|
|
|
|
|
|
|
still built properly and isn't a terrible spaghetti-like mess. In fact, you |
861
|
|
|
|
|
|
|
probably want to skip this section. |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
But if you really want to understand, well then here goes. |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
=head2 Source Input and Clean Up |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
The Tokenizer starts by taking source in a variety of forms, sucking it |
868
|
|
|
|
|
|
|
all in and merging into one big string, and doing our own internal line |
869
|
|
|
|
|
|
|
split, using a "universal line separator" which allows the Tokenizer to |
870
|
|
|
|
|
|
|
take source for any platform (and even supports a few known types of |
871
|
|
|
|
|
|
|
broken newlines caused by mixed mac/pc/*nix editor screw ups). |
872
|
|
|
|
|
|
|
|
873
|
|
|
|
|
|
|
The resulting array of lines is used to feed the tokenizer, and is also |
874
|
|
|
|
|
|
|
accessed directly by the heredoc-logic to do the line-oriented part of |
875
|
|
|
|
|
|
|
here-doc support. |
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=head2 Doing Things the Old Fashioned Way |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
Due to the complexity of perl, and after 2 previously aborted parser |
880
|
|
|
|
|
|
|
attempts, in the end the tokenizer was fashioned around a line-buffered |
881
|
|
|
|
|
|
|
character-by-character method. |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
That is, the Tokenizer pulls and holds a line at a time into a line buffer, |
884
|
|
|
|
|
|
|
and then iterates a cursor along it. At each cursor position, a method is |
885
|
|
|
|
|
|
|
called in whatever token class we are currently in, which will examine the |
886
|
|
|
|
|
|
|
character at the current position, and handle it. |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
As the handler methods in the various token classes are called, they |
889
|
|
|
|
|
|
|
build up an output token array for the source code. |
890
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
Various parts of the Tokenizer use look-ahead, arbitrary-distance |
892
|
|
|
|
|
|
|
look-behind (although currently the maximum is three significant tokens), |
893
|
|
|
|
|
|
|
or both, and various other heuristic guesses. |
894
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
I've been told it is officially termed a I<"backtracking parser |
896
|
|
|
|
|
|
|
with infinite lookaheads">. |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
=head2 State Variables |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
Aside from the current line and the character cursor, the Tokenizer |
901
|
|
|
|
|
|
|
maintains a number of different state variables. |
902
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
=over |
904
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
=item Current Class |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
The Tokenizer maintains the current token class at all times. Much of the |
908
|
|
|
|
|
|
|
time is just going to be the "Whitespace" class, which is what the base of |
909
|
|
|
|
|
|
|
a document is. As the tokenizer executes the various character handlers, |
910
|
|
|
|
|
|
|
the class changes a lot as it moves along. In fact, in some instances, |
911
|
|
|
|
|
|
|
the character handler may not handle the character directly itself, but |
912
|
|
|
|
|
|
|
rather change the "current class" and then hand off to the character |
913
|
|
|
|
|
|
|
handler for the new class. |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
Because of this, and some other things I'll deal with later, the number of |
916
|
|
|
|
|
|
|
times the character handlers are called does not in fact have a direct |
917
|
|
|
|
|
|
|
relationship to the number of actual characters in the document. |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=item Current Zone |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
Rather than create a class stack to allow for infinitely nested layers of |
922
|
|
|
|
|
|
|
classes, the Tokenizer recognises just a single layer. |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
To put it a different way, in various parts of the file, the Tokenizer will |
925
|
|
|
|
|
|
|
recognise different "base" or "substrate" classes. When a Token such as a |
926
|
|
|
|
|
|
|
comment or a number is finalised by the tokenizer, it "falls back" to the |
927
|
|
|
|
|
|
|
base state. |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
This allows proper tokenization of special areas such as __DATA__ |
930
|
|
|
|
|
|
|
and __END__ blocks, which also contain things like comments and POD, |
931
|
|
|
|
|
|
|
without allowing the creation of any significant Tokens inside these areas. |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
For the main part of a document we use L for this, |
934
|
|
|
|
|
|
|
with the idea being that code is "floating in a sea of whitespace". |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=item Current Token |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
The final main state variable is the "current token". This is the Token |
939
|
|
|
|
|
|
|
that is currently being built by the Tokenizer. For certain types, it |
940
|
|
|
|
|
|
|
can be manipulated and morphed and change class quite a bit while being |
941
|
|
|
|
|
|
|
assembled, as the Tokenizer's understanding of the token content changes. |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
When the Tokenizer is confident that it has seen the end of the Token, it |
944
|
|
|
|
|
|
|
will be "finalized", which adds it to the output token array and resets |
945
|
|
|
|
|
|
|
the current class to that of the zone that we are currently in. |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
I should also note at this point that the "current token" variable is |
948
|
|
|
|
|
|
|
optional. The Tokenizer is capable of knowing what class it is currently |
949
|
|
|
|
|
|
|
set to, without actually having accumulated any characters in the Token. |
950
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
=back |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=head2 Making It Faster |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
As I'm sure you can imagine, calling several different methods for each |
956
|
|
|
|
|
|
|
character and running regexes and other complex heuristics made the first |
957
|
|
|
|
|
|
|
fully working version of the tokenizer extremely slow. |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
During testing, I created a metric to measure parsing speed called |
960
|
|
|
|
|
|
|
LPGC, or "lines per gigacycle". A gigacycle is simply a billion CPU |
961
|
|
|
|
|
|
|
cycles on a typical single-core CPU, and so a Tokenizer running at |
962
|
|
|
|
|
|
|
"1000 lines per gigacycle" should generate around 1200 lines of tokenized |
963
|
|
|
|
|
|
|
code when running on a 1200 MHz processor. |
964
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
The first working version of the tokenizer ran at only 350 LPGC, so to |
966
|
|
|
|
|
|
|
tokenize a typical large module such as L took |
967
|
|
|
|
|
|
|
10-15 seconds. This sluggishness made it impractical for many uses. |
968
|
|
|
|
|
|
|
|
969
|
|
|
|
|
|
|
So in the current parser, there are multiple layers of optimisation |
970
|
|
|
|
|
|
|
very carefully built in to the basic. This has brought the tokenizer |
971
|
|
|
|
|
|
|
up to a more reasonable 1000 LPGC, at the expense of making the code |
972
|
|
|
|
|
|
|
quite a bit twistier. |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
=head2 Making It Faster - Whole Line Classification |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
The first step in the optimisation process was to add a new handler to |
977
|
|
|
|
|
|
|
enable several of the more basic classes (whitespace, comments) to be |
978
|
|
|
|
|
|
|
able to be parsed a line at a time. At the start of each line, a |
979
|
|
|
|
|
|
|
special optional handler (only supported by a few classes) is called to |
980
|
|
|
|
|
|
|
check and see if the entire line can be parsed in one go. |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
This is used mainly to handle things like POD, comments, empty lines, |
983
|
|
|
|
|
|
|
and a few other minor special cases. |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=head2 Making It Faster - Inlining |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
The second stage of the optimisation involved inlining a small |
988
|
|
|
|
|
|
|
number of critical methods that were repeated an extremely high number |
989
|
|
|
|
|
|
|
of times. Profiling suggested that there were about 1,000,000 individual |
990
|
|
|
|
|
|
|
method calls per gigacycle, and by cutting these by two thirds a significant |
991
|
|
|
|
|
|
|
speed improvement was gained, in the order of about 50%. |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
You may notice that many methods in the C code look |
994
|
|
|
|
|
|
|
very nested and long hand. This is primarily due to this inlining. |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
At around this time, some statistics code that existed in the early |
997
|
|
|
|
|
|
|
versions of the parser was also removed, as it was determined that |
998
|
|
|
|
|
|
|
it was consuming around 15% of the CPU for the entire parser, while |
999
|
|
|
|
|
|
|
making the core more complicated. |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
A judgment call was made that with the difficulties likely to be |
1002
|
|
|
|
|
|
|
encountered with future planned enhancements, and given the relatively |
1003
|
|
|
|
|
|
|
high cost involved, the statistics features would be removed from the |
1004
|
|
|
|
|
|
|
Tokenizer. |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
=head2 Making It Faster - Quote Engine |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
Once inlining had reached diminishing returns, it became obvious from |
1009
|
|
|
|
|
|
|
the profiling results that a huge amount of time was being spent |
1010
|
|
|
|
|
|
|
stepping a char at a time though long, simple and "syntactically boring" |
1011
|
|
|
|
|
|
|
code such as comments and strings. |
1012
|
|
|
|
|
|
|
|
1013
|
|
|
|
|
|
|
The existing regex engine was expanded to also encompass quotes and |
1014
|
|
|
|
|
|
|
other quote-like things, and a special abstract base class was added |
1015
|
|
|
|
|
|
|
that provided a number of specialised parsing methods that would "scan |
1016
|
|
|
|
|
|
|
ahead", looking out ahead to find the end of a string, and updating |
1017
|
|
|
|
|
|
|
the cursor to leave it in a valid position for the next call. |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
This is also the point at which the number of character handler calls began |
1020
|
|
|
|
|
|
|
to greatly differ from the number of characters. But it has been done |
1021
|
|
|
|
|
|
|
in a way that allows the parser to retain the power of the original |
1022
|
|
|
|
|
|
|
version at the critical points, while skipping through the "boring bits" |
1023
|
|
|
|
|
|
|
as needed for additional speed. |
1024
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
The addition of this feature allowed the tokenizer to exceed 1000 LPGC |
1026
|
|
|
|
|
|
|
for the first time. |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
=head2 Making It Faster - The "Complete" Mechanism |
1029
|
|
|
|
|
|
|
|
1030
|
|
|
|
|
|
|
As it became evident that great speed increases were available by using |
1031
|
|
|
|
|
|
|
this "skipping ahead" mechanism, a new handler method was added that |
1032
|
|
|
|
|
|
|
explicitly handles the parsing of an entire token, where the structure |
1033
|
|
|
|
|
|
|
of the token is relatively simple. Tokens such as symbols fit this case, |
1034
|
|
|
|
|
|
|
as once we are passed the initial sigil and word char, we know that we |
1035
|
|
|
|
|
|
|
can skip ahead and "complete" the rest of the token much more easily. |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
A number of these have been added for most or possibly all of the common |
1038
|
|
|
|
|
|
|
cases, with most of these "complete" handlers implemented using regular |
1039
|
|
|
|
|
|
|
expressions. |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
In fact, so many have been added that at this point, you could arguably |
1042
|
|
|
|
|
|
|
reclassify the tokenizer as a "hybrid regex, char-by-char heuristic |
1043
|
|
|
|
|
|
|
tokenizer". More tokens are now consumed in "complete" methods in a |
1044
|
|
|
|
|
|
|
typical program than are handled by the normal char-by-char methods. |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
Many of the these complete-handlers were implemented during the writing |
1047
|
|
|
|
|
|
|
of the Lexer, and this has allowed the full parser to maintain around |
1048
|
|
|
|
|
|
|
1000 LPGC despite the increasing weight of the Lexer. |
1049
|
|
|
|
|
|
|
|
1050
|
|
|
|
|
|
|
=head2 Making It Faster - Porting To C (In Progress) |
1051
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
While it would be extraordinarily difficult to port all of the Tokenizer |
1053
|
|
|
|
|
|
|
to C, work has started on a L "accelerator" package which acts as |
1054
|
|
|
|
|
|
|
a separate and automatically-detected add-on to the main PPI package. |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
L implements faster versions of a variety of functions scattered |
1057
|
|
|
|
|
|
|
over the entire PPI codebase, from the Tokenizer Core, Quote Engine, and |
1058
|
|
|
|
|
|
|
various other places, and implements them identically in XS/C. |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
In particular, the skip-ahead methods from the Quote Engine would appear |
1061
|
|
|
|
|
|
|
to be extremely amenable to being done in C, and a number of other |
1062
|
|
|
|
|
|
|
functions could be cherry-picked one at a time and implemented in C. |
1063
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
Each method is heavily tested to ensure that the functionality is |
1065
|
|
|
|
|
|
|
identical, and a versioning mechanism is included to ensure that if a |
1066
|
|
|
|
|
|
|
function gets out of sync, L will degrade gracefully and just |
1067
|
|
|
|
|
|
|
not replace that single method. |
1068
|
|
|
|
|
|
|
|
1069
|
|
|
|
|
|
|
=head1 TO DO |
1070
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
- Add an option to reset or seek the token stream... |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
- Implement more Tokenizer functions in L |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
=head1 SUPPORT |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
See the L in the main module. |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
=head1 AUTHOR |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
Adam Kennedy Eadamk@cpan.orgE |
1082
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
=head1 COPYRIGHT |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
Copyright 2001 - 2011 Adam Kennedy. |
1086
|
|
|
|
|
|
|
|
1087
|
|
|
|
|
|
|
This program is free software; you can redistribute |
1088
|
|
|
|
|
|
|
it and/or modify it under the same terms as Perl itself. |
1089
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
The full text of the license can be found in the |
1091
|
|
|
|
|
|
|
LICENSE file included with this module. |
1092
|
|
|
|
|
|
|
|
1093
|
|
|
|
|
|
|
=cut |