line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
package String::Tokenizer; |
3
|
|
|
|
|
|
|
|
4
|
2
|
|
|
2
|
|
27758
|
use 5.006; |
|
2
|
|
|
|
|
5
|
|
5
|
2
|
|
|
2
|
|
8
|
use strict; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
37
|
|
6
|
2
|
|
|
2
|
|
5
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
70
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '0.06'; |
9
|
|
|
|
|
|
|
|
10
|
2
|
|
|
2
|
|
7
|
use constant RETAIN_WHITESPACE => 1; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
129
|
|
11
|
2
|
|
|
2
|
|
6
|
use constant IGNORE_WHITESPACE => 0; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
950
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
### constructor |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
sub new { |
16
|
10
|
|
|
10
|
1
|
3834
|
my ($_class, @args) = @_; |
17
|
10
|
|
33
|
|
|
41
|
my $class = ref($_class) || $_class; |
18
|
10
|
|
|
|
|
26
|
my $string_tokenizer = { |
19
|
|
|
|
|
|
|
tokens => [], |
20
|
|
|
|
|
|
|
delimiter => undef, |
21
|
|
|
|
|
|
|
handle_whitespace => IGNORE_WHITESPACE |
22
|
|
|
|
|
|
|
}; |
23
|
10
|
|
|
|
|
10
|
bless($string_tokenizer, $class); |
24
|
10
|
100
|
|
|
|
28
|
$string_tokenizer->tokenize(@args) if @args; |
25
|
10
|
|
|
|
|
33
|
return $string_tokenizer; |
26
|
|
|
|
|
|
|
} |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
### methods |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub setDelimiter { |
31
|
8
|
|
|
8
|
1
|
7
|
my ($self, $delimiter) = @_; |
32
|
8
|
|
|
|
|
24
|
my $delimiter_reg_exp = join "\|" => map { s/(\W)/\\$1/g; $_ } split // => $delimiter; |
|
26
|
|
|
|
|
112
|
|
|
26
|
|
|
|
|
44
|
|
33
|
8
|
|
|
|
|
111
|
$self->{delimiter} = qr/$delimiter_reg_exp/; |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub handleWhitespace { |
37
|
3
|
|
|
3
|
1
|
4
|
my ($self, $value) = @_; |
38
|
3
|
|
|
|
|
4
|
$self->{handle_whitespace} = $value; |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
sub tokenize { |
42
|
10
|
|
|
10
|
1
|
392
|
my ($self, $string, $delimiter, $handle_whitespace) = @_; |
43
|
|
|
|
|
|
|
# if we have a delimiter passed in then use it |
44
|
10
|
100
|
|
|
|
27
|
$self->setDelimiter($delimiter) if defined $delimiter; |
45
|
|
|
|
|
|
|
# if we are asking about whitespace then handle it |
46
|
10
|
100
|
|
|
|
22
|
$self->handleWhitespace($handle_whitespace) if defined $handle_whitespace; |
47
|
|
|
|
|
|
|
# if the two above are not handled, then the object will use |
48
|
|
|
|
|
|
|
# the values set already. |
49
|
|
|
|
|
|
|
# split everything by whitespace no matter what |
50
|
|
|
|
|
|
|
# (possibly multiple occurrences of white space too) |
51
|
10
|
|
|
|
|
8
|
my @tokens; |
52
|
10
|
100
|
|
|
|
20
|
if ($self->{handle_whitespace}) { |
53
|
2
|
|
|
|
|
21
|
@tokens = split /(\s+)/ => $string; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
else { |
56
|
8
|
|
|
|
|
66
|
@tokens = split /\s+/ => $string; |
57
|
|
|
|
|
|
|
} |
58
|
10
|
100
|
|
|
|
21
|
if ($self->{delimiter}) { |
59
|
|
|
|
|
|
|
# create the delimiter reg-ex |
60
|
|
|
|
|
|
|
# escape all non-alpha-numeric |
61
|
|
|
|
|
|
|
# characters, just to be safe |
62
|
9
|
|
|
|
|
11
|
my $delimiter = $self->{delimiter}; |
63
|
|
|
|
|
|
|
# loop through the tokens |
64
|
|
|
|
|
|
|
@tokens = map { |
65
|
|
|
|
|
|
|
# if the token contains a delimiter then ... |
66
|
9
|
100
|
|
|
|
11
|
if (/$delimiter/) { |
|
148
|
|
|
|
|
339
|
|
67
|
60
|
|
|
|
|
43
|
my ($token, @_tokens); |
68
|
|
|
|
|
|
|
# split the token up into characters |
69
|
|
|
|
|
|
|
# and the loop through all the characters |
70
|
60
|
|
|
|
|
83
|
foreach my $char (split //) { |
71
|
|
|
|
|
|
|
# if the character is a delimiter |
72
|
196
|
100
|
|
|
|
422
|
if ($char =~ /^$delimiter$/) { |
73
|
|
|
|
|
|
|
# and we already have a token in the works |
74
|
75
|
100
|
66
|
|
|
190
|
if (defined($token) && $token =~ /^.*$/) { |
75
|
|
|
|
|
|
|
# add the token to the |
76
|
|
|
|
|
|
|
# temp tokens list |
77
|
31
|
|
|
|
|
33
|
push @_tokens => $token; |
78
|
|
|
|
|
|
|
} |
79
|
|
|
|
|
|
|
# and then push our delimiter character |
80
|
|
|
|
|
|
|
# onto the temp tokens list |
81
|
75
|
|
|
|
|
57
|
push @_tokens => $char; |
82
|
|
|
|
|
|
|
# now we need to undefine our token |
83
|
75
|
|
|
|
|
82
|
$token = undef; |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
# if the character is not a delimiter then |
86
|
|
|
|
|
|
|
else { |
87
|
|
|
|
|
|
|
# check to make sure the token is defined |
88
|
121
|
100
|
|
|
|
143
|
$token = "" unless defined $token; |
89
|
|
|
|
|
|
|
# and then add the character to it |
90
|
121
|
|
|
|
|
130
|
$token .= $char; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
# now push any remaining token onto |
94
|
|
|
|
|
|
|
# the temp tokens list |
95
|
60
|
100
|
|
|
|
81
|
push @_tokens => $token if defined $token; |
96
|
|
|
|
|
|
|
# and return tokens |
97
|
60
|
|
|
|
|
102
|
@_tokens; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
# if our token does not have |
100
|
|
|
|
|
|
|
# the delimiter in it |
101
|
|
|
|
|
|
|
else { |
102
|
|
|
|
|
|
|
# just return it |
103
|
88
|
|
|
|
|
107
|
$_ |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
} @tokens; |
106
|
|
|
|
|
|
|
} |
107
|
10
|
|
|
|
|
31
|
$self->{tokens} = \@tokens; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
sub getTokens { |
111
|
6
|
|
|
6
|
1
|
680
|
my ($self) = @_; |
112
|
|
|
|
|
|
|
return wantarray ? |
113
|
1
|
|
|
|
|
6
|
@{$self->{tokens}} |
114
|
|
|
|
|
|
|
: |
115
|
6
|
100
|
|
|
|
21
|
$self->{tokens}; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
sub iterator { |
119
|
4
|
|
|
4
|
1
|
1589
|
my ($self) = @_; |
120
|
|
|
|
|
|
|
# returns a copy of the array |
121
|
4
|
|
|
|
|
10
|
return String::Tokenizer::Iterator->new($self->{tokens}); |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
package String::Tokenizer::Iterator; |
125
|
|
|
|
|
|
|
|
126
|
2
|
|
|
2
|
|
8
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
31
|
|
127
|
2
|
|
|
2
|
|
5
|
use warnings; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
941
|
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub new { |
130
|
5
|
100
|
|
5
|
|
34
|
((caller())[0] eq "String::Tokenizer") |
131
|
|
|
|
|
|
|
|| die "Insufficient Access Priviledges : Only String::Tokenizer can create String::Tokenizer::Iterator instances"; |
132
|
4
|
|
|
|
|
5
|
my ($_class, $tokens) = @_; |
133
|
4
|
|
33
|
|
|
15
|
my $class = ref($_class) || $_class; |
134
|
4
|
|
|
|
|
16
|
my $iterator = { |
135
|
|
|
|
|
|
|
tokens => $tokens, |
136
|
|
|
|
|
|
|
index => 0 |
137
|
|
|
|
|
|
|
}; |
138
|
4
|
|
|
|
|
6
|
bless($iterator, $class); |
139
|
4
|
|
|
|
|
8
|
return $iterator; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
sub reset { |
143
|
1
|
|
|
1
|
|
2
|
my ($self) = @_; |
144
|
1
|
|
|
|
|
2
|
$self->{index} = 0; |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sub hasNextToken { |
148
|
108
|
|
|
108
|
|
3750
|
my ($self) = @_; |
149
|
108
|
100
|
|
|
|
69
|
return ($self->{index} < scalar @{$self->{tokens}}) ? 1 : 0; |
|
108
|
|
|
|
|
189
|
|
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub hasPrevToken { |
153
|
26
|
|
|
26
|
|
20
|
my ($self) = @_; |
154
|
26
|
|
|
|
|
28
|
return ($self->{index} > 0); |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub nextToken { |
158
|
118
|
|
|
118
|
|
978
|
my ($self) = @_; |
159
|
118
|
100
|
|
|
|
78
|
return undef if ($self->{index} >= scalar @{$self->{tokens}}); |
|
118
|
|
|
|
|
187
|
|
160
|
117
|
|
|
|
|
209
|
return $self->{tokens}->[$self->{index}++]; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub prevToken { |
164
|
26
|
|
|
26
|
|
52
|
my ($self) = @_; |
165
|
26
|
100
|
|
|
|
32
|
return undef if ($self->{index} <= 0); |
166
|
25
|
|
|
|
|
33
|
return $self->{tokens}->[--$self->{index}]; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
sub currentToken { |
170
|
25
|
|
|
25
|
|
49
|
my ($self) = @_; |
171
|
25
|
|
|
|
|
37
|
return $self->{tokens}->[$self->{index} - 1]; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
sub lookAheadToken { |
175
|
41
|
|
|
41
|
|
16823
|
my ($self) = @_; |
176
|
|
|
|
|
|
|
return undef if ( $self->{index} <= 0 |
177
|
41
|
100
|
100
|
|
|
108
|
|| $self->{index} >= scalar @{$self->{tokens}}); |
|
40
|
|
|
|
|
131
|
|
178
|
38
|
|
|
|
|
111
|
return $self->{tokens}->[$self->{index}]; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub collectTokensUntil { |
182
|
5
|
|
|
5
|
|
8
|
my ($self, $token_to_match) = @_; |
183
|
|
|
|
|
|
|
# if this matches our current token ... |
184
|
|
|
|
|
|
|
# then we just return nothing as there |
185
|
|
|
|
|
|
|
# is nothing to accumulate |
186
|
5
|
100
|
|
|
|
8
|
if ($self->lookAheadToken() eq $token_to_match) { |
187
|
|
|
|
|
|
|
# then just advance it one |
188
|
1
|
|
|
|
|
2
|
$self->nextToken(); |
189
|
|
|
|
|
|
|
# and return nothing |
190
|
1
|
|
|
|
|
3
|
return; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# if it doesn't match our current token then, ... |
194
|
4
|
|
|
|
|
5
|
my @collection; |
195
|
|
|
|
|
|
|
# store the index we start at |
196
|
4
|
|
|
|
|
4
|
my $old_index = $self->{index}; |
197
|
4
|
|
|
|
|
4
|
my $matched; |
198
|
|
|
|
|
|
|
# loop through the tokens |
199
|
4
|
|
|
|
|
6
|
while ($self->hasNextToken()) { |
200
|
23
|
|
|
|
|
22
|
my $token = $self->nextToken(); |
201
|
23
|
100
|
|
|
|
25
|
if ($token ne $token_to_match) { |
202
|
20
|
|
|
|
|
28
|
push @collection => $token; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
else { |
205
|
3
|
|
|
|
|
3
|
$matched++; |
206
|
3
|
|
|
|
|
4
|
last; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
} |
209
|
4
|
100
|
|
|
|
7
|
unless ($matched) { |
210
|
|
|
|
|
|
|
# reset back to where we started, and ... |
211
|
1
|
|
|
|
|
2
|
$self->{index} = $old_index; |
212
|
|
|
|
|
|
|
# and return nothing |
213
|
1
|
|
|
|
|
4
|
return; |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
# and return our collection |
216
|
3
|
|
|
|
|
20
|
return @collection; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub skipTokensUntil { |
221
|
3
|
|
|
3
|
|
5
|
my ($self, $token_to_match) = @_; |
222
|
|
|
|
|
|
|
# if this matches our current token ... |
223
|
3
|
100
|
|
|
|
6
|
if ($self->lookAheadToken() eq $token_to_match) { |
224
|
|
|
|
|
|
|
# then just advance it one |
225
|
1
|
|
|
|
|
3
|
$self->nextToken(); |
226
|
|
|
|
|
|
|
# and return success |
227
|
1
|
|
|
|
|
4
|
return 1; |
228
|
|
|
|
|
|
|
} |
229
|
|
|
|
|
|
|
# if it doesn't match our current token then, ... |
230
|
|
|
|
|
|
|
# store the index we start at |
231
|
2
|
|
|
|
|
4
|
my $old_index = $self->{index}; |
232
|
|
|
|
|
|
|
# and loop through the tokens |
233
|
2
|
|
|
|
|
4
|
while ($self->hasNextToken()) { |
234
|
|
|
|
|
|
|
# return success if we match our token |
235
|
18
|
100
|
|
|
|
17
|
return 1 if ($self->nextToken() eq $token_to_match); |
236
|
|
|
|
|
|
|
} |
237
|
|
|
|
|
|
|
# otherwise we didn't match, and should |
238
|
|
|
|
|
|
|
# reset back to where we started, and ... |
239
|
1
|
|
|
|
|
2
|
$self->{index} = $old_index; |
240
|
|
|
|
|
|
|
# return failure |
241
|
1
|
|
|
|
|
3
|
return 0; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
sub skipTokenIfWhitespace { |
245
|
4
|
|
|
4
|
|
6
|
my ($self) = @_; |
246
|
4
|
100
|
|
|
|
6
|
$self->{index}++ if $self->lookAheadToken() =~ /^\s+$/; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub skipTokens { |
250
|
13
|
|
|
13
|
|
21
|
my ($self, $num_token_to_skip) = @_; |
251
|
13
|
|
100
|
|
|
29
|
$num_token_to_skip ||= 1; |
252
|
13
|
|
|
|
|
16
|
$self->{index} += $num_token_to_skip; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
*skipToken = \&skipTokens; |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
1; |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
__END__ |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head1 NAME |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
String::Tokenizer - A simple string tokenizer. |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=head1 SYNOPSIS |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
use String::Tokenizer; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# create the tokenizer and tokenize input |
270
|
|
|
|
|
|
|
my $tokenizer = String::Tokenizer->new("((5+5) * 10)", '+*()'); |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
# create tokenizer |
273
|
|
|
|
|
|
|
my $tokenizer = String::Tokenizer->new(); |
274
|
|
|
|
|
|
|
# ... then tokenize the string |
275
|
|
|
|
|
|
|
$tokenizer->tokenize("((5 + 5) - 10)", '()'); |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
# will print '(, (, 5, +, 5, ), -, 10, )' |
278
|
|
|
|
|
|
|
print join ", " => $tokenizer->getTokens(); |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
# create tokenizer which retains whitespace |
281
|
|
|
|
|
|
|
my $st = String::Tokenizer->new( |
282
|
|
|
|
|
|
|
'this is a test with, (significant) whitespace', |
283
|
|
|
|
|
|
|
',()', |
284
|
|
|
|
|
|
|
String::Tokenizer->RETAIN_WHITESPACE |
285
|
|
|
|
|
|
|
); |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
# this will print: |
288
|
|
|
|
|
|
|
# 'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace' |
289
|
|
|
|
|
|
|
print "'" . (join "', '" => $st->getTokens()) . "'"; |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
# get a token iterator |
292
|
|
|
|
|
|
|
my $i = $tokenizer->iterator(); |
293
|
|
|
|
|
|
|
while ($i->hasNextToken()) { |
294
|
|
|
|
|
|
|
my $next = $i->nextToken(); |
295
|
|
|
|
|
|
|
# peek ahead at the next token |
296
|
|
|
|
|
|
|
my $look_ahead = $i->lookAheadToken(); |
297
|
|
|
|
|
|
|
# ... |
298
|
|
|
|
|
|
|
# skip the next 2 tokens |
299
|
|
|
|
|
|
|
$i->skipTokens(2); |
300
|
|
|
|
|
|
|
# ... |
301
|
|
|
|
|
|
|
# then backtrack 1 token |
302
|
|
|
|
|
|
|
my $previous = $i->prevToken(); |
303
|
|
|
|
|
|
|
# ... |
304
|
|
|
|
|
|
|
# get the current token |
305
|
|
|
|
|
|
|
my $current = $i->currentToken(); |
306
|
|
|
|
|
|
|
# ... |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
=head1 DESCRIPTION |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
A simple string tokenizer which takes a string and splits it on whitespace. It also optionally takes a string of characters to use as delimiters, and returns them with the token set as well. This allows for splitting the string in many different ways. |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
This is a very basic tokenizer, so more complex needs should be either addressed with a custom written tokenizer or post-processing of the output generated by this module. Basically, this will not fill everyone's needs, but it spans a gap between simple C<split / /, $string> and the other options that involve much larger and complex modules. |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
Also note that this is not a lexical analyser. Many people confuse tokenization with lexical analysis. A tokenizer merely splits its input into specific chunks, a lexical analyzer classifies those chunks. Sometimes these two steps are combined, but not here. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head1 METHODS |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
=over 4 |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
=item B<new ($string, $delimiters, $handle_whitespace)> |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
If you do not supply any parameters, nothing happens, the instance is just created. But if you do supply parameters, they are passed on to the C<tokenize> method and that method is run. For information about those arguments, see C<tokenize> below. |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=item B<setDelimiter ($delimiter)> |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
This can be used to set the delimiter string, this is used by C<tokenize>. |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=item B<handleWhitespace ($value)> |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
This can be used to set the whitespace handling. It accepts one of the two constant values C<RETAIN_WHITESPACE> or C<IGNORE_WHITESPACE>. |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=item B<tokenize ($string, $delimiters, $handle_whitespace)> |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Takes a C<$string> to tokenize, and optionally a set of C<$delimiter> characters to facilitate the tokenization and the type of whitespace handling with C<$handle_whitespace>. The C<$string> parameter and the C<$handle_whitespace> parameter are pretty obvious, the C<$delimiter> parameter is not as transparent. C<$delimiter> is a string of characters, these characters are then separated into individual characters and are used to split the C<$string> with. So given this string: |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
(5 + (100 * (20 - 35)) + 4) |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
The C<tokenize> method without a C<$delimiter> parameter would return the following comma separated list of tokens: |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
'(5', '+', '(100', '*', '(20', '-', '35))', '+', '4)' |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
However, if you were to pass the following set of delimiters C<(, )> to C<tokenize>, you would get the following comma separated list of tokens: |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
'(', '5', '+', '(', '100', '*', '(', '20', '-', '35', ')', ')', '+', '4', ')' |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
We now can differentiate the parens from the numbers, and no globbing occurs. If you wanted to allow for optionally leaving out the whitespace in the expression, like this: |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
(5+(100*(20-35))+4) |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
as some languages do. Then you would give this delimiter C<+*-()> to arrive at the same result. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
If you decide that whitespace is significant in your string, then you need to specify that like this: |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
my $st = String::Tokenizer->new( |
356
|
|
|
|
|
|
|
'this is a test with, (significant) whitespace', |
357
|
|
|
|
|
|
|
',()', |
358
|
|
|
|
|
|
|
String::Tokenizer->RETAIN_WHITESPACE |
359
|
|
|
|
|
|
|
); |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
A call to C<getTokens> on this instance would result in the following token set. |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
'this', ' ', 'is', ' ', 'a', ' ', 'test', ' ', 'with', ' ', '(', 'significant', ')', ' ', 'whitespace' |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
All running whitespace is grouped together into a single token, we make no attempt to split it into its individual parts. |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
=item B<getTokens> |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Simply returns the array of tokens. It returns an array-ref in scalar context. |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
=item B<iterator> |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Returns a B<String::Tokenizer::Iterator> instance, see below for more details. |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=back |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=head1 INNER CLASS |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
A B<String::Tokenizer::Iterator> instance is returned from the B<String::Tokenizer>'s C<iterator> method and serves as yet another means of iterating through an array of tokens. The simplest way would be to call C<getTokens> and just manipulate the array yourself, or push the array into another object. However, iterating through a set of tokens tends to get messy when done manually. So here I have provided the B<String::Tokenizer::Iterator> to address those common token processing idioms. It is basically a bi-directional iterator which can look ahead, skip and be reset to the beginning. |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
B<NOTE:> |
382
|
|
|
|
|
|
|
B<String::Tokenizer::Iterator> is an inner class, which means that only B<String::Tokenizer> objects can create an instance of it. That said, if B<String::Tokenizer::Iterator>'s C<new> method is called from outside of the B<String::Tokenizer> package, an exception is thrown. |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=over 4 |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=item B<new ($tokens_array_ref)> |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
This accepts an array reference of tokens and sets up the iterator. This method can only be called from within the B<String::Tokenizer> package, otherwise an exception will be thrown. |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=item B<reset> |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
This will reset the internal counter, |
393
|
|
|
|
|
|
|
bringing it back to the beginning of the token list. |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=item B<hasNextToken> |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
This will return true (1) if there are more tokens to be iterated over, |
398
|
|
|
|
|
|
|
and false (0) otherwise. |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=item B<hasPrevToken> |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
This will return true (1) if there are tokens before the current position (that is, the iterator is not at the beginning of the token list), and false (0) otherwise. |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
=item B<nextToken> |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
This dispenses the next available token, and move the internal counter ahead by one. |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=item B<prevToken> |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
This dispenses the previous token, and moves the internal counter back by one. |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=item B<currentToken> |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
This returns the current token, which will match the last token retrieved by C<nextToken>. |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
=item B<lookAheadToken> |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
This peeks ahead one token to the next one in the list. This item will match the next item dispensed with C<nextToken>. This is a non-destructive look ahead, meaning it does not alter the position of the internal counter. |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
=item B<skipToken> |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
This will jump the internal counter ahead by 1. |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
=item B<skipTokens ($number_to_skip)> |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
This will jump the internal counter ahead by C<$number_to_skip>. |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item B<skipTokenIfWhitespace> |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
This will skip the next token if it is whitespace. |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=item B<skipTokensUntil ($token_to_match)> |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Given a string as a C<$token_to_match>, this will skip all tokens until it matches that string. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state. |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=item B<collectTokensUntil ($token_to_match)> |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
Given a string as a C<$token_to_match>, this will collect all tokens until it matches that string, at which point the collected tokens will be returned. If the C<$token_to_match> is never matched, then the iterator will return the internal pointer to its initial state and no tokens will be returned. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=back |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head1 TO DO |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=over 4 |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=item I<Inline token expansion> |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
The Java StringTokenizer class allows for a token to be tokenized further, therefore breaking it up more and including the results into the current token stream. I have never used this feature in this class, but I can see where it might be a useful one. This may be in the next release if it works out. |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Possibly complement this expansion with compression as well, so for instance double quoted strings could be compressed into a single token. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=item I<Token Bookmarks> |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Allow for the creation of "token bookmarks". Meaning we could tag a specific token with a label, that index could be returned to from any point in the token stream. We could mix this with a memory stack as well, so that we would have an ordering to the bookmarks as well. |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=back |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=head1 BUGS |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
None that I am aware of. Of course, if you find a bug, let me know, and I will be sure to fix it. |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
=head1 CODE COVERAGE |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
I use B<Devel::Cover> to test the code coverage of my tests, below is the B<Devel::Cover> report on this module's test suite. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
467
|
|
|
|
|
|
|
File stmt branch cond sub pod time total |
468
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
469
|
|
|
|
|
|
|
String/Tokenizer.pm 100.0 100.0 64.3 100.0 100.0 100.0 97.6 |
470
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
471
|
|
|
|
|
|
|
Total 100.0 100.0 64.3 100.0 100.0 100.0 97.6 |
472
|
|
|
|
|
|
|
------------------------ ------ ------ ------ ------ ------ ------ ------ |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=head1 SEE ALSO |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
The interface and workings of this module are based largely on the StringTokenizer class from the Java standard library. |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
Below is a short list of other modules that might be considered similar to this one. If this module does not suit your needs, you might look at one of these. |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=over 4 |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
=item L<String::Tokeniser> |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
Along with being a tokenizer, |
485
|
|
|
|
|
|
|
it also provides a means of moving through the resulting tokens, |
486
|
|
|
|
|
|
|
allowing for skipping of tokens and such. |
487
|
|
|
|
|
|
|
It was last updated in 2011. |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=item L<Parse::Tokens> |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
This one hasn't been touched since 2001, |
492
|
|
|
|
|
|
|
although it did get up to version 0.27. |
493
|
|
|
|
|
|
|
It looks to lean over more towards the parser side than a basic tokenizer. |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
=item L<Text::Tokenizer> |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
This is both a lexical analyzer and a tokenizer. |
498
|
|
|
|
|
|
|
It also uses XS, where String::Tokenizer is pure perl. |
499
|
|
|
|
|
|
|
This is something maybe to look into if you were to need a more beefy solution |
500
|
|
|
|
|
|
|
than String::Tokenizer provides. |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=back |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
=head1 THANKS |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
=over |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=item Thanks to Stephan Tobias for finding bugs and suggestions on whitespace handling. |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=back |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=head1 AUTHOR |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
stevan little, E<lt>stevan@cpan.orgE<gt> |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Copyright 2004-2016 by Infinity Interactive, Inc. |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
L<http://www.iinteractive.com> |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
523
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=cut |