line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Tokenize::Indented;
|
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
33002
|
use 5.006;
|
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
65
|
|
4
|
1
|
|
|
1
|
|
7
|
use strict;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
43
|
|
5
|
1
|
|
|
1
|
|
6
|
use warnings FATAL => 'all';
|
|
1
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
51
|
|
6
|
1
|
|
|
1
|
|
1694
|
use Iterator::Simple;
|
|
1
|
|
|
|
|
8012
|
|
|
1
|
|
|
|
|
64
|
|
7
|
1
|
|
|
1
|
|
1017
|
use Iterator::Simple::Lookahead;
|
|
1
|
|
|
|
|
6291
|
|
|
1
|
|
|
|
|
34
|
|
8
|
1
|
|
|
1
|
|
1254
|
use Data::Dumper;
|
|
1
|
|
|
|
|
9491
|
|
|
1
|
|
|
|
|
81
|
|
9
|
1
|
|
|
1
|
|
9
|
use Carp;
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
740
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Text::Tokenize::Indented - tokenize indented lines in text
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 VERSION
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Version 0.01
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.01';
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS
|
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
As part of the Decl language project (the windmill I've been tilting at since 2010), I end up working with
|
27
|
|
|
|
|
|
|
text a lot that is structured by indentation. Finally, I think, this module provides a solid underpinning
|
28
|
|
|
|
|
|
|
to working with that kind of text, in that it provides as convenient a tokenizer as possible.
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
It's based on L<Iterator::Simple::Lookahead>, meaning that it (1) does a lazy tokenization of a list passed
|
31
|
|
|
|
|
|
|
into it, and (2) provides a peek and unget so that you can easily chain tokenizers; if a given piece that has
|
32
|
|
|
|
|
|
|
already been identified turns out to break into multiple tokens, you simply tokenize it and push the subpieces
|
33
|
|
|
|
|
|
|
back into the stream for later retrieval as individual tokens.
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
This allows very nice compartmentalization of the details of parsing, leaving you a lot less to debug when
|
36
|
|
|
|
|
|
|
parsing more difficult items.
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
You use it like this:
|
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
use Text::Tokenize::Indented;
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
my $tok = Text::Tokenize::Indented->tokenize({tabs => 4}, <<'EOF', {tabs => 8}, $trailing_iterator);
|
43
|
|
|
|
|
|
|
text
|
44
|
|
|
|
|
|
|
text
|
45
|
|
|
|
|
|
|
text
|
46
|
|
|
|
|
|
|
text
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
text
|
49
|
|
|
|
|
|
|
EOF
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
(For instance.) This then returns the following token stream:
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
[0, 'text']
|
54
|
|
|
|
|
|
|
[0, 'text']
|
55
|
|
|
|
|
|
|
[3, 'text']
|
56
|
|
|
|
|
|
|
[3, 'text']
|
57
|
|
|
|
|
|
|
[-1]
|
58
|
|
|
|
|
|
|
[0, 'text']
|
59
|
|
|
|
|
|
|
(whatever the trailing iterator returns)
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
We might then chain another tokenizer onto this one which would tokenize the individual lines into more meaningful things.
|
62
|
|
|
|
|
|
|
Note that blank lines officially have an indentation of -1.
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head1 METHODS
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 new
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Creates a new tokenizer, with or without input. Any parameters are passed to C<input>.
|
69
|
|
|
|
|
|
|
The defaults for parameters are as follows: tabs=4 (a tab counts as 4 spaces), blank=qr/\s+/, newline=qr/\n/.
|
70
|
|
|
|
|
|
|
Any parameter can be changed mid-stream by sending a hashref into the input.
|
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
Returns an Iterator::Simple::Lookahead iterator that returns items from the input queue.
|
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=cut
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Constructor. Creates a tokenizer with default parameters, queues any
# supplied input through input(), and stores an Iterator::Simple::Lookahead
# object wrapping the token generator under $self->{iterator}.
#
# Parameters: the class name, followed by an optional list of input items
#             that is handed unchanged to input().
# Returns:    the blessed tokenizer object.
#
# Tokens produced by the generator:
#   [-1]              for a line consisting only of whitespace,
#   [$indent, $text]  for any other string, where $indent is the width of the
#                     leading whitespace after expanding tabs to
#                     $self->{tabs} spaces,
#   any other reference yielded by an input iterator is passed through as is.
sub new {
    my $class = shift;
    my $self  = bless {
        tabs    => 4,         # number of spaces a leading tab expands to
        blank   => qr/\s+/,   # default only; not read inside this constructor
        newline => qr/\n/,    # default only; not read inside this constructor
        queue   => [],        # pending inputs: iterators and parameter hashrefs
    }, $class;
    $self->input(@_) if @_;

    $self->{iterator} = Iterator::Simple::Lookahead->new(
        sub {
            SOURCE:
            while (@{ $self->{queue} }) {
                my $next = $self->{queue}->[0];

                # A hashref sitting directly in the queue is a parameter
                # update: apply it, drop it, and look at the next queue entry.
                if (ref $next eq 'HASH') {
                    $self->{$_} = $next->{$_} for keys %$next;
                    shift @{ $self->{queue} };
                    next SOURCE;
                }

                # Otherwise the queue entry is an iterator; drain it lazily,
                # one value per call of this generator.
                VALUE:
                while (defined(my $nextval = $next->())) {

                    # A hashref coming out of the iterator is also a parameter
                    # update. The values must be read from $nextval (the
                    # hashref), not from $next (the iterator itself).
                    if (ref $nextval eq 'HASH') {
                        $self->{$_} = $nextval->{$_} for keys %$nextval;
                        next VALUE;
                    }

                    # Any other reference is presumably a token produced by an
                    # upstream tokenizer; hand it on untouched.
                    return $nextval if ref $nextval;

                    # A plain string: split leading whitespace from content.
                    if ($nextval =~ /^(\s+)(.*)/) {
                        my ($white, $meat) = ($1, $2);

                        # Test the length, not the truth value, so that an
                        # indented line containing just "0" is not mistaken
                        # for a blank line.
                        return [-1] unless length $meat;

                        $white =~ s/\t/' ' x $self->{tabs}/ge;
                        return [length($white), $meat];
                    }

                    # NOTE(review): a completely empty string does not match
                    # the pattern above and is returned as [0, ''] rather than
                    # [-1]; confirm whether that is the intended treatment of
                    # blank lines.
                    return [0, $nextval];
                }

                # The iterator at the head of the queue is exhausted.
                shift @{ $self->{queue} };
            }

            # Queue empty: an explicit undef signals end of input, exactly as
            # the exhausted-iterator check above expects from its own sources.
            return undef;
        }
    );

    #print STDERR Dumper($self);
    return $self;
}
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head2 tokenize (@input)
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Creates a tokenizer with input, but instead of returning the object, returns only
|
139
|
|
|
|
|
|
|
the iterator. No new input can be added to this tokenizer, but normally you don't
|
140
|
|
|
|
|
|
|
care.
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=cut
|
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# Convenience wrapper: builds a tokenizer from the given arguments (passed
# straight to new(), so the first argument serves as the class name) and
# returns only its iterator. The tokenizer object itself is not returned, so
# the caller cannot queue further input on it.
sub tokenize {
    my $tokenizer = new(@_);
    return $tokenizer->{iterator};
}
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=head2 input
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
Input is where text is loaded up into the tokenizer. It takes a list of items, each of which can be
|
152
|
|
|
|
|
|
|
either a hashref, which will be used to set values in the tokenizer that apply to coming data,
|
153
|
|
|
|
|
|
|
a string, which will be split into lines, or an iterable object, which will be passed through
|
154
|
|
|
|
|
|
|
to the tokenizer output.
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
Returns the iterator for the object.
|
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=cut
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# Appends each argument to the tokenizer's input queue:
#   - a hashref is queued as is (a parameter update applied when reached),
#   - a plain scalar is split on newlines and queued as an iterator over
#     the resulting lines,
#   - anything else must satisfy Iterator::Simple::is_iterable and is queued
#     as an iterator; otherwise the method croaks.
# Returns whatever is stored in $self->{iterator}.
sub input {
    my $self = shift;

    for my $item (@_) {
        my $type = ref $item;

        # Parameter hashrefs go onto the queue untouched.
        if ($type eq 'HASH') {
            push @{ $self->{queue} }, $item;
            next;
        }

        # Plain strings are broken into lines and wrapped in an iterator.
        if ($type eq '') {
            push @{ $self->{queue} }, Iterator::Simple::iter([ split /\n/, $item ]);
            next;
        }

        # Everything else has to be something Iterator::Simple can iterate.
        Iterator::Simple::is_iterable($item)
            or croak "Non-iterable input supplied";
        push @{ $self->{queue} }, Iterator::Simple::iter($item);
    }

    return $self->{iterator};
}
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=head1 AUTHOR
|
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
Michael Roberts, C<< >>
|
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
=head1 BUGS
|
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-text-tokenize-indented at rt.cpan.org>, or through
|
183
|
|
|
|
|
|
|
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Tokenize-Indented>. I will be notified, and then you'll
|
184
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes.
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=head1 SUPPORT
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command.
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
perldoc Text::Tokenize::Indented
|
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
You can also look for information at:
|
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=over 4
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here)
|
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
L
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
L
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=item * CPAN Ratings
|
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
L
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=item * Search CPAN
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
L
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=back
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT
|
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Copyright 2014 Michael Roberts.
|
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
227
|
|
|
|
|
|
|
under the terms of the Artistic License (2.0). You may obtain a
|
228
|
|
|
|
|
|
|
copy of the full license at:
|
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
L
|
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
Any use, modification, and distribution of the Standard or Modified
|
233
|
|
|
|
|
|
|
Versions is governed by this Artistic License. By using, modifying or
|
234
|
|
|
|
|
|
|
distributing the Package, you accept this license. Do not use, modify,
|
235
|
|
|
|
|
|
|
or distribute the Package, if you do not accept this license.
|
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
If your Modified Version has been derived from a Modified Version made
|
238
|
|
|
|
|
|
|
by someone other than you, you are nevertheless required to ensure that
|
239
|
|
|
|
|
|
|
your Modified Version complies with the requirements of this license.
|
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
This license does not grant you the right to use any trademark, service
|
242
|
|
|
|
|
|
|
mark, tradename, or logo of the Copyright Holder.
|
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
This license includes the non-exclusive, worldwide, free-of-charge
|
245
|
|
|
|
|
|
|
patent license to make, have made, use, offer to sell, sell, import and
|
246
|
|
|
|
|
|
|
otherwise transfer the Package with respect to any patent claims
|
247
|
|
|
|
|
|
|
licensable by the Copyright Holder that are necessarily infringed by the
|
248
|
|
|
|
|
|
|
Package. If you institute patent litigation (including a cross-claim or
|
249
|
|
|
|
|
|
|
counterclaim) against any party alleging that the Package constitutes
|
250
|
|
|
|
|
|
|
direct or contributory patent infringement, then this Artistic License
|
251
|
|
|
|
|
|
|
to you shall terminate on the date that such litigation is filed.
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
|
254
|
|
|
|
|
|
|
AND CONTRIBUTORS "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
|
255
|
|
|
|
|
|
|
THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
256
|
|
|
|
|
|
|
PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
|
257
|
|
|
|
|
|
|
YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
|
258
|
|
|
|
|
|
|
CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
|
259
|
|
|
|
|
|
|
CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
|
260
|
|
|
|
|
|
|
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=cut
|
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
1; # End of Text::Tokenize::Indented
|