line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Zoidberg::StringParser; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
# Hic sunt leones. |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
our $VERSION = '0.981'; |
6
|
|
|
|
|
|
|
|
7
|
20
|
|
|
20
|
|
32756
|
use strict; |
|
20
|
|
|
|
|
28
|
|
|
20
|
|
|
|
|
700
|
|
8
|
20
|
|
|
20
|
|
121
|
no warnings; # can't stand the nagging |
|
20
|
|
|
|
|
24
|
|
|
20
|
|
|
|
|
924
|
|
9
|
20
|
|
|
20
|
|
669
|
use Zoidberg::Utils qw/debug error bug/; |
|
20
|
|
|
|
|
78
|
|
|
20
|
|
|
|
|
129
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $ERROR_CALLER = 1; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# TODO : |
14
|
|
|
|
|
|
|
# esc per type ? |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# how bout more general state machine approach, |
17
|
|
|
|
|
|
|
# making QUOTE and NEST operations like CUT, POP and RECURS |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# grammar can be big hash (sort keys on length) .. how to deal with regexes than ? |
20
|
|
|
|
|
|
|
# ... optimise for normal string tokens, regexes are the exception |
21
|
|
|
|
|
|
|
# need seperate hashes for overloading |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# how bout ->for(gram, string, int, sub) ? exec sub on token with most parser vars in scope |
24
|
|
|
|
|
|
|
# %state ? |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
sub new { |
27
|
23
|
|
|
23
|
1
|
82754
|
my $class = shift; |
28
|
23
|
|
50
|
|
|
1043
|
my $self = { |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
29
|
|
|
|
|
|
|
base_gram => shift || {}, |
30
|
|
|
|
|
|
|
collection => shift || {}, |
31
|
|
|
|
|
|
|
settings => shift || {}, |
32
|
|
|
|
|
|
|
}; |
33
|
23
|
|
|
|
|
71
|
bless $self, $class; |
34
|
23
|
|
|
|
|
89
|
return $self; |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
sub split { |
38
|
1551
|
|
|
1551
|
1
|
36500
|
my ($self, $gram, $input, $int) = @_; |
39
|
1551
|
|
|
|
|
2305
|
$int--; # 1 based => 0 based |
40
|
|
|
|
|
|
|
|
41
|
1551
|
|
|
|
|
3516
|
$$self{broken} = undef; # reset error |
42
|
|
|
|
|
|
|
|
43
|
1551
|
|
|
|
|
8375
|
debug "splitting with $gram"; |
44
|
1551
|
100
|
|
|
|
4949
|
unless (ref $gram) { |
|
|
50
|
|
|
|
|
|
45
|
1548
|
50
|
|
|
|
6023
|
error "No such grammar: $gram" unless $$self{collection}{$gram}; |
46
|
1548
|
|
|
|
|
5514
|
$gram = [$$self{collection}{$gram}] |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
elsif (ref($gram) eq 'ARRAY') { |
49
|
0
|
|
|
|
|
0
|
my $error; |
50
|
0
|
0
|
0
|
|
|
0
|
$gram = [ map { |
51
|
0
|
|
|
|
|
0
|
ref($_) ? $_ : ($$self{collection}{$_} || $error++) |
52
|
|
|
|
|
|
|
} @$gram ]; |
53
|
0
|
0
|
|
|
|
0
|
error "No such grammar: $_" if $error; |
54
|
|
|
|
|
|
|
} |
55
|
3
|
|
|
|
|
7
|
else { $gram = [$gram] } # hash or regex |
56
|
1551
|
|
|
|
|
3843
|
unshift @$gram, $$self{base_gram}; |
57
|
|
|
|
|
|
|
|
58
|
1551
|
|
|
|
|
2050
|
my ($expr, $types); |
59
|
1551
|
|
|
|
|
4142
|
($gram, $expr, $types) = $self->_prepare_gram($gram); |
60
|
|
|
|
|
|
|
# use Data::Dumper; print STDERR Dumper $gram, $expr, $types; |
61
|
|
|
|
|
|
|
|
62
|
1551
|
|
|
|
|
4588
|
my $string; |
63
|
1551
|
100
|
|
|
|
3507
|
if (ref($input) eq 'ARRAY') { $string = shift @$input } |
|
10
|
|
|
|
|
18
|
|
64
|
1541
|
|
|
|
|
4710
|
else { ($string, $input) = ("$input", []) } # quotes in case of overload |
65
|
|
|
|
|
|
|
|
66
|
1551
|
100
|
66
|
|
|
6431
|
return unless length $string or @$input; |
67
|
|
|
|
|
|
|
|
68
|
1550
|
|
|
|
|
1708
|
my ($block, @parts, @open, $i, $s_i); # $i counts splitted parts, $s_i the stack size |
69
|
|
|
|
|
|
|
|
70
|
2039
|
|
|
|
|
16370
|
PARSE_TOKEN: |
71
|
|
|
|
|
|
|
debug 'splitting string: '.$string; |
72
|
|
|
|
|
|
|
|
73
|
2039
|
|
|
|
|
4067
|
my ($token, $type, $sign); |
74
|
2039
|
|
66
|
|
|
90811
|
while ( !$token && $string =~ s{\A(.*?)($expr\z)}{}s ) { |
75
|
2639
|
100
|
|
|
|
16779
|
$block .= $1 if length $1; |
76
|
2639
|
|
|
|
|
6011
|
$sign = $2; |
77
|
|
|
|
|
|
|
|
78
|
2639
|
|
|
|
|
3924
|
my $i = 0; |
79
|
2639
|
100
|
|
|
|
15792
|
($_ eq $2) ? last : $i++ for ($3, $4, $5); |
80
|
2639
|
|
|
|
|
4588
|
$type = $$types[$i]; |
81
|
|
|
|
|
|
|
|
82
|
2639
|
100
|
66
|
|
|
11895
|
last unless length $sign or length $string; # catch the \z |
83
|
|
|
|
|
|
|
|
84
|
1312
|
100
|
|
|
|
2958
|
if ($type eq 'd_esc') { |
85
|
9
|
|
|
|
|
36
|
debug "block: ==>$block<== token: ==>$sign<== type: $type"; |
86
|
9
|
|
|
|
|
14
|
$block .= $sign; |
87
|
9
|
|
|
|
|
74
|
next; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# fetch token |
91
|
1303
|
|
|
|
|
1415
|
my $item; |
92
|
1303
|
|
|
|
|
6029
|
my ($slice) = grep exists($$_{$type}), reverse @$gram; |
93
|
1303
|
100
|
|
|
|
4031
|
if (ref($$slice{$type}[1]) eq 'ARRAY') { # for loop probably faster |
94
|
771
|
100
|
|
|
|
6180
|
($item) = map $$_[1], |
95
|
488
|
|
|
|
|
1258
|
grep {ref($$_[0]) ? ($sign =~ $$_[0]) : ($sign eq $$_[0])} |
96
|
488
|
|
|
|
|
860
|
@{$$slice{$type}[1]} |
97
|
|
|
|
|
|
|
} |
98
|
815
|
|
|
|
|
2252
|
else { $item = $$slice{$type}[1]{$sign} } |
99
|
1303
|
|
|
|
|
8159
|
debug "block: ==>$block<== token: ==>$sign<== type: $type item: $item"; |
100
|
1303
|
100
|
|
|
|
6979
|
$item = $sign if $item eq '_SELF'; |
101
|
|
|
|
|
|
|
|
102
|
1303
|
100
|
66
|
|
|
14124
|
if (exists $$slice{s_esc} and $1 =~ /$$slice{s_esc}$/) { |
103
|
12
|
|
|
|
|
83
|
debug 'escaped token s_esc: '.$$slice{s_esc}; |
104
|
12
|
100
|
100
|
|
|
84
|
$block =~ s/$$slice{s_esc}$// |
105
|
|
|
|
|
|
|
if $type eq 'tokens' and ! $$self{settings}{no_esc_rm}; |
106
|
12
|
|
|
|
|
26
|
$block .= $sign; |
107
|
12
|
|
|
|
|
136
|
next; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
1291
|
100
|
|
|
|
2809
|
if ($type eq 'tokens') { |
111
|
891
|
100
|
|
|
|
1718
|
unless ($s_i) { |
112
|
493
|
50
|
|
|
|
1046
|
if (ref $item) { # for $() matching tactics |
113
|
0
|
|
|
|
|
0
|
debug 'push stack (tokens)'; |
114
|
0
|
|
|
|
|
0
|
push @$gram, $item; |
115
|
0
|
|
|
|
|
0
|
$s_i++; |
116
|
0
|
|
|
|
|
0
|
($gram, $expr, $types) = $self->_prepare_gram($gram); |
117
|
0
|
|
|
|
|
0
|
@open = ($sign, $type); |
118
|
0
|
|
|
|
|
0
|
$token = $$gram[-1]{token}; |
119
|
|
|
|
|
|
|
} |
120
|
493
|
|
|
|
|
819
|
else { $token = $item } |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
else { |
123
|
398
|
50
|
|
|
|
1558
|
if ($item eq '_POP') { |
|
|
0
|
|
|
|
|
|
124
|
398
|
|
|
|
|
816
|
$block .= $sign; |
125
|
398
|
|
|
|
|
1716
|
debug "pop stack ($item)"; |
126
|
398
|
|
|
|
|
874
|
pop @$gram; |
127
|
398
|
|
|
|
|
621
|
$s_i--; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
elsif ($item eq '_CUT') { # for $() matching |
130
|
0
|
|
|
|
|
0
|
$token = $item; |
131
|
0
|
|
|
|
|
0
|
debug "cut stack ($item)"; |
132
|
0
|
|
|
|
|
0
|
splice @$gram, -$s_i; |
133
|
0
|
|
|
|
|
0
|
$s_i = 0; |
134
|
|
|
|
|
|
|
} |
135
|
0
|
|
|
|
|
0
|
else { bug "what to do with $item !?" } |
136
|
398
|
|
|
|
|
1058
|
($gram, $expr, $types) = $self->_prepare_gram($gram); |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
else { # open nest or quote |
140
|
400
|
|
|
|
|
545
|
$block .= $sign; |
141
|
400
|
50
|
|
|
|
839
|
unless (ref $item) { |
142
|
400
|
100
|
|
|
|
866
|
if ($item eq '_REC') { $item = {} } # recurs UGLY |
|
48
|
|
|
|
|
173
|
|
143
|
|
|
|
|
|
|
else { # generate a grammar on the fly |
144
|
352
|
100
|
|
|
|
2990
|
$item = ($type eq 'nests') |
145
|
|
|
|
|
|
|
? { |
146
|
|
|
|
|
|
|
tokens => {$item => '_POP'}, |
147
|
|
|
|
|
|
|
nests => {$sign => '_REC'}, |
148
|
|
|
|
|
|
|
} : { |
149
|
|
|
|
|
|
|
tokens => {$item => '_POP'}, |
150
|
|
|
|
|
|
|
quotes => {$sign => '_REC'}, |
151
|
|
|
|
|
|
|
nests => {}, |
152
|
|
|
|
|
|
|
} ; |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
# else if item is ref => item is grammar |
156
|
400
|
|
|
|
|
1742
|
debug "push stack ($type)"; |
157
|
400
|
|
|
|
|
1090
|
push @$gram, $item; |
158
|
400
|
|
|
|
|
580
|
$s_i++; |
159
|
400
|
|
|
|
|
1691
|
($gram, $expr, $types) = $self->_prepare_gram($gram); |
160
|
400
|
|
|
|
|
1509
|
@open = ($sign, $type); |
161
|
|
|
|
|
|
|
} |
162
|
1291
|
100
|
|
|
|
41642
|
last unless length $string; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
2039
|
100
|
|
|
|
4845
|
if (length $block) { |
166
|
1979
|
|
|
|
|
3782
|
my $part = $block; # force copy |
167
|
1979
|
|
|
|
|
4836
|
push @parts, \$part; |
168
|
|
|
|
|
|
|
} |
169
|
2039
|
100
|
100
|
|
|
6595
|
if ($token and $token ne '_CUT') { push @parts, $token } |
|
84
|
|
|
|
|
150
|
|
170
|
2039
|
|
|
|
|
2854
|
$block = $token = undef; |
171
|
|
|
|
|
|
|
|
172
|
2039
|
100
|
100
|
|
|
16649
|
if (($s_i or ++$i != $int) and length($string) || scalar(@$input)) { |
|
|
100
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
173
|
489
|
100
|
|
|
|
1136
|
$string = shift @$input unless length $string; |
174
|
489
|
|
|
|
|
2545
|
goto PARSE_TOKEN; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
elsif ($i == $int) { |
177
|
1
|
|
|
|
|
4
|
my $part = join '', $string, @$input; |
178
|
1
|
|
|
|
|
3
|
push @parts, \$part; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
1550
|
100
|
|
|
|
3247
|
if ($s_i) { # broken |
182
|
2
|
|
|
|
|
7
|
debug 'stack not empty'; |
183
|
2
|
|
|
|
|
8
|
$open[1] =~ s/s$// ; |
184
|
2
|
|
|
|
|
6
|
$$self{broken} = "Unmatched $open[1] at end of input: $open[0]"; |
185
|
2
|
100
|
|
|
|
11
|
error $$self{broken} unless $$self{settings}{allow_broken}; |
186
|
1
|
|
|
|
|
7
|
pop @$gram for 1 .. $s_i; |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
|
189
|
1549
|
50
|
100
|
|
|
6360
|
return grep defined($_), map {ref($_) ? $$_ : $_} @parts |
|
527
|
100
|
|
|
|
8481
|
|
190
|
|
|
|
|
|
|
if $$gram[-1]{was_regexp} && ! $$self{settings}{no_split_intel}; |
191
|
1365
|
|
|
|
|
11631
|
return grep defined($_), @parts; |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
sub _prepare_gram { # index immediatly here |
195
|
2349
|
|
|
2349
|
|
3412
|
my ($self, $gram) = @_; |
196
|
2349
|
|
|
|
|
4008
|
my %index; |
197
|
2349
|
|
|
|
|
4118
|
for my $ref (@$gram) { # prepare grammars for usage |
198
|
5424
|
100
|
|
|
|
21454
|
if (ref($ref) eq 'Regexp') { |
|
|
50
|
|
|
|
|
|
199
|
185
|
|
|
|
|
1456
|
$ref = {tokens => [[$ref, '_CUT']], was_regexp => 1}; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
elsif (ref($ref) ne 'HASH') { |
202
|
0
|
|
|
|
|
0
|
error 'Grammar has wrong data type: '.ref($ref)."\n"; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
5424
|
100
|
|
|
|
15353
|
unless ($$ref{prepared}) { |
206
|
677
|
100
|
100
|
|
|
5579
|
if (exists $$ref{esc}) { |
|
|
100
|
|
|
|
|
|
207
|
21
|
50
|
|
|
|
276
|
$$ref{s_esc} = ref($$ref{esc}) ? $$ref{esc} |
208
|
|
|
|
|
|
|
: quotemeta $$ref{esc}; # single esc regexp |
209
|
21
|
|
|
|
|
169
|
$$ref{d_esc} = '('.($$ref{s_esc}x2).')|'; # double esc regexp |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
elsif (! exists $$ref{s_esc} and exists $index{s_esc}) { |
212
|
636
|
|
|
|
|
2036
|
$$ref{s_esc} = $index{s_esc}; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
677
|
|
|
|
|
1468
|
for (qw/tokens nests quotes/) { |
216
|
2031
|
100
|
|
|
|
5187
|
next unless exists $$ref{$_}; |
217
|
527
|
100
|
|
|
|
2110
|
my $expr = (ref($$ref{$_}) eq 'ARRAY') |
218
|
|
|
|
|
|
|
? join( '|', map { |
219
|
237
|
|
|
|
|
778
|
ref($$_[0]) ? $$_[0] : quotemeta($$_[0]) |
220
|
853
|
|
|
|
|
2958
|
} @{$$ref{$_}} ) |
221
|
1252
|
100
|
|
|
|
4463
|
: join( '|', map { quotemeta($_) } keys %{$$ref{$_}} ) ; |
|
1015
|
|
|
|
|
3915
|
|
222
|
1252
|
100
|
|
|
|
3667
|
$expr = $expr ? '('.$expr.')|' : ''; |
223
|
1252
|
|
|
|
|
4465
|
$$ref{$_} = [$expr, $$ref{$_}]; |
224
|
|
|
|
|
|
|
} |
225
|
677
|
|
|
|
|
8067
|
$$ref{prepared}++; |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
|
228
|
5424
|
|
|
|
|
37318
|
$index{$_} = $$ref{$_}[0] for grep exists($$ref{$_}), qw/tokens nests quotes/; |
229
|
5424
|
|
|
|
|
32735
|
$index{$_} = $$ref{$_} for grep exists($$ref{$_}), qw/s_esc d_esc/; |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
2349
|
|
|
|
|
13555
|
my ($expr, @types) = (''); |
233
|
2349
|
|
|
|
|
3920
|
for (qw/d_esc tokens nests quotes/) { |
234
|
9396
|
100
|
|
|
|
22438
|
next unless length $index{$_}; |
235
|
9108
|
|
|
|
|
17980
|
push @types, $_; |
236
|
9108
|
|
|
|
|
19403
|
$expr .= $index{$_}; |
237
|
|
|
|
|
|
|
} |
238
|
2349
|
|
|
|
|
12524
|
return $gram, $expr, \@types; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
1; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
__END__ |