line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Unicode::SetAutomaton;
|
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
58728
|
use strict;
|
|
2
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
81
|
|
4
|
2
|
|
|
2
|
|
12
|
use warnings;
|
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
60
|
|
5
|
2
|
|
|
2
|
|
2216
|
use Set::IntSpan;
|
|
2
|
|
|
|
|
33448
|
|
|
2
|
|
|
|
|
140
|
|
6
|
2
|
|
|
2
|
|
2451
|
use Set::IntSpan::Partition;
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Storable qw(freeze);
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our $VERSION = '0.01';
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our @utf8hs = (0x00000000, 0x00000000, 0x0000c080, 0x00e08080, 0xf0808080);
|
12
|
|
|
|
|
|
|
our @utf8min = (0x00000000, 0x00000000, 0x0000C280, 0x00E0A080, 0xF0908080);
|
13
|
|
|
|
|
|
|
our @utf8max = (0x00000000, 0x0000007F, 0x0000DFBF, 0x00EFBFBF, 0xF48FBFBF);
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
sub _u8enc {
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $cp = shift;
|
18
|
|
|
|
|
|
|
my $ln = 4;
|
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Encodes code points as utf-8 integers;
|
21
|
|
|
|
|
|
|
# for example, U+00F6 becomes 0x0000C3B6
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
return $cp if $cp <= 0x7F;
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Spread the bits to their target locations
|
26
|
|
|
|
|
|
|
my $ret = (($cp << 0) & 0x0000003f) |
|
27
|
|
|
|
|
|
|
(($cp << 2) & 0x00003f00) |
|
28
|
|
|
|
|
|
|
(($cp << 4) & 0x003f0000) |
|
29
|
|
|
|
|
|
|
(($cp << 6) & 0x3f000000) ;
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# Count the length
|
32
|
|
|
|
|
|
|
$ln -= $cp <= 0xFFFF;
|
33
|
|
|
|
|
|
|
$ln -= $cp <= 0x07FF;
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Merge the spread bits with the mode bits
|
36
|
|
|
|
|
|
|
return $ret | $utf8hs[$ln];
|
37
|
|
|
|
|
|
|
}
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub _get_info {
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
my $u8 = shift;
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
my $width = 4;
|
44
|
|
|
|
|
|
|
$width -= $u8 <= 0xFFFFFF;
|
45
|
|
|
|
|
|
|
$width -= $u8 <= 0xFFFF;
|
46
|
|
|
|
|
|
|
$width -= $u8 <= 0xFF;
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
my $s1 = ($width - 1) * 8;
|
49
|
|
|
|
|
|
|
my $s2 = (4 - $width) * 8;
|
50
|
|
|
|
|
|
|
my $ml = 0x00808080 >> $s2;
|
51
|
|
|
|
|
|
|
my $mh = 0x00BFBFBF >> $s2;
|
52
|
|
|
|
|
|
|
my $xl = 0x00800000 >> $s2;
|
53
|
|
|
|
|
|
|
my $xh = 0x00BF0000 >> $s2;
|
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# The first byte in the partial utf-8 sequence in u8.
|
56
|
|
|
|
|
|
|
my $head = $u8 >> $s1;
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# All bytes after the first, or zero if there are none.
|
59
|
|
|
|
|
|
|
my $tail = ($u8 & $mh);
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# Indicates whether the first byte after head is 0x80.
|
62
|
|
|
|
|
|
|
my $islow = ($u8 & $xh) == $xl;
|
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# Indicates whether the first byte after head is 0xBF.
|
65
|
|
|
|
|
|
|
my $isupp = ($u8 & $xh) == $xh;
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# Last partial sequence before sequences with head
|
68
|
|
|
|
|
|
|
my $pmax = ($head - 1) << $s1 | $mh;
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# First partial sequence for sequences with head
|
71
|
|
|
|
|
|
|
my $pmin = ($head + 0) << $s1 | $ml;
|
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# Last partial sequence for sequences with head
|
74
|
|
|
|
|
|
|
my $hmax = ($head + 0) << $s1 | $mh;
|
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# First partial sequence after sequences with head
|
77
|
|
|
|
|
|
|
my $hmin = ($head + 1) << $s1 | $ml;
|
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# There are a few special cases for the min,max items
|
80
|
|
|
|
|
|
|
# if the respective head is not a continuation octet.
|
81
|
|
|
|
|
|
|
# E.g., for 0xE0 pmin should be E0A080 but is E08080.
|
82
|
|
|
|
|
|
|
# The caller handles them indirectly by splitting.
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
my $i = {
|
85
|
|
|
|
|
|
|
width => $width, head => $head, tail => $tail,
|
86
|
|
|
|
|
|
|
isLow => $islow, isUpp => $isupp, hmin => $hmin,
|
87
|
|
|
|
|
|
|
hmax => $hmax, pmin => $pmin, pmax => $pmax,
|
88
|
|
|
|
|
|
|
};
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
return $i;
|
91
|
|
|
|
|
|
|
}
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub _get_next {
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
my $iter = shift;
|
96
|
|
|
|
|
|
|
my ($clas, $cmin, $cmax, $nmin, $min, $max);
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
if ($iter->{splitix}) {
|
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# When splitting ranges a separate array is used to keep
|
101
|
|
|
|
|
|
|
# track of artificial ranges. If there any, use them first.
|
102
|
|
|
|
|
|
|
$cmax = $iter->{split}->[ -- $iter->{splitix} ];
|
103
|
|
|
|
|
|
|
$cmin = $iter->{split}->[ -- $iter->{splitix} ];
|
104
|
|
|
|
|
|
|
$clas = $iter->{split}->[ -- $iter->{splitix} ];
|
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
} elsif ( $iter->{derivix} <= $iter->{end} ) {
|
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# If there are none, we pick a new range from the input.
|
109
|
|
|
|
|
|
|
$clas = $iter->{deriv}->[ $iter->{derivix} ++ ];
|
110
|
|
|
|
|
|
|
$cmin = $iter->{deriv}->[ $iter->{derivix} ++ ];
|
111
|
|
|
|
|
|
|
$cmax = $iter->{deriv}->[ $iter->{derivix} ++ ];
|
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
} else {
|
114
|
|
|
|
|
|
|
return
|
115
|
|
|
|
|
|
|
}
|
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Compute various properties of the partial sequences.
|
118
|
|
|
|
|
|
|
$min = _get_info($cmin);
|
119
|
|
|
|
|
|
|
$max = _get_info($cmax);
|
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
if ($min->{width} != $max->{width}) {
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# The range crosses width boundaries, so split it.
|
124
|
|
|
|
|
|
|
$nmin = $utf8min[ $min->{width} + 1 ];
|
125
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
|
126
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
|
127
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
|
128
|
|
|
|
|
|
|
$cmax = $utf8max[ $min->{width} ];
|
129
|
|
|
|
|
|
|
}
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
if ($cmin >= 0x00eda080 and $cmax <= 0x00edbfbf) {
|
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# The current range contains only surrogate code points
|
134
|
|
|
|
|
|
|
# which are not allowed. So get the next range, if any.
|
135
|
|
|
|
|
|
|
return _get_next( $iter );
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
} elsif ($cmin >= 0x00eda080 and $cmin <= 0x00edbfbf) {
|
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# cmin is somewhere inside the surrogate range and cmax
|
140
|
|
|
|
|
|
|
# is not. So we set cmin to the first non-surrogate.
|
141
|
|
|
|
|
|
|
$cmin = 0x00ee8080;
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
} elsif ($cmax >= 0x00eda080 and $cmax <= 0x00edbfbf) {
|
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# cmax is somewhere inside the surrogate range and cmin
|
146
|
|
|
|
|
|
|
# is not. So we set cmax to the last non-surrogate.
|
147
|
|
|
|
|
|
|
$cmax = 0x00ed9fbf;
|
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
} elsif ($cmin < 0x00eda080 and $cmax > 0x00edbfbf) {
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# The range includes code points before and after the
|
152
|
|
|
|
|
|
|
# surrogate range. So we have to split it into two.
|
153
|
|
|
|
|
|
|
$nmin = 0x00ee8080;
|
154
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
|
155
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
|
156
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
|
157
|
|
|
|
|
|
|
$cmax = 0x00ed9fbf;
|
158
|
|
|
|
|
|
|
}
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
# cmin and cmax may have changed so recompute the info
|
161
|
|
|
|
|
|
|
$min = _get_info($cmin);
|
162
|
|
|
|
|
|
|
$max = _get_info($cmax);
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
if (!($min->{head} == $max->{head}) && !($min->{isLow} && $max->{isUpp})) {
|
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
if ($min->{isLow}) {
|
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
# min is a lower and but max is not an upper end, so we split
|
169
|
|
|
|
|
|
|
# the range into two, one going from min to "one" before max,
|
170
|
|
|
|
|
|
|
# and the other going from the beginning of max's range to max.
|
171
|
|
|
|
|
|
|
$nmin = $max->{pmin};
|
172
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
|
173
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
|
174
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
|
175
|
|
|
|
|
|
|
$cmax = $max->{pmax};
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
} else {
|
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# if the heads are different and min is not a lower end then
|
180
|
|
|
|
|
|
|
# we have to complete min's range first, so split the range.
|
181
|
|
|
|
|
|
|
$nmin = $min->{hmin};
|
182
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $clas;
|
183
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $nmin;
|
184
|
|
|
|
|
|
|
$iter->{split}->[ $iter->{splitix} ++ ] = $cmax;
|
185
|
|
|
|
|
|
|
$cmax = $min->{hmax};
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
}
|
188
|
|
|
|
|
|
|
}
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# cmax may have changed so recompute the info
|
191
|
|
|
|
|
|
|
$max = _get_info($cmax);
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
return $clas, $min, $max;
|
194
|
|
|
|
|
|
|
}
|
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub _triples_to_dfa {
|
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
my @triples = @_;
|
199
|
|
|
|
|
|
|
my (@d, @todo, $d2s, $s2d);
|
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# The deriv array stores all character class, or rather, utf-8 range
|
202
|
|
|
|
|
|
|
# information. Each class stores the length of the subsequent data as
|
203
|
|
|
|
|
|
|
# first item. For classes representing end states the next value is
|
204
|
|
|
|
|
|
|
# the number of the class. For other classes a list of
|
205
|
|
|
|
|
|
|
# triples follows. Array references and other structures could be
|
206
|
|
|
|
|
|
|
# used instead, however this structure mirrors the C implementation.
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
my @deriv = ( scalar(@triples), @triples );
|
209
|
|
|
|
|
|
|
my $lengthix = scalar @deriv;
|
210
|
|
|
|
|
|
|
my $nextix = scalar @deriv + 1;
|
211
|
|
|
|
|
|
|
my $nextnum = 0;
|
212
|
|
|
|
|
|
|
my $start = $nextnum++;
|
213
|
|
|
|
|
|
|
my $obj2num = { freeze(\@triples), $start };
|
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
$deriv[$lengthix] = 0;
|
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
push @todo, [ $start, $start ];
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
while (@todo) {
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
my ($index, $currentS) = @{ pop @todo };
|
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
# A special iterator is used to go over the utf-8 ranges in a class.
|
224
|
|
|
|
|
|
|
# The basic idea of this algorithm is to compute all derivatives of
|
225
|
|
|
|
|
|
|
# a given class of utf-8 ranges; the iterator automatically splits
|
226
|
|
|
|
|
|
|
# these ranges such that we can take the "heads" of a range as label
|
227
|
|
|
|
|
|
|
# for a transition, and the "tails" as range for the next class.
|
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# As an example, consider a simple class from U+00E4 to U+00F6. The
|
230
|
|
|
|
|
|
|
# range is utf-8 encoded to 0x0000C3A4 .. 0x0000C3B6. Here the head
|
231
|
|
|
|
|
|
|
# range would be 0xC3 .. 0xC3 and the tail range 0xA4 .. 0xB6. Then
|
232
|
|
|
|
|
|
|
# -- 0xC3 .. 0xC3 --> <1>, <1> -- 0xA4 .. 0xB6 --> would be
|
233
|
|
|
|
|
|
|
# the automaton. See the _get_next routine for details on splitting.
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
my $iter = {
|
236
|
|
|
|
|
|
|
split => [],
|
237
|
|
|
|
|
|
|
splitix => 0,
|
238
|
|
|
|
|
|
|
deriv => \@deriv,
|
239
|
|
|
|
|
|
|
derivix => $index + 1,
|
240
|
|
|
|
|
|
|
end => $index + $deriv[$index],
|
241
|
|
|
|
|
|
|
};
|
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
my $prev;
|
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
while (1) {
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
my $success = my ($cls, $min, $max) = _get_next($iter);
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# If there are no more ranges in the current class, or if the latest
|
250
|
|
|
|
|
|
|
# head range is not equal to the previous one, we found the end of
|
251
|
|
|
|
|
|
|
# a new class. Note that head ranges can only be the same if min,max
|
252
|
|
|
|
|
|
|
# are equal for both the previous and the current range, so we only
|
253
|
|
|
|
|
|
|
# have to check the two min values.
|
254
|
|
|
|
|
|
|
if ((not $success) or (defined $prev && $prev->[1] ne $min->{head})) {
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# The hash table obj2num is used to ensure that the same classes
|
257
|
|
|
|
|
|
|
# are assigned the same state number; not doing so would result
|
258
|
|
|
|
|
|
|
# an automaton that is not minimal in the number of states, and
|
259
|
|
|
|
|
|
|
# minimizing it later would be considerably more costly.
|
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
my @new = @deriv[($lengthix + 1) .. ($lengthix + $deriv[$lengthix])];
|
262
|
|
|
|
|
|
|
my $ice = freeze(\@new);
|
263
|
|
|
|
|
|
|
my $num = $obj2num->{$ice};
|
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
if (not defined $num) {
|
266
|
|
|
|
|
|
|
$num = $obj2num->{$ice} = $nextnum++;
|
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
# End states store only the number of the associated class;
|
269
|
|
|
|
|
|
|
# end states do not have outgoing transitions so we do not
|
270
|
|
|
|
|
|
|
# add them to the todo list. Other classes are added to it.
|
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
if ($deriv[$lengthix] > 1) {
|
273
|
|
|
|
|
|
|
push @todo, [$lengthix, $num]
|
274
|
|
|
|
|
|
|
} else {
|
275
|
|
|
|
|
|
|
$s2d->{ $num } = $deriv[$lengthix + 1];
|
276
|
|
|
|
|
|
|
$d2s->[ $deriv[$lengthix + 1] ] = $num;
|
277
|
|
|
|
|
|
|
}
|
278
|
|
|
|
|
|
|
}
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
# Record the newly found transition in the transition table
|
281
|
|
|
|
|
|
|
# as four-tuple .
|
282
|
|
|
|
|
|
|
push @d, [ $currentS, $prev->[1], $prev->[2], $num ];
|
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
$lengthix = $nextix++;
|
285
|
|
|
|
|
|
|
$deriv[$lengthix] = 0;
|
286
|
|
|
|
|
|
|
last unless $success;
|
287
|
|
|
|
|
|
|
}
|
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
# min and max always have the same width. If the width of the
|
290
|
|
|
|
|
|
|
# current range is one then we are moving to an end state. If
|
291
|
|
|
|
|
|
|
# it is greater than one, we are creating a new partial class.
|
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
if ($min->{width} != 1) {
|
294
|
|
|
|
|
|
|
$deriv[ $nextix++ ] = $cls;
|
295
|
|
|
|
|
|
|
$deriv[ $nextix++ ] = $min->{tail};
|
296
|
|
|
|
|
|
|
$deriv[ $nextix++ ] = $max->{tail};
|
297
|
|
|
|
|
|
|
$deriv[ $lengthix ] += 3;
|
298
|
|
|
|
|
|
|
} else {
|
299
|
|
|
|
|
|
|
$deriv[ $nextix++ ] = $cls;
|
300
|
|
|
|
|
|
|
$deriv[ $lengthix ] += 1;
|
301
|
|
|
|
|
|
|
}
|
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
$prev = [ $cls, $min->{head}, $max->{head} ]
|
304
|
|
|
|
|
|
|
}
|
305
|
|
|
|
|
|
|
}
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
return $start, \@d, $d2s, $s2d;
|
308
|
|
|
|
|
|
|
}
|
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
sub new {
|
311
|
|
|
|
|
|
|
my $class = shift;
|
312
|
|
|
|
|
|
|
my %param = @_;
|
313
|
|
|
|
|
|
|
my $self = bless { }, $class;
|
314
|
|
|
|
|
|
|
my @input = @{$param{classes}};
|
315
|
|
|
|
|
|
|
my @spans;
|
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
# A deterministic finite automaton can only be in a single state
|
318
|
|
|
|
|
|
|
# at a time, so split the input classes minimally such that each
|
319
|
|
|
|
|
|
|
# code point belongs to at most a single class, not multiple ones.
|
320
|
|
|
|
|
|
|
my @disjoint = intspan_partition(@input);
|
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
# intspan_partition unfortunately does not keep track of how it
|
323
|
|
|
|
|
|
|
# splits classes; we'd like to know, so restore the information.
|
324
|
|
|
|
|
|
|
for (my $i = 0; $i <= $#disjoint; $i++) {
|
325
|
|
|
|
|
|
|
for (my $j = 0; $j <= $#input; $j++) {
|
326
|
|
|
|
|
|
|
next unless $disjoint[$i]->subset($input[$j]);
|
327
|
|
|
|
|
|
|
push @{$self->{disjoint_to_input}->[$i]}, $j;
|
328
|
|
|
|
|
|
|
}
|
329
|
|
|
|
|
|
|
}
|
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
# The construction algorithm considers all spans at once, so we
|
332
|
|
|
|
|
|
|
# collect all into a single array, noting where each belongs.
|
333
|
|
|
|
|
|
|
for (my $i = 0; $i <= $#disjoint; $i++) {
|
334
|
|
|
|
|
|
|
foreach my $span ($disjoint[$i]->spans) {
|
335
|
|
|
|
|
|
|
push @spans, [ $i, @$span ];
|
336
|
|
|
|
|
|
|
}
|
337
|
|
|
|
|
|
|
}
|
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# While not strictly necessary, it is better to sort the spans,
|
340
|
|
|
|
|
|
|
# so we do that here. Note that spans are disjoint, so we only
|
341
|
|
|
|
|
|
|
# have to compare the relevant minimum value for each span pair.
|
342
|
|
|
|
|
|
|
my @sorted = sort { $a->[1] <=> $b->[1] } @spans;
|
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
# Now we can generate a single list for all
|
345
|
|
|
|
|
|
|
# triples where min and max are utf-8 integers. It is easier to
|
346
|
|
|
|
|
|
|
# do this here then telling a complete class apart from partial
|
347
|
|
|
|
|
|
|
# classes generated later; the spans are not array references
|
348
|
|
|
|
|
|
|
# mainly because that mirrors the C implementation more closely.
|
349
|
|
|
|
|
|
|
my @u8triples = map {
|
350
|
|
|
|
|
|
|
$_->[0], _u8enc($_->[1]), _u8enc($_->[2])
|
351
|
|
|
|
|
|
|
} @sorted;
|
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
my ($start, $d, $d2s, $s2d) = _triples_to_dfa(@u8triples);
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
$self->{state_to_disjoint} = $s2d;
|
356
|
|
|
|
|
|
|
$self->{disjoint_to_state} = $d2s;
|
357
|
|
|
|
|
|
|
$self->{disjoint_classes} = \@disjoint;
|
358
|
|
|
|
|
|
|
$self->{input_classes} = \@input;
|
359
|
|
|
|
|
|
|
$self->{start_state} = $start;
|
360
|
|
|
|
|
|
|
$self->{transitions} = $d;
|
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
return $self;
|
363
|
|
|
|
|
|
|
}
|
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
sub _regex_append {
|
366
|
|
|
|
|
|
|
my $node = shift;
|
367
|
|
|
|
|
|
|
my $type = shift;
|
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
if (UNIVERSAL::isa($node, 'Set::IntSpan')) {
|
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
if ($node->size == 1) {
|
372
|
|
|
|
|
|
|
$_[0] .= sprintf "\\x%02x", $node->elements
|
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
} else {
|
375
|
|
|
|
|
|
|
$_[0] .= "[";
|
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
foreach my $span ($node->spans) {
|
378
|
|
|
|
|
|
|
if ($span->[0] == $span->[1]) {
|
379
|
|
|
|
|
|
|
$_[0] .= sprintf "\\x%02x", $span->[0]
|
380
|
|
|
|
|
|
|
} else {
|
381
|
|
|
|
|
|
|
$_[0] .= sprintf "\\x%02x-\\x%02x", @$span
|
382
|
|
|
|
|
|
|
}
|
383
|
|
|
|
|
|
|
}
|
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
$_[0] .= "]";
|
386
|
|
|
|
|
|
|
}
|
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
} elsif ($node->[0] eq 'Group') {
|
389
|
|
|
|
|
|
|
_regex_append($node->[1], 'Group', $_[0]);
|
390
|
|
|
|
|
|
|
_regex_append($node->[2], 'Group', $_[0]);
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
} elsif ($node->[0] eq 'Choice' and $type eq 'Group') {
|
393
|
|
|
|
|
|
|
$_[0] .= "(";
|
394
|
|
|
|
|
|
|
_regex_append($node->[1], 'Choice', $_[0]);
|
395
|
|
|
|
|
|
|
$_[0] .= "|";
|
396
|
|
|
|
|
|
|
_regex_append($node->[2], 'Choice', $_[0]);
|
397
|
|
|
|
|
|
|
$_[0] .= ")";
|
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
} elsif ($node->[0] eq 'Choice') {
|
400
|
|
|
|
|
|
|
_regex_append($node->[1], 'Choice', $_[0]);
|
401
|
|
|
|
|
|
|
$_[0] .= "|";
|
402
|
|
|
|
|
|
|
_regex_append($node->[2], 'Choice', $_[0]);
|
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
} else {
|
405
|
|
|
|
|
|
|
die
|
406
|
|
|
|
|
|
|
}
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
}
|
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
sub as_expressions {
|
411
|
|
|
|
|
|
|
my $self = shift;
|
412
|
|
|
|
|
|
|
my $last = 0;
|
413
|
|
|
|
|
|
|
my @m;
|
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
require Graph::Directed;
|
416
|
|
|
|
|
|
|
my $g = Graph::Directed->new;
|
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
# Convert the transitions into a matrix using Set::IntSpan objects
|
419
|
|
|
|
|
|
|
# to represent byte classes and use a graph to keep track of the
|
420
|
|
|
|
|
|
|
# predecessors and successors of each state. Would be nice if the
|
421
|
|
|
|
|
|
|
# Set::IntSpan::union method accepted undef as set to avoid the if.
|
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
foreach my $transition (@{ $self->{transitions} }) {
|
424
|
|
|
|
|
|
|
my ($src, $min, $max, $dst) = @$transition;
|
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
if (defined $m[$src][$dst]) {
|
427
|
|
|
|
|
|
|
$m[$src][$dst] = $m[$src][$dst]->union([[$min,$max]]);
|
428
|
|
|
|
|
|
|
} else {
|
429
|
|
|
|
|
|
|
$m[$src][$dst] = Set::IntSpan->new([[$min,$max]]);
|
430
|
|
|
|
|
|
|
}
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
$g->add_edge($src, $dst);
|
433
|
|
|
|
|
|
|
$last = $dst > $last ? $dst : $last;
|
434
|
|
|
|
|
|
|
}
|
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
# States will be eliminated in the reverse order of their creation.
|
437
|
|
|
|
|
|
|
# I am unsure if that produces the best result but could so far not
|
438
|
|
|
|
|
|
|
# find counter-examples. A more elaborate algorithm would make sure
|
439
|
|
|
|
|
|
|
# a state is removed before any, if that is possible, that must be
|
440
|
|
|
|
|
|
|
# visited before or after when going from start state to final state.
|
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
my @order = grep {
|
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# We only remove a state if it is neither the start state nor
|
445
|
|
|
|
|
|
|
# a final state. $self->{state_to_disjoint} holds final ones.
|
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
$_ != $self->{start_state} and
|
448
|
|
|
|
|
|
|
not exists $self->{state_to_disjoint}->{$_}
|
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
} (0 .. $last);
|
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
while (@order) {
|
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
my $curr = pop @order;
|
455
|
|
|
|
|
|
|
my @pred = $g->predecessors($curr);
|
456
|
|
|
|
|
|
|
my @succ = $g->successors($curr);
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
# A state is eliminated by connecting all predecessors with all
|
459
|
|
|
|
|
|
|
# successors by an increasingly complex regular expression. We
|
460
|
|
|
|
|
|
|
# store the regular expression as a binary tree to ease adding
|
461
|
|
|
|
|
|
|
# needed braces later. Note that the transition graph does not
|
462
|
|
|
|
|
|
|
# have cycles, otherwise we would have to encode the cycle too.
|
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
foreach my $pred (@pred) {
|
465
|
|
|
|
|
|
|
foreach my $succ (@succ) {
|
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
my $group = [ Group => $m[$pred][$curr], $m[$curr][$succ] ];
|
468
|
|
|
|
|
|
|
if ($m[$pred][$succ]) {
|
469
|
|
|
|
|
|
|
$m[$pred][$succ] = [ Choice => $m[$pred][$succ], $group ];
|
470
|
|
|
|
|
|
|
} else {
|
471
|
|
|
|
|
|
|
$m[$pred][$succ] = $group;
|
472
|
|
|
|
|
|
|
$g->add_edge($pred, $succ);
|
473
|
|
|
|
|
|
|
}
|
474
|
|
|
|
|
|
|
}
|
475
|
|
|
|
|
|
|
}
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
$g->delete_vertex($curr);
|
478
|
|
|
|
|
|
|
}
|
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
# Now the matrix has a regular expression for each of the dis-
|
481
|
|
|
|
|
|
|
# joint classes at m[start_state][final_state]. We iterate over
|
482
|
|
|
|
|
|
|
# the disjoint classes, pretty print the expression, and return
|
483
|
|
|
|
|
|
|
# them in the order of the disjoint classes.
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
my @expressions;
|
486
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
for (my $i = 0; $i <= $#{ $self->{disjoint_to_state} }; $i++) {
|
488
|
|
|
|
|
|
|
my $final = $self->{disjoint_to_state}->[$i];
|
489
|
|
|
|
|
|
|
my $regex = "";
|
490
|
|
|
|
|
|
|
_regex_append($m[$self->{start_state}][$final], 'Root', $regex);
|
491
|
|
|
|
|
|
|
push @expressions, $regex;
|
492
|
|
|
|
|
|
|
}
|
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
return @expressions;
|
495
|
|
|
|
|
|
|
}
|
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
1;
|
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
__END__
|