line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
8
|
|
|
8
|
|
5982
|
use 5.008008; |
|
8
|
|
|
|
|
19
|
|
|
8
|
|
|
|
|
269
|
|
2
|
8
|
|
|
8
|
|
78
|
use strict; |
|
8
|
|
|
|
|
10
|
|
|
8
|
|
|
|
|
194
|
|
3
|
8
|
|
|
8
|
|
23
|
use warnings; |
|
8
|
|
|
|
|
11
|
|
|
8
|
|
|
|
|
160
|
|
4
|
8
|
|
|
8
|
|
4134
|
use integer; |
|
8
|
|
|
|
|
64
|
|
|
8
|
|
|
|
|
29
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Regexp::ERE; |
7
|
|
|
|
|
|
|
our $VERSION = '0.04'; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
BEGIN { |
10
|
8
|
|
|
8
|
|
398
|
use Exporter (); |
|
8
|
|
|
|
|
11
|
|
|
8
|
|
|
|
|
468
|
|
11
|
8
|
|
|
8
|
|
14
|
our (@ISA, @EXPORT_OK); |
12
|
8
|
|
|
|
|
71
|
@ISA = qw(Exporter); |
13
|
8
|
|
|
|
|
529
|
@EXPORT_OK = qw( |
14
|
|
|
|
|
|
|
&ere_to_nfa |
15
|
|
|
|
|
|
|
&ere_to_tree |
16
|
|
|
|
|
|
|
&ere_to_regex |
17
|
|
|
|
|
|
|
&ere_to_input_constraints |
18
|
|
|
|
|
|
|
&nfa_to_tree |
19
|
|
|
|
|
|
|
&nfa_to_regex |
20
|
|
|
|
|
|
|
&nfa_to_input_constraints |
21
|
|
|
|
|
|
|
&nfa_clone |
22
|
|
|
|
|
|
|
&nfa_concat |
23
|
|
|
|
|
|
|
&nfa_union |
24
|
|
|
|
|
|
|
&nfa_inter |
25
|
|
|
|
|
|
|
&nfa_match |
26
|
|
|
|
|
|
|
&nfa_quant |
27
|
|
|
|
|
|
|
&nfa_isomorph |
28
|
|
|
|
|
|
|
&nfa_to_dfa |
29
|
|
|
|
|
|
|
&dfa_to_min_dfa |
30
|
|
|
|
|
|
|
&nfa_to_min_dfa |
31
|
|
|
|
|
|
|
&tree_to_regex |
32
|
|
|
|
|
|
|
&tree_to_input_constraints |
33
|
|
|
|
|
|
|
&char_to_cc |
34
|
|
|
|
|
|
|
&interval_list_to_cc |
35
|
|
|
|
|
|
|
&cc_union |
36
|
|
|
|
|
|
|
"e |
37
|
|
|
|
|
|
|
); |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=encoding utf8 |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 NAME |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Regexp::ERE - extended regular expressions and finite automata |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=head1 SYNOPSIS |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
use Regexp::ERE qw( |
49
|
|
|
|
|
|
|
&ere_to_nfa |
50
|
|
|
|
|
|
|
&nfa_inter |
51
|
|
|
|
|
|
|
&nfa_to_regex |
52
|
|
|
|
|
|
|
&nfa_to_input_constraints |
53
|
|
|
|
|
|
|
&nfa_to_dfa |
54
|
|
|
|
|
|
|
&dfa_to_min_dfa |
55
|
|
|
|
|
|
|
); |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# condition 1: begins with abc or def |
58
|
|
|
|
|
|
|
my $nfa1 = ere_to_nfa('^(abc|def)'); |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# condition 2: ends with 123 or 456 |
61
|
|
|
|
|
|
|
my $nfa2 = ere_to_nfa('(123|456)$'); |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# condition 1 and condition 2 |
64
|
|
|
|
|
|
|
my $inter_nfa = nfa_inter($nfa1, $nfa2); |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# compute extended regular expression (string) |
67
|
|
|
|
|
|
|
my $ere = nfa_to_regex($inter_nfa); |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# compute perl regular expression |
70
|
|
|
|
|
|
|
my $perlre = nfa_to_regex($inter_nfa, 1); |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# compute weaker input constraints suitable for widgets |
73
|
|
|
|
|
|
|
my ($input_constraints, $split_perlre) |
74
|
|
|
|
|
|
|
= nfa_to_input_constraints($inter_nfa); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# minimal dfa (simpler regular expression happens to result) |
77
|
|
|
|
|
|
|
my $nfa3 = ere_to_nfa('^(a|ab|b)*$'); |
78
|
|
|
|
|
|
|
my $dfa3 = nfa_to_dfa($nfa3); |
79
|
|
|
|
|
|
|
my $min_dfa3 = dfa_to_min_dfa($dfa3); |
80
|
|
|
|
|
|
|
my $ere3 = nfa_to_regex($min_dfa3); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=head1 DESCRIPTION |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Pure-perl module for: |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=over 4 |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=item * |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Parsing POSIX Extended Regular Expressions (C<$ere>) into |
91
|
|
|
|
|
|
|
Non-Deterministic Finite Automata (C<$nfa>) |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=item * |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Manipulating C<$nfa>s (concatenating, or-ing, and-ing) |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=item * |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Computing Deterministic Finite Automata (C<$dfa>s) from C<$nfa>s |
100
|
|
|
|
|
|
|
(powerset construction) |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=item * |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Computing minimal C<$dfa>s from C<$dfa>s (Hopcroft's algorithm) |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item * |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Computing C<$ere>s or Perl Regular Expressions from C<$nfa> or C<$dfa> |
109
|
|
|
|
|
|
|
(Warshall algorithm) |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=item * |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
Heuristically deriving (possibly weaker) constraints from a C<$nfa> or C<$dfa> |
114
|
|
|
|
|
|
|
suitable for display in a graphical user interface, |
115
|
|
|
|
|
|
|
i.e. a sequence of widgets of type 'free text' and 'drop down'; |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Example: C<'^(abc|def)'> => C<$nfa> => C<[['abc', 'def'], 'free text']> |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=back |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=head1 GLOSSARY AND CONVERSIONS OVERVIEW |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head2 Conversions overview |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
$ere -> $nfa -> $tree -> $regex ($ere or $perlre) |
126
|
|
|
|
|
|
|
-> $input_constraints |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
The second argument of -> $regex conversions is an optional boolean, |
129
|
|
|
|
|
|
|
true : conversion to a compiled perl regular expression |
130
|
|
|
|
|
|
|
false: conversion to an ere string |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
The -> $input_constraints conversions return a pair ( |
133
|
|
|
|
|
|
|
$input_constraints: aref as described at tree_to_input_constraints() |
134
|
|
|
|
|
|
|
$split_perlre : a compiled perl regular expression |
135
|
|
|
|
|
|
|
) |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head2 Glossary |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=over 4 |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=item $char_class |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
A set of unicode characters. |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=item $ere |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Extended regular expression (string). |
149
|
|
|
|
|
|
|
See C for the exact syntax. |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=item $perlre |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
Perl regular expression |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=item $nfa |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
Non-deterministic finite automaton |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=item $dfa |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Deterministic finite automaton (special case of C<$nfa>) |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=item $tree |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Intermediate hierarchical representation of a regular expression |
166
|
|
|
|
|
|
|
(which still can be manipulated before stringification), |
167
|
|
|
|
|
|
|
similar to a parse tree (but used for generating, not for parsing). |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=item $input_constraints |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
Ad-hoc data structure representing a list of gui-widgets |
172
|
|
|
|
|
|
|
(free text fields and drop-down lists), |
173
|
|
|
|
|
|
|
a helper for entering inputs |
174
|
|
|
|
|
|
|
conforming to a given C<$nfa>. |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=back |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=cut |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
############################################################################## |
182
|
|
|
|
|
|
|
# Config |
183
|
|
|
|
|
|
|
############################################################################## |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# If true, nfa_to_tree() always expands concatenated alternations. |
186
|
|
|
|
|
|
|
# Example: (ab|cd) (ef|gh) -> (abef|abgh|cdef|cdgh) |
187
|
|
|
|
|
|
|
our $TREE_CONCAT_FULL_EXPAND = 0; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# If true, prefixes and suffixes are factorized out even for |
190
|
|
|
|
|
|
|
# trees with a single alternation. |
191
|
|
|
|
|
|
|
# Example: (a1b|a2b) -> a(1|2)b |
192
|
|
|
|
|
|
|
our $FULL_FACTORIZE_FIXES = 0; |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# Should be 0. Else, traces nfa_to_tree() on STDERR. |
195
|
|
|
|
|
|
|
use constant { |
196
|
8
|
|
|
|
|
577
|
TRACE_NFA_TO_TREE => 0 |
197
|
8
|
|
|
8
|
|
35
|
}; |
|
8
|
|
|
|
|
13
|
|
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
use constant { |
200
|
8
|
|
|
|
|
1559
|
MAX_CHAR => 0x10FFFF |
201
|
|
|
|
|
|
|
, CHAR_CLASS => 'cc' # for blessing $char_classes (label only, no methods) |
202
|
8
|
|
|
8
|
|
32
|
}; |
|
8
|
|
|
|
|
8
|
|
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=head1 DATA STRUCTURES AND SUBROUTINES |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
Each of the documented subroutines can be imported, |
208
|
|
|
|
|
|
|
for instance C |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=cut |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
############################################################################## |
214
|
|
|
|
|
|
|
# $char_class |
215
|
|
|
|
|
|
|
############################################################################## |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head2 Character class |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
WARNING: C<$char_class>es must be created exclusively by |
221
|
|
|
|
|
|
|
C |
222
|
|
|
|
|
|
|
or C |
223
|
|
|
|
|
|
|
for equivalent character classes to be always the same array reference. |
224
|
|
|
|
|
|
|
For the same reason, C<$char_class>es must never be mutated. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
In this implementation, the state transitions of a C<$nfa> are based upon |
227
|
|
|
|
|
|
|
character classes (not single characters). A character class is an ordered |
228
|
|
|
|
|
|
|
list of disjoint, non-mergeable intervals (over unicode code points, |
229
|
|
|
|
|
|
|
i.e. positive integers). |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
$char_class = [ |
232
|
|
|
|
|
|
|
[ $low_0, $high_0 ] # $interval_0 |
233
|
|
|
|
|
|
|
, [ $low_1, $high_1 ] # $interval_1 |
234
|
|
|
|
|
|
|
, ... |
235
|
|
|
|
|
|
|
] |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
Constraints: |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
1: 0 <= $$char_class[$i][0] (0 <= low) |
241
|
|
|
|
|
|
|
2: $$char_class[$i][1] <= MAX_CHAR (high <= MAX_CHAR) |
242
|
|
|
|
|
|
|
3: $$char_class[$i][0] <= $$char_class[$i][1] (low <= high) |
243
|
|
|
|
|
|
|
4: $$char_class[$i][1] + 1 < $$char_class[$i+1][0] (non mergeable) |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
Exceptions (anchors used only in the parsing phase only): |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
begin : [ -2, -1 ] |
249
|
|
|
|
|
|
|
end : [ -3, -2 ] |
250
|
|
|
|
|
|
|
begin or end : [ -3, -1 ] |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Immediately after parsing, such pseudo-character classes |
253
|
|
|
|
|
|
|
are removed by C (internal subroutine). |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=over 4 |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=cut |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
our $ERE_literal = qr/ [^.[\\()*+?{|^\$] /xms; |
260
|
|
|
|
|
|
|
our $PERLRE_char_class_special = qr/ [\[\]\\\^\-] /xms; |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
our $cc_any = bless([[ 0, MAX_CHAR ]], CHAR_CLASS); |
263
|
|
|
|
|
|
|
our $cc_none = bless([], CHAR_CLASS); |
264
|
|
|
|
|
|
|
our $cc_beg = bless([[ -2, -1]], CHAR_CLASS); |
265
|
|
|
|
|
|
|
our $cc_end = bless([[ -3, -2]], CHAR_CLASS); |
266
|
|
|
|
|
|
|
{ |
267
|
|
|
|
|
|
|
|
268
|
8
|
|
|
8
|
|
38
|
no warnings qw(utf8); # in particular for 0x10FFFF |
|
8
|
|
|
|
|
9
|
|
|
8
|
|
|
|
|
15499
|
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
my %cc_cache; |
271
|
|
|
|
|
|
|
# keys: join(',',1,map{@$_}@{$char_class}) |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
for ($cc_any, $cc_none, $cc_beg, $cc_end) { |
274
|
|
|
|
|
|
|
$cc_cache{ join(',', 1, map {@$_} @$_) } = $_; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=item char_to_cc($c) |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
Returns the unique C<$char_class> equivalent to C<[[ord($c), ord($c)]]>. |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=cut |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
sub char_to_cc { |
284
|
857
|
|
100
|
857
|
1
|
192470
|
return $cc_cache{ join(',', 1, (ord($_[0])) x 2) } |
285
|
|
|
|
|
|
|
||= bless([[ord($_[0]), ord($_[0])]], CHAR_CLASS); |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# $interval_list is the same data structure as $char_class. |
289
|
|
|
|
|
|
|
# Constraints 1, 2 are assumed. |
290
|
|
|
|
|
|
|
# Constraints 3, 4 are enforced. |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=item interval_list_to_cc($interval_list) |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
C<$interval_list> is an arbitrary list of intervals. |
295
|
|
|
|
|
|
|
Returns the unique C<$char_class> whose reunion of intervals |
296
|
|
|
|
|
|
|
is the same set as the reunion of the intervals of C<$interval_list>. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
Example: |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
interval_list_to_cc([[102, 112], [65, 90], [97, 102], [113, 122]]) |
301
|
|
|
|
|
|
|
returns [[65, 90], [97, 122]] |
302
|
|
|
|
|
|
|
(i.e [f-p]|[A-Z]|[a-f]|[q-z] => [A-Z]|[a-z]) |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
Note that both C<$interval_list> and C<$char_class> are lists of intervals, |
305
|
|
|
|
|
|
|
but only C<$char_class> obeys the constraints above, |
306
|
|
|
|
|
|
|
while C<$interval_list> does not. |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
Remark also that C is the identity |
309
|
|
|
|
|
|
|
(returns the same reference as given) on C<$char_class>es returned |
310
|
|
|
|
|
|
|
by either C or C. |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=cut |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
sub interval_list_to_cc { |
315
|
6031
|
|
|
6031
|
1
|
4883
|
my ($interval_list) = @_; |
316
|
|
|
|
|
|
|
my @sorted |
317
|
7353
|
|
|
|
|
7380
|
= sort { $$a[0] <=> $$b[0] } |
|
10004
|
|
|
|
|
15651
|
|
318
|
6031
|
|
|
|
|
5656
|
grep { $$_[0] <= $$_[1] } |
319
|
|
|
|
|
|
|
@$interval_list |
320
|
|
|
|
|
|
|
; |
321
|
6031
|
|
|
|
|
7313
|
my $char_class = bless([], CHAR_CLASS); |
322
|
6031
|
|
|
|
|
4879
|
my $i = 0; |
323
|
6031
|
|
|
|
|
9038
|
while ($i != @sorted) { |
324
|
6939
|
|
|
|
|
5528
|
my $interval = $sorted[$i]; |
325
|
6939
|
|
|
|
|
4794
|
$i++; |
326
|
6939
|
|
100
|
|
|
14390
|
while ($i != @sorted && $$interval[1] + 1 >= $sorted[$i][0]) { |
327
|
3065
|
100
|
|
|
|
4458
|
if ($$interval[1] < $sorted[$i][1]) { |
328
|
3064
|
|
|
|
|
3118
|
$$interval[1] = $sorted[$i][1]; |
329
|
|
|
|
|
|
|
} |
330
|
3065
|
|
|
|
|
7705
|
$i++; |
331
|
|
|
|
|
|
|
} |
332
|
6939
|
|
|
|
|
12168
|
push(@$char_class, $interval); |
333
|
|
|
|
|
|
|
} |
334
|
6031
|
|
66
|
|
|
6165
|
return $cc_cache{ join(',', 1, map {@$_} @$char_class) } |
|
6939
|
|
|
|
|
23469
|
|
335
|
|
|
|
|
|
|
||= $char_class; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
sub cc_neg { |
339
|
1260
|
|
|
1260
|
0
|
1125
|
my ($char_class) = @_; |
340
|
|
|
|
|
|
|
|
341
|
1260
|
100
|
|
|
|
2295
|
if (!@$char_class) { return $cc_any; } |
|
85
|
|
|
|
|
328
|
|
342
|
|
|
|
|
|
|
|
343
|
1175
|
|
|
|
|
1494
|
my $neg = bless([], CHAR_CLASS); |
344
|
1175
|
100
|
|
|
|
2205
|
if ($$char_class[0][0] != 0) { |
345
|
1107
|
|
|
|
|
1967
|
push(@$neg, [0, $$char_class[0][0] - 1]); |
346
|
|
|
|
|
|
|
} |
347
|
1175
|
|
|
|
|
973
|
my $i = 0; |
348
|
1175
|
|
|
|
|
2151
|
while ($i != $#$char_class) { |
349
|
194
|
|
|
|
|
434
|
push(@$neg, [$$char_class[$i][1] + 1, $$char_class[$i+1][0] - 1]); |
350
|
194
|
|
|
|
|
324
|
$i++; |
351
|
|
|
|
|
|
|
} |
352
|
1175
|
100
|
|
|
|
1958
|
if ($$char_class[$i][1] != MAX_CHAR) { |
353
|
1107
|
|
|
|
|
1764
|
push(@$neg, [$$char_class[$i][1] + 1, MAX_CHAR]); |
354
|
|
|
|
|
|
|
} |
355
|
1175
|
|
66
|
|
|
1402
|
return $cc_cache{ join(',', 1, map{@$_} @$neg) } ||= $neg; |
|
2408
|
|
|
|
|
11433
|
|
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
sub cc_inter2 { |
359
|
440
|
|
|
440
|
0
|
385
|
my ($char_class_0, $char_class_1) = @_; |
360
|
|
|
|
|
|
|
|
361
|
440
|
|
|
|
|
617
|
my $inter = bless([], CHAR_CLASS); |
362
|
440
|
|
|
|
|
326
|
my $i_0 = 0; |
363
|
440
|
|
|
|
|
343
|
my $i_1 = 0; |
364
|
440
|
|
100
|
|
|
1342
|
while ($i_0 < @$char_class_0 && $i_1 < @$char_class_1) { |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
# skip interval_0 if interval_0 < interval_1 |
367
|
543
|
|
66
|
|
|
2526
|
while ( |
|
|
|
100
|
|
|
|
|
368
|
|
|
|
|
|
|
$i_0 < @$char_class_0 |
369
|
|
|
|
|
|
|
&& $i_1 < @$char_class_1 |
370
|
|
|
|
|
|
|
&& $$char_class_0[$i_0][1] < $$char_class_1[$i_1][0] |
371
|
|
|
|
|
|
|
) { |
372
|
367
|
|
|
|
|
1395
|
$i_0++; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# skip interval_1 if interval_1 < interval_0 |
376
|
543
|
|
100
|
|
|
2240
|
while ( |
|
|
|
100
|
|
|
|
|
377
|
|
|
|
|
|
|
$i_0 < @$char_class_0 |
378
|
|
|
|
|
|
|
&& $i_1 < @$char_class_1 |
379
|
|
|
|
|
|
|
&& $$char_class_1[$i_1][1] < $$char_class_0[$i_0][0] |
380
|
|
|
|
|
|
|
) { |
381
|
192
|
|
|
|
|
637
|
$i_1++; |
382
|
|
|
|
|
|
|
} |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
# Check that the exit condition of the first while still holds. |
385
|
543
|
100
|
100
|
|
|
2376
|
if ( |
|
|
|
100
|
|
|
|
|
386
|
|
|
|
|
|
|
$i_0 < @$char_class_0 |
387
|
|
|
|
|
|
|
&& $i_1 < @$char_class_1 |
388
|
|
|
|
|
|
|
&& $$char_class_1[$i_1][0] <= $$char_class_0[$i_0][1] |
389
|
|
|
|
|
|
|
) { |
390
|
|
|
|
|
|
|
# The exit conditions of both whiles hold: |
391
|
|
|
|
|
|
|
# |
392
|
|
|
|
|
|
|
# $$char_class_0[$i_0][1] >= $$char_class_1[$i_1][0] |
393
|
|
|
|
|
|
|
# && $$char_class_1[$i_1][1] >= $$char_class_0[$i_0][0] |
394
|
|
|
|
|
|
|
# |
395
|
|
|
|
|
|
|
# short: |
396
|
|
|
|
|
|
|
# high_0 >= low_1 |
397
|
|
|
|
|
|
|
# high_1 >= low_0 |
398
|
|
|
|
|
|
|
# |
399
|
|
|
|
|
|
|
# furthermore: |
400
|
|
|
|
|
|
|
# high_0 >= low_0 |
401
|
|
|
|
|
|
|
# high_1 >= low_1 |
402
|
|
|
|
|
|
|
# |
403
|
|
|
|
|
|
|
# with: |
404
|
|
|
|
|
|
|
# min_high := min(high_0, high_1) |
405
|
|
|
|
|
|
|
# max_low := max(low_0, low_1) |
406
|
|
|
|
|
|
|
# |
407
|
|
|
|
|
|
|
# holds: |
408
|
|
|
|
|
|
|
# min_high >= max_low_0 |
409
|
|
|
|
|
|
|
|
410
|
291
|
|
|
|
|
209
|
my ($interval_0_done, $interval_1_done); |
411
|
|
|
|
|
|
|
|
412
|
291
|
100
|
|
|
|
518
|
my $max_low = |
413
|
|
|
|
|
|
|
$$char_class_0[$i_0][0] > $$char_class_1[$i_1][0] |
414
|
|
|
|
|
|
|
? $$char_class_0[$i_0][0] |
415
|
|
|
|
|
|
|
: $$char_class_1[$i_1][0] |
416
|
|
|
|
|
|
|
; |
417
|
|
|
|
|
|
|
|
418
|
291
|
|
|
|
|
184
|
my $min_high; |
419
|
291
|
100
|
|
|
|
477
|
if ($$char_class_0[$i_0][1] <= $$char_class_1[$i_1][1]) { |
420
|
186
|
|
|
|
|
152
|
$min_high = $$char_class_0[$i_0][1]; |
421
|
|
|
|
|
|
|
# interval_0 < next interval_1 |
422
|
186
|
|
|
|
|
159
|
$interval_0_done = 1; |
423
|
|
|
|
|
|
|
} |
424
|
291
|
100
|
|
|
|
499
|
if ($$char_class_1[$i_1][1] <= $$char_class_0[$i_0][1]) { |
425
|
208
|
|
|
|
|
193
|
$min_high = $$char_class_1[$i_1][1]; |
426
|
|
|
|
|
|
|
# interval_1 < next interval_0 |
427
|
208
|
|
|
|
|
149
|
$interval_1_done = 1; |
428
|
|
|
|
|
|
|
} |
429
|
291
|
100
|
|
|
|
391
|
if ($interval_0_done) { $i_0++; } |
|
186
|
|
|
|
|
133
|
|
430
|
291
|
100
|
|
|
|
352
|
if ($interval_1_done) { $i_1++; } |
|
208
|
|
|
|
|
145
|
|
431
|
|
|
|
|
|
|
|
432
|
291
|
|
|
|
|
1057
|
push(@$inter, [$max_low, $min_high]); |
433
|
|
|
|
|
|
|
} |
434
|
|
|
|
|
|
|
} |
435
|
440
|
|
33
|
|
|
1256
|
return $cc_cache{ join(',', 1, map{@$_} @$inter) } ||=$inter; |
|
291
|
|
|
|
|
1379
|
|
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
sub cc_match { |
440
|
117555
|
|
|
117555
|
0
|
91132
|
my ($char_class, $c) = @_; |
441
|
117555
|
|
|
|
|
99276
|
for my $interval (@$char_class) { |
442
|
138519
|
100
|
|
|
|
228247
|
if ($c < $$interval[0]) { |
|
|
100
|
|
|
|
|
|
443
|
46706
|
|
|
|
|
84021
|
return 0; |
444
|
|
|
|
|
|
|
} |
445
|
|
|
|
|
|
|
elsif ($c <= $$interval[1]) { |
446
|
27591
|
|
|
|
|
52211
|
return 1; |
447
|
|
|
|
|
|
|
} |
448
|
|
|
|
|
|
|
} |
449
|
43258
|
|
|
|
|
75823
|
return 0; |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=item cc_union(@char_classes) |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Returns the unique C<$char_class> containing all characters of all given |
455
|
|
|
|
|
|
|
C<@char_classes>. |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=cut |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
sub cc_union { |
460
|
2098
|
|
|
2098
|
1
|
3225
|
return interval_list_to_cc( [ map { map { [@$_] } @$_ } @_ ] ); |
|
4964
|
|
|
|
|
4103
|
|
|
5492
|
|
|
|
|
9557
|
|
461
|
|
|
|
|
|
|
} |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
sub cc_is_subset { |
464
|
56
|
|
|
56
|
0
|
64
|
my ($char_class_0, $char_class_1) = @_; |
465
|
56
|
|
|
|
|
66
|
for my $c ( map { @$_ } @$char_class_0 ) { |
|
59
|
|
|
|
|
102
|
|
466
|
89
|
100
|
|
|
|
113
|
if (!cc_match($char_class_1, $c)) { return 0; } |
|
32
|
|
|
|
|
122
|
|
467
|
|
|
|
|
|
|
} |
468
|
24
|
|
|
|
|
66
|
return 1; |
469
|
|
|
|
|
|
|
} |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
# $to_perlre (boolean) |
472
|
|
|
|
|
|
|
# true : perl syntax |
473
|
|
|
|
|
|
|
# false: ere syntax |
474
|
|
|
|
|
|
|
sub cc_to_regex { |
475
|
564
|
|
|
564
|
0
|
615
|
my ($char_class, $to_perlre) = (@_, 0); |
476
|
|
|
|
|
|
|
|
477
|
564
|
|
|
|
|
424
|
my @items; |
478
|
564
|
50
|
66
|
|
|
1872
|
if (@$char_class && $$char_class[0][0] < 0) { |
479
|
0
|
0
|
|
|
|
0
|
if ($$char_class[0][0] == -2) { |
480
|
0
|
0
|
|
|
|
0
|
if ($$char_class[0][1] == -1) { |
481
|
0
|
|
|
|
|
0
|
push(@items, '^'); |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
else { |
484
|
0
|
|
|
|
|
0
|
push(@items, '^$'); |
485
|
|
|
|
|
|
|
} |
486
|
|
|
|
|
|
|
} |
487
|
|
|
|
|
|
|
else { |
488
|
0
|
0
|
|
|
|
0
|
if ($$char_class[0][1] == -2) { |
489
|
0
|
|
|
|
|
0
|
push(@items, '$'); |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
else { |
492
|
0
|
|
|
|
|
0
|
push(@items, '^', '$'); |
493
|
|
|
|
|
|
|
} |
494
|
|
|
|
|
|
|
} |
495
|
0
|
|
|
|
|
0
|
$char_class = [@$char_class[1..$#$char_class]]; |
496
|
|
|
|
|
|
|
} |
497
|
564
|
100
|
|
|
|
759
|
if (@$char_class) { |
498
|
524
|
100
|
100
|
|
|
1625
|
if ( |
|
|
100
|
100
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
499
|
|
|
|
|
|
|
@$char_class == 1 |
500
|
|
|
|
|
|
|
&& $$char_class[0][0] == $$char_class[0][1] |
501
|
|
|
|
|
|
|
) { |
502
|
491
|
|
|
|
|
537
|
my $c = chr($$char_class[0][0]); |
503
|
491
|
100
|
|
|
|
498
|
if ($to_perlre) { |
504
|
115
|
|
|
|
|
154
|
push(@items, quotemeta($c)) |
505
|
|
|
|
|
|
|
} |
506
|
|
|
|
|
|
|
else { |
507
|
376
|
100
|
|
|
|
1117
|
push(@items, |
508
|
|
|
|
|
|
|
$c =~ /$ERE_literal/o |
509
|
|
|
|
|
|
|
? $c |
510
|
|
|
|
|
|
|
: "\\$c" |
511
|
|
|
|
|
|
|
); |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
elsif ( |
515
|
|
|
|
|
|
|
@$char_class == 1 |
516
|
|
|
|
|
|
|
&& $$char_class[0][0] == 0 |
517
|
|
|
|
|
|
|
&& $$char_class[0][1] == MAX_CHAR |
518
|
|
|
|
|
|
|
) { |
519
|
4
|
|
|
|
|
8
|
push(@items, '.'); |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
elsif ($$char_class[$#$char_class][1] == MAX_CHAR) { |
522
|
8
|
100
|
|
|
|
15
|
if ($to_perlre) { |
523
|
2
|
|
|
|
|
4
|
push(@items, |
524
|
|
|
|
|
|
|
'[^' . _cc_to_perlre(cc_neg($char_class)) . ']' |
525
|
|
|
|
|
|
|
); |
526
|
|
|
|
|
|
|
} |
527
|
|
|
|
|
|
|
else { |
528
|
6
|
|
|
|
|
17
|
push(@items, |
529
|
|
|
|
|
|
|
'[^' . _cc_to_ere(cc_neg($char_class)) . ']' |
530
|
|
|
|
|
|
|
); |
531
|
|
|
|
|
|
|
} |
532
|
|
|
|
|
|
|
} |
533
|
|
|
|
|
|
|
else { |
534
|
21
|
100
|
|
|
|
35
|
if ($to_perlre) { |
535
|
18
|
|
|
|
|
35
|
push(@items, '[' . _cc_to_perlre($char_class) . ']'); |
536
|
|
|
|
|
|
|
} |
537
|
|
|
|
|
|
|
else { |
538
|
3
|
|
|
|
|
11
|
push(@items, '[' . _cc_to_ere($char_class) . ']'); |
539
|
|
|
|
|
|
|
} |
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
|
543
|
564
|
|
|
|
|
411
|
my $regex; |
544
|
564
|
100
|
|
|
|
951
|
if (@items == 0) { |
|
|
50
|
|
|
|
|
|
545
|
40
|
|
|
|
|
139
|
return ''; |
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
elsif (@items == 1) { |
548
|
524
|
|
|
|
|
1935
|
return $items[0]; |
549
|
|
|
|
|
|
|
} |
550
|
|
|
|
|
|
|
else { |
551
|
0
|
0
|
|
|
|
0
|
if ($to_perlre) { |
552
|
0
|
|
|
|
|
0
|
return '(?:' . join('|', @items) . ')'; |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
else { |
555
|
0
|
|
|
|
|
0
|
return '(' . join('|', @items) . ')'; |
556
|
|
|
|
|
|
|
} |
557
|
|
|
|
|
|
|
} |
558
|
|
|
|
|
|
|
} |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
sub _cc_to_ere { |
561
|
9
|
|
|
9
|
|
11
|
my ($char_class) = @_; |
562
|
9
|
|
|
|
|
11
|
my $has_minus; |
563
|
|
|
|
|
|
|
my $has_r_bracket; |
564
|
|
|
|
|
|
|
my $ere = join('', |
565
|
|
|
|
|
|
|
map { |
566
|
9
|
100
|
|
|
|
14
|
if ($$_[0] == $$_[1]) { |
|
11
|
|
|
|
|
30
|
|
567
|
6
|
50
|
|
|
|
37
|
if ($$_[0] == ord('-')) { |
|
|
50
|
|
|
|
|
|
568
|
0
|
|
|
|
|
0
|
$has_minus = 1; |
569
|
0
|
|
|
|
|
0
|
''; |
570
|
|
|
|
|
|
|
} |
571
|
|
|
|
|
|
|
elsif ($$_[0] == ord(']')) { |
572
|
0
|
|
|
|
|
0
|
$has_r_bracket = 1; |
573
|
0
|
|
|
|
|
0
|
''; |
574
|
|
|
|
|
|
|
} |
575
|
|
|
|
|
|
|
else { |
576
|
6
|
|
|
|
|
17
|
chr($$_[0]); |
577
|
|
|
|
|
|
|
} |
578
|
|
|
|
|
|
|
} |
579
|
|
|
|
|
|
|
else { |
580
|
5
|
50
|
33
|
|
|
28
|
if ( |
581
|
|
|
|
|
|
|
$$_[0] == ord('-') |
582
|
|
|
|
|
|
|
|| $$_[0] == ord(']') |
583
|
|
|
|
|
|
|
) { |
584
|
0
|
0
|
|
|
|
0
|
if ($$_[0] == ord('-')) { |
585
|
0
|
|
|
|
|
0
|
$has_minus = 1; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
else { |
588
|
0
|
|
|
|
|
0
|
$has_r_bracket = 1; |
589
|
|
|
|
|
|
|
} |
590
|
0
|
0
|
|
|
|
0
|
if ($$_[1] == $$_[0] + 1) { |
|
|
0
|
|
|
|
|
|
591
|
0
|
|
|
|
|
0
|
chr($$_[1]); |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
elsif ($$_[1] == $$_[0] + 2) { |
594
|
0
|
|
|
|
|
0
|
chr($$_[0] + 1) . chr($$_[1]); |
595
|
|
|
|
|
|
|
} |
596
|
|
|
|
|
|
|
else { |
597
|
0
|
|
|
|
|
0
|
chr($$_[0] + 1) . '-' . chr($$_[1]); |
598
|
|
|
|
|
|
|
} |
599
|
|
|
|
|
|
|
} |
600
|
|
|
|
|
|
|
else { |
601
|
5
|
100
|
|
|
|
17
|
if ($$_[1] == $$_[0] + 1) { |
602
|
4
|
|
|
|
|
19
|
chr($$_[0]) . chr($$_[1]); |
603
|
|
|
|
|
|
|
} |
604
|
|
|
|
|
|
|
else { |
605
|
1
|
|
|
|
|
4
|
chr($$_[0]) . '-' . chr($$_[1]); |
606
|
|
|
|
|
|
|
} |
607
|
|
|
|
|
|
|
} |
608
|
|
|
|
|
|
|
} |
609
|
|
|
|
|
|
|
} |
610
|
|
|
|
|
|
|
@$char_class |
611
|
|
|
|
|
|
|
); |
612
|
9
|
50
|
|
|
|
24
|
if ($has_minus) { $ere .= '-'; } |
|
0
|
|
|
|
|
0
|
|
613
|
9
|
50
|
|
|
|
18
|
if ($has_r_bracket) { $ere = "]$ere"; } |
|
0
|
|
|
|
|
0
|
|
614
|
9
|
|
|
|
|
23
|
return $ere; |
615
|
|
|
|
|
|
|
} |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
sub _cc_to_perlre { |
618
|
20
|
|
|
20
|
|
23
|
my ($char_class) = @_; |
619
|
|
|
|
|
|
|
return join('', |
620
|
|
|
|
|
|
|
map { |
621
|
20
|
100
|
|
|
|
30
|
if ($$_[0] == $$_[1]) { |
|
49
|
|
|
|
|
104
|
|
622
|
44
|
|
|
|
|
55
|
my $c = chr($$_[0]); |
623
|
44
|
50
|
|
|
|
141
|
$c =~ /$PERLRE_char_class_special/o ? "\\$c" : $c; |
624
|
|
|
|
|
|
|
} |
625
|
|
|
|
|
|
|
else { |
626
|
5
|
|
|
|
|
14
|
my ($c1, $c2) = (chr($$_[0]), chr($$_[1])); |
627
|
5
|
50
|
|
|
|
47
|
($c1 =~ /$PERLRE_char_class_special/o ? "\\$c1" : $c1) |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
628
|
|
|
|
|
|
|
. ($$_[0] + 1 < $$_[1] ? '-' : '') |
629
|
|
|
|
|
|
|
. ($c2 =~ /$PERLRE_char_class_special/o ? "\\$c2" : $c2) |
630
|
|
|
|
|
|
|
} |
631
|
|
|
|
|
|
|
} @$char_class |
632
|
|
|
|
|
|
|
); |
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
############################################################################## |
637
|
|
|
|
|
|
|
# $nfa |
638
|
|
|
|
|
|
|
############################################################################## |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
=back |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
=head2 Nfa |
643
|
|
|
|
|
|
|
|
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
WARNING: C routines are destructive, |
646
|
|
|
|
|
|
|
the C<$nfa> references given as arguments will not be valid C<$nfa> any more. |
647
|
|
|
|
|
|
|
Furthermore, the same C<$nfa> reference must be used only once as argument. |
648
|
|
|
|
|
|
|
For instance, for concatenating a C<$nfa> with itself, C |
649
|
|
|
|
|
|
|
does not work; instead, C must be used; |
650
|
|
|
|
|
|
|
or even C if the original |
651
|
|
|
|
|
|
|
C<$nfa> is to be used further. |
652
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
$nfa = [ $state_0, $state_1, ... ] |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
$state = [ |
656
|
|
|
|
|
|
|
$accepting |
657
|
|
|
|
|
|
|
, $transitions |
658
|
|
|
|
|
|
|
] |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
$transitions = [ |
661
|
|
|
|
|
|
|
[ $char_class_0 => $state_ind_0 ] |
662
|
|
|
|
|
|
|
, [ $char_class_1 => $state_ind_1 ] |
663
|
|
|
|
|
|
|
, ... |
664
|
|
|
|
|
|
|
] |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
In the same C<$transition>, C<$state_ind_i> are pairwise different and are |
667
|
|
|
|
|
|
|
valid indexes of C<@$nfa>. There is exactly one initial state at index 0. |
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=over 4 |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
=item C |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
Maps each of the given C<@nfas> to a clone. |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=cut |
676
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
sub nfa_clone { |
678
|
|
|
|
|
|
|
return |
679
|
298
|
|
|
298
|
1
|
335
|
map { [ |
|
600
|
|
|
|
|
1170
|
|
680
|
344
|
|
|
|
|
374
|
map { [ |
681
|
|
|
|
|
|
|
$$_[0] # accepting |
682
|
826
|
|
|
|
|
685
|
, [ map { [ @$_ ] } @{$$_[1]} ] # transitions |
|
826
|
|
|
|
|
1411
|
|
683
|
|
|
|
|
|
|
] } |
684
|
|
|
|
|
|
|
@$_ # states of the $nfa |
685
|
|
|
|
|
|
|
] } @_ # list of $nfas |
686
|
|
|
|
|
|
|
; |
687
|
|
|
|
|
|
|
} |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
sub _transitions_is_subset { |
690
|
461
|
|
|
461
|
|
498
|
my ($transitions_0, $transitions_1, $state_ind_map) = @_; |
691
|
521
|
100
|
66
|
|
|
2630
|
my %state_ind_to_t_1 |
692
|
461
|
|
|
|
|
481
|
= map {( |
693
|
|
|
|
|
|
|
$state_ind_map && exists($$state_ind_map{$$_[1]}) |
694
|
|
|
|
|
|
|
? $$state_ind_map{$$_[1]} |
695
|
|
|
|
|
|
|
: $$_[1] |
696
|
|
|
|
|
|
|
=> $_ |
697
|
|
|
|
|
|
|
)} |
698
|
|
|
|
|
|
|
@$transitions_1 |
699
|
|
|
|
|
|
|
; |
700
|
461
|
|
|
|
|
728
|
for my $t_0 (@$transitions_0) { |
701
|
212
|
100
|
66
|
|
|
829
|
my $state_ind_0 |
702
|
|
|
|
|
|
|
= $state_ind_map && exists($$state_ind_map{$$t_0[1]}) |
703
|
|
|
|
|
|
|
? $$state_ind_map{$$t_0[1]} |
704
|
|
|
|
|
|
|
: $$t_0[1] |
705
|
|
|
|
|
|
|
; |
706
|
212
|
100
|
|
|
|
372
|
if (!exists($state_ind_to_t_1{$state_ind_0})) { return 0; } |
|
185
|
|
|
|
|
931
|
|
707
|
27
|
|
|
|
|
31
|
my $t_1 = $state_ind_to_t_1{$state_ind_0}; |
708
|
27
|
100
|
|
|
|
59
|
if (!cc_is_subset($$t_0[0], $$t_1[0])) { return 0; } |
|
17
|
|
|
|
|
72
|
|
709
|
|
|
|
|
|
|
} |
710
|
259
|
|
|
|
|
553
|
return 1; |
711
|
|
|
|
|
|
|
} |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
# The keys of %$state_ind_to_equiv are state_inds of @$nfa to be removed. |
714
|
|
|
|
|
|
|
# State indexes in transitions are remapped following %$state_ind_to_equiv. |
715
|
|
|
|
|
|
|
# A state index mapped to itself denotes an unreachable state index. |
716
|
|
|
|
|
|
|
sub _nfa_shrink_equiv { |
717
|
835
|
|
|
835
|
|
966
|
my ($nfa, $state_ind_to_equiv) = @_; |
718
|
835
|
|
|
|
|
795
|
my $i = 0; |
719
|
3927
|
|
|
|
|
6625
|
my %compact_map |
720
|
5190
|
|
|
|
|
7293
|
= map { ($_ => $i++) } |
721
|
|
|
|
|
|
|
my @active_state_inds |
722
|
835
|
|
|
|
|
1295
|
= grep { !exists($$state_ind_to_equiv{$_}) } |
723
|
|
|
|
|
|
|
(0..$#$nfa) |
724
|
|
|
|
|
|
|
; |
725
|
|
|
|
|
|
|
|
726
|
835
|
|
|
|
|
1342
|
my %equiv_index_to_char_classes; |
727
|
|
|
|
|
|
|
my %plain_index_to_char_class; |
728
|
835
|
|
|
|
|
2682
|
for (@$nfa = @$nfa[@active_state_inds]) { |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
# update $state_ind |
731
|
|
|
|
|
|
|
# -> $compact_map{$state_ind} |
732
|
|
|
|
|
|
|
# or $compact_map{$$state_ind_to_equiv{$state_ind}} |
733
|
3927
|
|
|
|
|
3529
|
%equiv_index_to_char_classes = (); |
734
|
3927
|
|
|
|
|
3362
|
%plain_index_to_char_class = (); |
735
|
3927
|
|
|
|
|
2803
|
for (@{$$_[1]}) { # transition list |
|
3927
|
|
|
|
|
4768
|
|
736
|
7095
|
100
|
|
|
|
8948
|
if (exists($$state_ind_to_equiv{$$_[1]})) { |
737
|
|
|
|
|
|
|
push( |
738
|
1018
|
|
|
|
|
759
|
@{$equiv_index_to_char_classes{ |
739
|
1018
|
|
|
|
|
2586
|
$$_[1] |
740
|
|
|
|
|
|
|
= $compact_map{$$state_ind_to_equiv{$$_[1]}} |
741
|
|
|
|
|
|
|
}} |
742
|
|
|
|
|
|
|
, $$_[0] |
743
|
|
|
|
|
|
|
); |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
else { |
746
|
|
|
|
|
|
|
$plain_index_to_char_class{ |
747
|
6077
|
|
|
|
|
9605
|
$$_[1] |
748
|
|
|
|
|
|
|
= $compact_map{$$_[1]} |
749
|
|
|
|
|
|
|
} = $$_[0]; |
750
|
|
|
|
|
|
|
} |
751
|
|
|
|
|
|
|
} |
752
|
|
|
|
|
|
|
# merge char_classes to the same state index |
753
|
3927
|
100
|
|
|
|
6774
|
if (keys(%equiv_index_to_char_classes)) { |
754
|
804
|
|
|
|
|
1958
|
@{$$_[1]} = (( |
|
115
|
|
|
|
|
194
|
|
755
|
811
|
|
|
|
|
2042
|
map {[ |
756
|
|
|
|
|
|
|
exists($equiv_index_to_char_classes{$_}) |
757
|
|
|
|
|
|
|
? cc_union( |
758
|
|
|
|
|
|
|
$plain_index_to_char_class{$_} |
759
|
641
|
100
|
|
|
|
1353
|
, @{$equiv_index_to_char_classes{$_}} |
760
|
|
|
|
|
|
|
) |
761
|
|
|
|
|
|
|
: $plain_index_to_char_class{$_} |
762
|
|
|
|
|
|
|
, $_ |
763
|
|
|
|
|
|
|
]} |
764
|
|
|
|
|
|
|
keys(%plain_index_to_char_class) |
765
|
|
|
|
|
|
|
) , ( |
766
|
926
|
|
|
|
|
1387
|
map {[ |
767
|
16
|
|
|
|
|
32
|
@{$equiv_index_to_char_classes{$_}} == 1 |
768
|
|
|
|
|
|
|
? $equiv_index_to_char_classes{$_}[0] |
769
|
811
|
100
|
|
|
|
603
|
: cc_union(@{$equiv_index_to_char_classes{$_}}) |
770
|
|
|
|
|
|
|
, $_ |
771
|
|
|
|
|
|
|
]} |
772
|
804
|
|
|
|
|
1218
|
grep { !exists($plain_index_to_char_class{$_}) } |
773
|
|
|
|
|
|
|
keys(%equiv_index_to_char_classes) |
774
|
|
|
|
|
|
|
)) |
775
|
|
|
|
|
|
|
} |
776
|
|
|
|
|
|
|
} |
777
|
835
|
|
|
|
|
5399
|
return $nfa; |
778
|
|
|
|
|
|
|
} |
779
|
|
|
|
|
|
|
|
780
|
|
|
|
|
|
|
=item C |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
Precondition: C<0 <= $min && ( $max eq '' || $min <= $max)> |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
Returns C<$out_nfa>, a C<$nfa> computed from C<$in_nfa>. |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
Let L be the language accepted by C<$in_nfa> and M the language accepted |
787
|
|
|
|
|
|
|
by C<$out_nfa>. Then a word m belongs to M if and only if and ordered list |
788
|
|
|
|
|
|
|
(l_1, ..., l_r) of words belonging to L exists such that: |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
$min <= r |
791
|
|
|
|
|
|
|
and ($max eq '' or r <= $max) |
792
|
|
|
|
|
|
|
and m is the concatenation of (l_1, ..., l_r) |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Examples with C<$in_nfa> being a C<$nfa> accepting C<'^a$'>: |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
nfa_quant($in_nfa, 2, 4 ) accepts '^a{2,4}$' |
797
|
|
|
|
|
|
|
nfa_quant($in_nfa, 0, '') accepts '^a{0,}$' (i.e. '^a*$') |
798
|
|
|
|
|
|
|
|
799
|
|
|
|
|
|
|
C<$pref_has_prefix> and C<$next_has_prefix> are hints for dispatching C<$min>, |
800
|
|
|
|
|
|
|
for example: |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
'a+' => 'a*a' (!$prev_has_suffix && $next_has_prefix) |
803
|
|
|
|
|
|
|
'a+' => 'aa*' ( $prev_has_suffix && !$next_has_prefix) |
804
|
|
|
|
|
|
|
'a{2,}' => 'aa*a' ( $prev_has_suffix && $next_has_prefix) |
805
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
=cut |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
sub nfa_quant { |
809
|
271
|
|
|
271
|
1
|
349
|
my ($nfa, $min, $max, $prev_has_suffix, $next_has_prefix) = @_; |
810
|
271
|
|
|
|
|
227
|
my @quant_parts; |
811
|
|
|
|
|
|
|
my $optional_part; |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
# dispatch min left and right: a+b => a*ab, ba+ => baa* |
814
|
8
|
|
|
8
|
|
91
|
use integer; |
|
8
|
|
|
|
|
13
|
|
|
8
|
|
|
|
|
28
|
|
815
|
271
|
100
|
66
|
|
|
516
|
my ($min_left, $min_right) |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
|
|
|
|
|
816
|
|
|
|
|
|
|
= |
817
|
|
|
|
|
|
|
# no suffix, no prefix |
818
|
|
|
|
|
|
|
$min == 0 ? (0 , 0 ) |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
# no suffix, maybe prefix |
821
|
|
|
|
|
|
|
: !($next_has_prefix && _nfa_has_suffix($nfa)) ? ($min , 0 ) |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
# suffix, no prefix |
824
|
|
|
|
|
|
|
: !($prev_has_suffix && _nfa_has_prefix($nfa)) ? (0 , $min ) |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
# suffix and prefix |
827
|
|
|
|
|
|
|
: (($min+1)/2, $min/2) |
828
|
|
|
|
|
|
|
; |
829
|
|
|
|
|
|
|
|
830
|
271
|
100
|
|
|
|
444
|
if ($min_left > 0) { |
831
|
13
|
|
|
|
|
27
|
push(@quant_parts, nfa_concat(nfa_clone(($nfa) x $min_left))); |
832
|
|
|
|
|
|
|
} |
833
|
271
|
100
|
100
|
|
|
672
|
if (length($max) == 0 || $max > $min) { |
834
|
270
|
100
|
|
|
|
579
|
if ($$nfa[0][0]) { |
|
495
|
100
|
|
|
|
855
|
|
835
|
|
|
|
|
|
|
# initial state already accepting |
836
|
7
|
|
|
|
|
15
|
($optional_part) = nfa_clone($nfa); |
837
|
|
|
|
|
|
|
} |
838
|
|
|
|
|
|
|
elsif ( |
839
|
639
|
|
|
|
|
909
|
!grep { $$_[1] == 0 } |
840
|
639
|
|
|
|
|
428
|
map { @{$$_[1]} } |
841
|
|
|
|
|
|
|
@$nfa |
842
|
|
|
|
|
|
|
) { |
843
|
|
|
|
|
|
|
# initial state not accepting and unreachable |
844
|
251
|
|
|
|
|
441
|
($optional_part) = nfa_clone($nfa); |
845
|
251
|
|
|
|
|
438
|
$$optional_part[0][0] = 1; |
846
|
|
|
|
|
|
|
} |
847
|
|
|
|
|
|
|
else { |
848
|
|
|
|
|
|
|
# initial state not accepting and reachable |
849
|
24
|
|
|
|
|
70
|
$optional_part = [ |
850
|
|
|
|
|
|
|
# additional root initial state accepting state |
851
|
|
|
|
|
|
|
[ |
852
|
|
|
|
|
|
|
1 # accepting |
853
|
12
|
|
|
|
|
27
|
, [ map {[$$_[0] , $$_[1]+1]} @{$$nfa[0][1]} ] # transitions |
|
32
|
|
|
|
|
70
|
|
854
|
|
|
|
|
|
|
] |
855
|
|
|
|
|
|
|
# original states with offset 1 |
856
|
12
|
|
|
|
|
25
|
, map { [ |
857
|
|
|
|
|
|
|
$$_[0] # accepting |
858
|
32
|
|
|
|
|
86
|
, [ map {[ $$_[0], $$_[1]+1 ]} @{$$_[1]} ] # transitions |
|
32
|
|
|
|
|
66
|
|
859
|
|
|
|
|
|
|
] } |
860
|
|
|
|
|
|
|
@$nfa |
861
|
|
|
|
|
|
|
]; |
862
|
|
|
|
|
|
|
} |
863
|
|
|
|
|
|
|
} |
864
|
271
|
100
|
|
|
|
452
|
if (length($max) == 0) { |
|
|
100
|
|
|
|
|
|
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
# starify optional part |
867
|
|
|
|
|
|
|
|
868
|
302
|
|
|
|
|
763
|
my %root_index_to_char_class |
869
|
256
|
|
|
|
|
396
|
= map { ($$_[1] => $$_[0]) } |
870
|
256
|
|
|
|
|
219
|
@{$$optional_part[0][1]} |
871
|
|
|
|
|
|
|
; |
872
|
|
|
|
|
|
|
|
873
|
256
|
|
|
|
|
364
|
my $state_ind_to_equiv = {}; |
874
|
|
|
|
|
|
|
# loop over accepting state indexes |
875
|
256
|
|
|
|
|
409
|
for (grep { $$optional_part[$_][0] } (1..$#$optional_part)) { |
|
366
|
|
|
|
|
586
|
|
876
|
256
|
100
|
|
|
|
768
|
if ( |
877
|
|
|
|
|
|
|
_transitions_is_subset( |
878
|
|
|
|
|
|
|
$$optional_part[$_][1] |
879
|
|
|
|
|
|
|
, $$optional_part[0][1] |
880
|
|
|
|
|
|
|
, { $_ => 0 } |
881
|
|
|
|
|
|
|
) |
882
|
|
|
|
|
|
|
) { |
883
|
|
|
|
|
|
|
# Accepting states whose transitions are |
884
|
|
|
|
|
|
|
# a subset of the transitions of the initial state |
885
|
|
|
|
|
|
|
# are equivalent to the initial state. |
886
|
251
|
|
|
|
|
668
|
$$state_ind_to_equiv{$_} = 0; |
887
|
|
|
|
|
|
|
} |
888
|
|
|
|
|
|
|
else { |
889
|
5
|
100
|
|
|
|
7
|
if ( |
890
|
5
|
|
|
|
|
15
|
grep { exists($root_index_to_char_class{$_}) } |
|
5
|
|
|
|
|
31
|
|
891
|
5
|
|
|
|
|
13
|
map { $$_[1] } |
892
|
|
|
|
|
|
|
@{$$optional_part[$_][1]} |
893
|
|
|
|
|
|
|
) { |
894
|
|
|
|
|
|
|
# merge char classes to the same state index |
895
|
2
|
|
|
|
|
6
|
my %new_index_to_char_classes |
896
|
2
|
|
|
|
|
4
|
= map { ($$_[1] => [$$_[0]]) } |
897
|
2
|
|
|
|
|
2
|
@{$$optional_part[$_][1]} |
898
|
|
|
|
|
|
|
; |
899
|
2
|
|
|
|
|
4
|
for (keys(%root_index_to_char_class)) { |
900
|
4
|
|
|
|
|
8
|
push ( |
901
|
4
|
|
|
|
|
5
|
@{$new_index_to_char_classes{$_}} |
902
|
|
|
|
|
|
|
, $root_index_to_char_class{$_} |
903
|
|
|
|
|
|
|
); |
904
|
|
|
|
|
|
|
} |
905
|
2
|
|
|
|
|
9
|
@{$$optional_part[$_][1]} |
|
4
|
|
|
|
|
10
|
|
906
|
2
|
|
|
|
|
4
|
= map {[ |
907
|
2
|
|
|
|
|
3
|
@{$new_index_to_char_classes{$_}} == 1 |
908
|
|
|
|
|
|
|
? $new_index_to_char_classes{$_}[0] |
909
|
4
|
100
|
|
|
|
5
|
: cc_union(@{$new_index_to_char_classes{$_}}) |
910
|
|
|
|
|
|
|
, $_ |
911
|
|
|
|
|
|
|
]} |
912
|
|
|
|
|
|
|
keys(%new_index_to_char_classes) |
913
|
|
|
|
|
|
|
; |
914
|
|
|
|
|
|
|
} |
915
|
|
|
|
|
|
|
else { |
916
|
3
|
|
|
|
|
7
|
push( |
917
|
5
|
|
|
|
|
15
|
@{$$optional_part[$_][1]} |
918
|
3
|
|
|
|
|
7
|
, map { [@$_] } @{$$optional_part[0][1]} |
|
3
|
|
|
|
|
11
|
|
919
|
|
|
|
|
|
|
); |
920
|
|
|
|
|
|
|
} |
921
|
|
|
|
|
|
|
} |
922
|
|
|
|
|
|
|
} |
923
|
256
|
100
|
|
|
|
828
|
push(@quant_parts, |
924
|
|
|
|
|
|
|
keys(%$state_ind_to_equiv) |
925
|
|
|
|
|
|
|
? _nfa_shrink_equiv($optional_part, $state_ind_to_equiv) |
926
|
|
|
|
|
|
|
: $optional_part |
927
|
|
|
|
|
|
|
); |
928
|
|
|
|
|
|
|
} |
929
|
|
|
|
|
|
|
elsif ($max > $min) { |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
# concatenate optional_part $max - $min times |
932
|
|
|
|
|
|
|
|
933
|
14
|
|
|
|
|
37
|
push(@quant_parts, _nfa_concat(1, nfa_clone( |
934
|
|
|
|
|
|
|
($optional_part) x ($max - $min) |
935
|
|
|
|
|
|
|
))); |
936
|
|
|
|
|
|
|
} |
937
|
271
|
100
|
|
|
|
529
|
if ($min_right > 0) { |
938
|
13
|
|
|
|
|
37
|
push(@quant_parts, nfa_concat(nfa_clone(($nfa) x $min_right))); |
939
|
|
|
|
|
|
|
} |
940
|
271
|
100
|
|
|
|
1034
|
return @quant_parts == 1 ? $quant_parts[0] : nfa_concat(@quant_parts); |
941
|
|
|
|
|
|
|
} |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
=item C |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>. |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
Let r be the number of given C<@in_nfas>, |
948
|
|
|
|
|
|
|
L_i the language accepted by C<$in_nfas[$i]> and M the language accepted |
949
|
|
|
|
|
|
|
by C<$out_nfa>. Then a word m belongs to M if and only if an ordered list |
950
|
|
|
|
|
|
|
(l_1, ..., l_r) of words exists, l_i belonging to L_i, such that |
951
|
|
|
|
|
|
|
m is the concatenation of (l_1, ..., l_r). |
952
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
=cut |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
sub nfa_concat { |
956
|
302
|
|
|
302
|
1
|
534
|
_nfa_concat(0, @_); |
957
|
|
|
|
|
|
|
} |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
sub _nfa_concat { |
960
|
316
|
|
|
316
|
|
386
|
my $starifying = shift(@_); |
961
|
316
|
50
|
|
|
|
552
|
if (!@_) { |
962
|
0
|
|
|
|
|
0
|
return [[1, []]]; # neutral element: accepting empty string |
963
|
|
|
|
|
|
|
} |
964
|
316
|
|
|
|
|
323
|
my $concat = shift(@_); # result, to be extended |
965
|
316
|
|
|
|
|
452
|
my @accepting_state_inds = grep { $$concat[$_][0] } (0..$#$concat); |
|
785
|
|
|
|
|
1051
|
|
966
|
316
|
|
|
|
|
358
|
my $state_ind_to_equiv = {}; |
967
|
|
|
|
|
|
|
my ( |
968
|
316
|
|
|
|
|
307
|
$state |
969
|
|
|
|
|
|
|
, $init_state_ind |
970
|
|
|
|
|
|
|
, $init_reachable |
971
|
|
|
|
|
|
|
, $init_equiv_reachable |
972
|
|
|
|
|
|
|
, $init_skipped |
973
|
|
|
|
|
|
|
, @new_accepting_state_inds |
974
|
|
|
|
|
|
|
); |
975
|
|
|
|
|
|
|
# extend @$concat |
976
|
316
|
|
|
|
|
404
|
for my $nfa (@_) { |
977
|
401
|
|
|
|
|
347
|
$init_state_ind = @$concat; |
978
|
401
|
|
|
|
|
333
|
$init_reachable = 0; |
979
|
401
|
|
|
|
|
296
|
$init_equiv_reachable = 0; |
980
|
401
|
|
|
|
|
311
|
$init_skipped = 0; |
981
|
|
|
|
|
|
|
@new_accepting_state_inds |
982
|
466
|
|
|
|
|
733
|
= map { $_ + $init_state_ind } |
|
1115
|
|
|
|
|
1213
|
|
983
|
401
|
|
|
|
|
534
|
grep { $$nfa[$_][0] } |
984
|
|
|
|
|
|
|
(0..$#$nfa) |
985
|
|
|
|
|
|
|
; |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
# renumber states, count states with transition to the initial state |
988
|
401
|
|
|
|
|
450
|
for (map { @{$$_[1]} } @$nfa) { |
|
1115
|
|
|
|
|
799
|
|
|
1115
|
|
|
|
|
1462
|
|
989
|
1063
|
100
|
|
|
|
2100
|
($$_[1] += $init_state_ind) == $init_state_ind |
990
|
|
|
|
|
|
|
&& $init_reachable++; |
991
|
|
|
|
|
|
|
} |
992
|
|
|
|
|
|
|
# join old accepting states with new initial state |
993
|
401
|
|
|
|
|
540
|
for my $acc_ind (@accepting_state_inds) { |
994
|
510
|
|
|
|
|
511
|
$state = $$concat[$acc_ind]; # old accepting state |
995
|
510
|
|
|
|
|
534
|
$$state[0] = $$nfa[0][0]; # overwrite accepting |
996
|
510
|
100
|
100
|
|
|
452
|
if ( |
|
|
100
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
997
|
510
|
|
|
|
|
1134
|
@{$$state[1]} == 0 # no transition |
|
274
|
|
|
|
|
1028
|
|
998
|
|
|
|
|
|
|
|| @{$$state[1]} == 1 # one transition |
999
|
|
|
|
|
|
|
&& _transitions_is_subset( |
1000
|
|
|
|
|
|
|
$$state[1] # transition of the old accepting state |
1001
|
|
|
|
|
|
|
, $$nfa[0][1] # transitions of the new initial state |
1002
|
|
|
|
|
|
|
, { $acc_ind => $init_state_ind } |
1003
|
|
|
|
|
|
|
) |
1004
|
42
|
|
|
|
|
165
|
) { |
1005
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
# Old accepting states whose transitions are |
1007
|
|
|
|
|
|
|
# a subset of the transitions of the new initial state |
1008
|
|
|
|
|
|
|
# are equivalent to the initial state. |
1009
|
|
|
|
|
|
|
# |
1010
|
|
|
|
|
|
|
# Note that such an old accepting states can have either |
1011
|
|
|
|
|
|
|
# no transition or one self-transition; |
1012
|
|
|
|
|
|
|
# the case that the old accepting state has no transition |
1013
|
|
|
|
|
|
|
# occurs very often. |
1014
|
|
|
|
|
|
|
# |
1015
|
|
|
|
|
|
|
# %$state_ind_to_equiv gets extended by |
1016
|
|
|
|
|
|
|
# |
1017
|
|
|
|
|
|
|
# $acc_ind (old accepting state) => $init_state_ind |
1018
|
|
|
|
|
|
|
# |
1019
|
|
|
|
|
|
|
# But the keys and the values of %$state_ind_to_equiv |
1020
|
|
|
|
|
|
|
# MUST remain disjoint (except for pairs key = val). |
1021
|
|
|
|
|
|
|
# |
1022
|
|
|
|
|
|
|
# Since $init_state_index are growing |
1023
|
|
|
|
|
|
|
# and $acc_ind < $init_state_index: |
1024
|
|
|
|
|
|
|
# - the new value does not belong the the keys |
1025
|
|
|
|
|
|
|
# - the new key may belong to the vals, |
1026
|
|
|
|
|
|
|
# such values must be updated. |
1027
|
|
|
|
|
|
|
# |
1028
|
|
|
|
|
|
|
# Example: |
1029
|
|
|
|
|
|
|
# 0 => 1 ( %$state_ind_to_equiv ) |
1030
|
|
|
|
|
|
|
# 1 => 2 ( $acc_ind => $init_state_index ) |
1031
|
|
|
|
|
|
|
# %$state_ind_to_equiv must be updated to |
1032
|
|
|
|
|
|
|
# 0 => 2 |
1033
|
|
|
|
|
|
|
# before being extended by |
1034
|
|
|
|
|
|
|
# 1 => 2 |
1035
|
244
|
|
|
|
|
473
|
for (grep { $_ == $acc_ind } values(%$state_ind_to_equiv)) { |
|
151
|
|
|
|
|
222
|
|
1036
|
3
|
|
|
|
|
7
|
$_ = $init_state_ind; |
1037
|
|
|
|
|
|
|
} |
1038
|
244
|
|
|
|
|
414
|
$$state_ind_to_equiv{$acc_ind} = $init_state_ind; |
1039
|
244
|
|
|
|
|
436
|
$init_equiv_reachable++; |
1040
|
|
|
|
|
|
|
} |
1041
|
|
|
|
|
|
|
elsif ( |
1042
|
|
|
|
|
|
|
$init_reachable == 1 |
1043
|
34
|
|
|
|
|
50
|
&& (grep { $$_[1] == $init_state_ind } @{$$nfa[0][1]}) |
|
29
|
|
|
|
|
46
|
|
1044
|
|
|
|
|
|
|
&& cc_is_subset( |
1045
|
|
|
|
|
|
|
|
1046
|
|
|
|
|
|
|
# char_class of the self-transition |
1047
|
|
|
|
|
|
|
# of the new initial state |
1048
|
|
|
|
|
|
|
( |
1049
|
33
|
|
|
|
|
48
|
map { $$_[0] } |
1050
|
29
|
|
|
|
|
47
|
grep { $$_[1] == $init_state_ind } |
1051
|
22
|
|
|
|
|
36
|
@{$$nfa[0][1]} |
1052
|
|
|
|
|
|
|
) |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
# char_class of the self-transition |
1055
|
|
|
|
|
|
|
# of the old accepting state |
1056
|
|
|
|
|
|
|
, ( |
1057
|
42
|
|
|
|
|
63
|
map { $$_[0] } |
1058
|
29
|
|
|
|
|
35
|
grep { $$_[1] == $acc_ind } |
1059
|
|
|
|
|
|
|
@{$$state[1]} |
1060
|
|
|
|
|
|
|
) |
1061
|
|
|
|
|
|
|
) |
1062
|
|
|
|
|
|
|
) { |
1063
|
|
|
|
|
|
|
# If the self-transitions of the new init state are |
1064
|
|
|
|
|
|
|
# a subset of the transitions of the old accepting state, |
1065
|
|
|
|
|
|
|
# the new state is not needed for looping; |
1066
|
|
|
|
|
|
|
# the transition to the new init state can be skipped. |
1067
|
|
|
|
|
|
|
# |
1068
|
|
|
|
|
|
|
# Example 1: |
1069
|
|
|
|
|
|
|
# [ab]*a* |
1070
|
|
|
|
|
|
|
# the state for a* is superfluous. |
1071
|
|
|
|
|
|
|
# Example 2: |
1072
|
|
|
|
|
|
|
# ( x[ab]* | y[ac]* | z[bc]* ) a* c |
1073
|
|
|
|
|
|
|
# the state for a* is only needed after [bc]* |
1074
|
|
|
|
|
|
|
# the regular expression is equivalent to: |
1075
|
|
|
|
|
|
|
# x[ab]*c | y[ac]*c | z[bc]*a*c |
1076
|
|
|
|
|
|
|
# |
1077
|
|
|
|
|
|
|
# Note that this one-letter-star optimization is |
1078
|
|
|
|
|
|
|
# probably not very useful for practical purposes; |
1079
|
|
|
|
|
|
|
# more general equivalences like (abc)*(abc)* ~ (abc)* |
1080
|
|
|
|
|
|
|
# are not caught up, while the focused use cases |
1081
|
|
|
|
|
|
|
# of prefix and suffix recognition require no star at all. |
1082
|
|
|
|
|
|
|
# |
1083
|
|
|
|
|
|
|
# It is merely a toy optimization for solving some exercises |
1084
|
|
|
|
|
|
|
# of an introductory course on regexs. |
1085
|
|
|
|
|
|
|
# |
1086
|
14
|
|
|
|
|
19
|
push(@{$$state[1]}, |
|
2
|
|
|
|
|
4
|
|
1087
|
16
|
|
|
|
|
26
|
map { [ @$_ ] } |
1088
|
14
|
|
|
|
|
26
|
grep { $$_[1] != $init_state_ind} |
1089
|
14
|
|
|
|
|
15
|
@{$$nfa[0][1]}) |
1090
|
|
|
|
|
|
|
; |
1091
|
14
|
|
|
|
|
27
|
$init_skipped++; |
1092
|
|
|
|
|
|
|
} |
1093
|
|
|
|
|
|
|
else { |
1094
|
252
|
|
|
|
|
314
|
push(@{$$state[1]}, |
|
279
|
|
|
|
|
765
|
|
1095
|
252
|
|
|
|
|
345
|
map { [ @$_ ] } |
1096
|
252
|
|
|
|
|
193
|
@{$$nfa[0][1]}) |
1097
|
|
|
|
|
|
|
; |
1098
|
|
|
|
|
|
|
} |
1099
|
|
|
|
|
|
|
} |
1100
|
401
|
100
|
100
|
|
|
1877
|
if ( |
|
|
|
100
|
|
|
|
|
1101
|
|
|
|
|
|
|
!$init_reachable && !$init_equiv_reachable |
1102
|
|
|
|
|
|
|
|| $init_skipped == @accepting_state_inds |
1103
|
|
|
|
|
|
|
) { |
1104
|
|
|
|
|
|
|
# for being removed by _nfa_shrink_equiv() |
1105
|
146
|
|
|
|
|
256
|
$$state_ind_to_equiv{$init_state_ind} = $init_state_ind; |
1106
|
|
|
|
|
|
|
} |
1107
|
|
|
|
|
|
|
|
1108
|
401
|
100
|
|
|
|
748
|
if (!$$nfa[0][0]) { |
|
|
100
|
|
|
|
|
|
1109
|
205
|
|
|
|
|
254
|
@accepting_state_inds = (); |
1110
|
|
|
|
|
|
|
} |
1111
|
|
|
|
|
|
|
elsif ($starifying) { |
1112
|
|
|
|
|
|
|
# $starifying set for optimizing x{n,m}. |
1113
|
|
|
|
|
|
|
# The old accepting states are redundant, |
1114
|
|
|
|
|
|
|
# since reachable iff the newer ones are. |
1115
|
13
|
|
|
|
|
22
|
for (@accepting_state_inds[1..$#accepting_state_inds]) { |
1116
|
15
|
|
|
|
|
21
|
$$concat[$_][0] = 0; |
1117
|
|
|
|
|
|
|
} |
1118
|
13
|
50
|
|
|
|
26
|
if (!$init_reachable) { |
1119
|
13
|
|
|
|
|
12
|
$$nfa[0][0] = 0; |
1120
|
13
|
|
|
|
|
13
|
shift(@new_accepting_state_inds); |
1121
|
|
|
|
|
|
|
} |
1122
|
13
|
|
|
|
|
15
|
@accepting_state_inds = (0); |
1123
|
|
|
|
|
|
|
} |
1124
|
|
|
|
|
|
|
else { |
1125
|
|
|
|
|
|
|
@accepting_state_inds |
1126
|
183
|
|
|
|
|
198
|
= grep { !exists($$state_ind_to_equiv{$_}) } |
|
200
|
|
|
|
|
428
|
|
1127
|
|
|
|
|
|
|
@accepting_state_inds |
1128
|
|
|
|
|
|
|
; |
1129
|
|
|
|
|
|
|
} |
1130
|
|
|
|
|
|
|
|
1131
|
401
|
|
|
|
|
612
|
push(@$concat, @$nfa); |
1132
|
401
|
|
|
|
|
527
|
push(@accepting_state_inds, @new_accepting_state_inds); |
1133
|
|
|
|
|
|
|
} |
1134
|
316
|
100
|
|
|
|
537
|
if (keys(%$state_ind_to_equiv)) { |
1135
|
257
|
|
|
|
|
358
|
return _nfa_shrink_equiv($concat, $state_ind_to_equiv); |
1136
|
|
|
|
|
|
|
} |
1137
|
|
|
|
|
|
|
else { |
1138
|
59
|
|
|
|
|
153
|
return $concat; |
1139
|
|
|
|
|
|
|
} |
1140
|
|
|
|
|
|
|
} |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
=item C |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>. |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
C<$out_nfa> accepts a word w if and only if at least one of C<@in_nfas> |
1147
|
|
|
|
|
|
|
accepts w. |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
=cut |
1150
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
# Adds the total number of states |
1152
|
|
|
|
|
|
|
sub nfa_union { |
1153
|
130
|
|
|
130
|
1
|
259
|
my $union = [[0, []]]; # root, neutral element: accepting nothing |
1154
|
130
|
|
|
|
|
166
|
my $state_ind_to_equiv = {}; |
1155
|
130
|
|
|
|
|
119
|
my $first_trivial_accepting_state_ind; |
1156
|
|
|
|
|
|
|
my ( |
1157
|
130
|
|
|
|
|
143
|
$nfa |
1158
|
|
|
|
|
|
|
, $init_state_ind |
1159
|
|
|
|
|
|
|
, $init_reachable |
1160
|
|
|
|
|
|
|
, $orig_state |
1161
|
|
|
|
|
|
|
); |
1162
|
|
|
|
|
|
|
|
1163
|
130
|
|
|
|
|
188
|
for $nfa (@_) { |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
# merge initial $accepting |
1166
|
300
|
|
100
|
|
|
868
|
$$union[0][0] ||= $$nfa[0][0]; |
1167
|
300
|
100
|
100
|
|
|
623
|
if (@$nfa == 1 && @{$$nfa[0][1]} == 0) { |
|
68
|
|
|
|
|
194
|
|
1168
|
57
|
|
|
|
|
86
|
next; |
1169
|
|
|
|
|
|
|
# Must be skipped because such a trivial state |
1170
|
|
|
|
|
|
|
# would be removed below (!$init_reachable) |
1171
|
|
|
|
|
|
|
# although it may be the $first_trivial_accepting state. |
1172
|
|
|
|
|
|
|
# |
1173
|
|
|
|
|
|
|
# On the other side, a well defined $nfa |
1174
|
|
|
|
|
|
|
# with a single state and with a non-empty transition list |
1175
|
|
|
|
|
|
|
# must loop to itself, thus $init_reachable. |
1176
|
|
|
|
|
|
|
} |
1177
|
|
|
|
|
|
|
|
1178
|
243
|
|
|
|
|
216
|
$init_state_ind = @$union; |
1179
|
243
|
|
|
|
|
210
|
$init_reachable = 0; |
1180
|
243
|
|
|
|
|
397
|
for (0..$#$nfa) { |
1181
|
1261
|
|
|
|
|
1174
|
$orig_state = $$nfa[$_]; |
1182
|
1261
|
100
|
100
|
|
|
2181
|
if ( |
1183
|
266
|
|
|
|
|
623
|
$$orig_state[0] # accepting |
1184
|
|
|
|
|
|
|
&& !@{$$orig_state[1]} # trivial |
1185
|
|
|
|
|
|
|
) { |
1186
|
164
|
100
|
|
|
|
225
|
if (defined($first_trivial_accepting_state_ind)) { |
1187
|
87
|
|
|
|
|
205
|
$$state_ind_to_equiv{$_ + $init_state_ind} |
1188
|
|
|
|
|
|
|
= $first_trivial_accepting_state_ind; |
1189
|
|
|
|
|
|
|
} |
1190
|
|
|
|
|
|
|
else { |
1191
|
77
|
|
|
|
|
134
|
$first_trivial_accepting_state_ind |
1192
|
|
|
|
|
|
|
= $_ + $init_state_ind; |
1193
|
|
|
|
|
|
|
} |
1194
|
|
|
|
|
|
|
} |
1195
|
|
|
|
|
|
|
else { |
1196
|
1097
|
|
|
|
|
701
|
for ( @{$$orig_state[1]} ) { # transition list |
|
1097
|
|
|
|
|
1245
|
|
1197
|
1311
|
100
|
100
|
|
|
2806
|
($$_[1] += $init_state_ind) == $init_state_ind |
1198
|
|
|
|
|
|
|
&& ($init_reachable ||= 1); |
1199
|
|
|
|
|
|
|
} |
1200
|
|
|
|
|
|
|
} |
1201
|
|
|
|
|
|
|
}; |
1202
|
243
|
|
|
|
|
529
|
push(@$union, @$nfa); |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
# merge initial $transitions |
1205
|
243
|
|
|
|
|
187
|
push(@{$$union[0][1]}, map { [ @$_ ] } @{$$nfa[0][1]}); |
|
243
|
|
|
|
|
318
|
|
|
274
|
|
|
|
|
496
|
|
|
243
|
|
|
|
|
324
|
|
1206
|
243
|
100
|
|
|
|
426
|
if (!$init_reachable) { |
1207
|
|
|
|
|
|
|
# for being removed by _nfa_shrink_equiv() |
1208
|
214
|
|
|
|
|
449
|
$$state_ind_to_equiv{$init_state_ind} = $init_state_ind; |
1209
|
|
|
|
|
|
|
} |
1210
|
|
|
|
|
|
|
}; |
1211
|
130
|
100
|
|
|
|
298
|
if (keys(%$state_ind_to_equiv)) { |
1212
|
120
|
|
|
|
|
185
|
return _nfa_shrink_equiv($union, $state_ind_to_equiv); |
1213
|
|
|
|
|
|
|
} |
1214
|
|
|
|
|
|
|
else { |
1215
|
10
|
|
|
|
|
30
|
return $union; |
1216
|
|
|
|
|
|
|
} |
1217
|
|
|
|
|
|
|
} |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
{ |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
my %cached_cc_inter2; |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
=item C |
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
Returns C<$out_nfa>, a $C<$nfa> computed from C<@in_nfas>. |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
C<$out_nfa> accepts a word w if and only if each of C<@in_nfas> accepts w. |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
=cut |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
sub nfa_inter { |
1232
|
30
|
|
|
30
|
1
|
635
|
my ($inter, @nfas) = sort { @$a <=> @$b } @_; |
|
28
|
|
|
|
|
93
|
|
1233
|
30
|
|
|
|
|
53
|
for (@nfas) { $inter = nfa_inter2($inter, $_); } |
|
25
|
|
|
|
|
65
|
|
1234
|
|
|
|
|
|
|
return |
1235
|
30
|
|
50
|
|
|
159
|
$inter |
1236
|
|
|
|
|
|
|
|| [[1, [[$cc_any, 0]]]] # neutral element: accepting anything |
1237
|
|
|
|
|
|
|
; |
1238
|
|
|
|
|
|
|
} |
1239
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
# Multiplies the total number of states |
1241
|
|
|
|
|
|
|
sub nfa_inter2 { |
1242
|
25
|
|
|
25
|
0
|
32
|
my ($nfa_0, $nfa_1) = @_; |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
# computed states |
1245
|
25
|
|
|
|
|
46
|
my @todo = (0); |
1246
|
25
|
|
|
|
|
31
|
my %todo_seen; # set of state_inds |
1247
|
|
|
|
|
|
|
my %done; # key-subset of %todo_seen (values are states) |
1248
|
|
|
|
|
|
|
# After the following while, %done are %todo_seen the same set. |
1249
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
# dead end detection |
1251
|
0
|
|
|
|
|
0
|
my %path_tr; |
1252
|
0
|
|
|
|
|
0
|
my @cur_livings; |
1253
|
0
|
|
|
|
|
0
|
my %livings; |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
# tmp variables |
1256
|
|
|
|
|
|
|
my ( |
1257
|
0
|
|
|
|
|
0
|
$from_state_ind, $to_state_ind |
1258
|
|
|
|
|
|
|
, $nfa_0_accepting, $nfa_0_transitions |
1259
|
|
|
|
|
|
|
, $nfa_1_accepting, $nfa_1_transitions |
1260
|
|
|
|
|
|
|
, $t_0, $t_1 |
1261
|
|
|
|
|
|
|
, $char_class |
1262
|
|
|
|
|
|
|
, $accepting |
1263
|
|
|
|
|
|
|
, @keys_path_to_state_ind |
1264
|
|
|
|
|
|
|
); |
1265
|
|
|
|
|
|
|
|
1266
|
25
|
|
|
|
|
40
|
my $nfa_1_len = @$nfa_1; |
1267
|
|
|
|
|
|
|
|
1268
|
25
|
|
|
|
|
65
|
while (@todo) { |
1269
|
1428
|
|
|
|
|
1538
|
$todo_seen{$from_state_ind} = $from_state_ind = pop(@todo); |
1270
|
|
|
|
|
|
|
|
1271
|
1428
|
|
|
|
|
1783
|
($nfa_0_accepting, $nfa_0_transitions) |
1272
|
1428
|
|
|
|
|
1062
|
= @{$$nfa_0[$from_state_ind / $nfa_1_len]}; # i-th state |
1273
|
1428
|
|
|
|
|
1529
|
($nfa_1_accepting, $nfa_1_transitions) |
1274
|
1428
|
|
|
|
|
991
|
= @{$$nfa_1[$from_state_ind % $nfa_1_len]}; # j-th state |
1275
|
|
|
|
|
|
|
|
1276
|
1428
|
|
|
|
|
1312
|
my $new_transitions = []; |
1277
|
1428
|
|
|
|
|
1342
|
for $t_0 (@$nfa_0_transitions) { |
1278
|
2953
|
|
|
|
|
2418
|
for $t_1 (@$nfa_1_transitions) { |
1279
|
|
|
|
|
|
|
|
1280
|
5804
|
100
|
66
|
|
|
16775
|
if ( |
1281
|
|
|
|
|
|
|
( |
1282
|
|
|
|
|
|
|
$char_class |
1283
|
|
|
|
|
|
|
= $cached_cc_inter2{$$t_0[0]}{$$t_1[0]} |
1284
|
|
|
|
|
|
|
||= &cc_inter2($$t_0[0], $$t_1[0]) |
1285
|
|
|
|
|
|
|
) != $cc_none |
1286
|
|
|
|
|
|
|
) { |
1287
|
2936
|
|
|
|
|
4322
|
push (@$new_transitions, [ |
1288
|
|
|
|
|
|
|
$char_class |
1289
|
|
|
|
|
|
|
, $to_state_ind = $$t_0[1] * $nfa_1_len + $$t_1[1] |
1290
|
|
|
|
|
|
|
]); |
1291
|
2936
|
100
|
|
|
|
4283
|
if (!exists($todo_seen{$to_state_ind})) { |
1292
|
1403
|
|
|
|
|
2248
|
push(@todo, |
1293
|
|
|
|
|
|
|
$todo_seen{$to_state_ind} = $to_state_ind); |
1294
|
|
|
|
|
|
|
} |
1295
|
2936
|
|
|
|
|
5250
|
$path_tr{$to_state_ind}{$from_state_ind} = undef; |
1296
|
|
|
|
|
|
|
} |
1297
|
|
|
|
|
|
|
} |
1298
|
|
|
|
|
|
|
} |
1299
|
1428
|
100
|
100
|
|
|
3021
|
if ($accepting = $nfa_0_accepting && $nfa_1_accepting) { |
1300
|
26
|
|
|
|
|
41
|
push(@cur_livings, $from_state_ind); |
1301
|
|
|
|
|
|
|
} |
1302
|
1428
|
|
|
|
|
3216
|
$done{$from_state_ind} = [ |
1303
|
|
|
|
|
|
|
$accepting |
1304
|
|
|
|
|
|
|
, $new_transitions |
1305
|
|
|
|
|
|
|
]; |
1306
|
|
|
|
|
|
|
} |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
# remove dead ends |
1309
|
25
|
|
|
|
|
54
|
%livings = map { ($_ => $_) } @cur_livings; |
|
26
|
|
|
|
|
90
|
|
1310
|
25
|
|
|
|
|
65
|
while (@cur_livings) { |
1311
|
945
|
|
|
|
|
2032
|
push(@cur_livings, |
1312
|
2072
|
|
|
|
|
2360
|
map { $livings{$_} = $_ } |
1313
|
971
|
|
|
|
|
1540
|
grep { !exists($livings{$_}) } |
1314
|
971
|
|
|
|
|
677
|
keys(%{$path_tr{pop(@cur_livings)}}) |
1315
|
|
|
|
|
|
|
); |
1316
|
|
|
|
|
|
|
} |
1317
|
|
|
|
|
|
|
|
1318
|
25
|
50
|
|
|
|
84
|
if (keys(%livings) == 0) { |
1319
|
0
|
|
|
|
|
0
|
return [[0, []]]; |
1320
|
|
|
|
|
|
|
} |
1321
|
|
|
|
|
|
|
|
1322
|
|
|
|
|
|
|
# compact renumbering |
1323
|
25
|
|
|
|
|
36
|
my @sorted_keys; |
1324
|
|
|
|
|
|
|
my $inter = [@done{ |
1325
|
25
|
|
|
|
|
219
|
@sorted_keys = sort { $a <=> $b } keys(%livings) |
|
4434
|
|
|
|
|
3443
|
|
1326
|
|
|
|
|
|
|
}]; |
1327
|
25
|
|
|
|
|
66
|
my $i = 0; |
1328
|
25
|
|
|
|
|
48
|
my %compact_map = map { ($_ => $i++) } @sorted_keys; |
|
971
|
|
|
|
|
1056
|
|
1329
|
|
|
|
|
|
|
|
1330
|
25
|
|
|
|
|
99
|
for ( |
1331
|
971
|
|
|
|
|
1597
|
map { |
1332
|
2380
|
|
|
|
|
3193
|
@{$$_[1]} |
1333
|
971
|
|
|
|
|
957
|
= grep { exists($compact_map{$$_[1]}) } |
1334
|
971
|
|
|
|
|
602
|
@{$$_[1]} |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
@$inter |
1337
|
|
|
|
|
|
|
) { |
1338
|
2072
|
|
|
|
|
1913
|
$$_[1] = $compact_map{$$_[1]}; |
1339
|
|
|
|
|
|
|
} |
1340
|
25
|
|
|
|
|
1175
|
return $inter; |
1341
|
|
|
|
|
|
|
} |
1342
|
|
|
|
|
|
|
} |
1343
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
sub nfa_resolve_anchors { |
1345
|
10
|
|
|
10
|
0
|
14
|
my ($nfa) = @_; |
1346
|
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
|
# find state_inds reachable from the root by begin-anchor transitions |
1348
|
10
|
|
|
|
|
21
|
my %begs = (0 => undef); |
1349
|
10
|
|
|
|
|
18
|
my @todo = (0); |
1350
|
10
|
|
|
|
|
33
|
while (defined(my $beg = pop(@todo))) { |
1351
|
13
|
|
|
|
|
15
|
for ( |
1352
|
3
|
|
|
|
|
6
|
map { $$_[1] } # state_ind |
|
23
|
|
|
|
|
66
|
|
1353
|
13
|
|
|
|
|
83
|
grep { $$_[0][0][1] == -1 } # begin-anchor |
1354
|
|
|
|
|
|
|
@{$$nfa[$beg][1]} |
1355
|
|
|
|
|
|
|
) { |
1356
|
3
|
50
|
|
|
|
7
|
if (!exists($begs{$_})) { |
1357
|
3
|
|
|
|
|
3
|
$begs{$_} = undef; |
1358
|
3
|
|
|
|
|
9
|
push(@todo, $_); |
1359
|
|
|
|
|
|
|
} |
1360
|
|
|
|
|
|
|
} |
1361
|
|
|
|
|
|
|
} |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
# find state_inds leading to an accepting state by end-anchor transitions |
1364
|
10
|
|
|
|
|
12
|
my @cur_livings; |
1365
|
|
|
|
|
|
|
my %path_tr; |
1366
|
10
|
|
|
|
|
28
|
for my $from_state_ind (0..$#$nfa) { |
1367
|
38
|
|
|
|
|
24
|
for (@{$$nfa[$from_state_ind][1]}) { |
|
38
|
|
|
|
|
61
|
|
1368
|
46
|
|
|
|
|
84
|
$path_tr{$$_[1]}{$from_state_ind} = $$_[0]; |
1369
|
|
|
|
|
|
|
} |
1370
|
38
|
100
|
|
|
|
82
|
if ($$nfa[$from_state_ind][0]) { |
1371
|
10
|
|
|
|
|
20
|
push(@cur_livings, $from_state_ind); |
1372
|
|
|
|
|
|
|
} |
1373
|
|
|
|
|
|
|
} |
1374
|
10
|
|
|
|
|
14
|
my %livings = map {($_ => undef)} @cur_livings; |
|
10
|
|
|
|
|
23
|
|
1375
|
10
|
|
|
|
|
28
|
while (defined(my $end = pop(@cur_livings))) { |
1376
|
10
|
|
|
|
|
11
|
for ( |
1377
|
16
|
|
|
|
|
55
|
grep { |
1378
|
10
|
|
|
|
|
24
|
$path_tr{$end}{$_}[0][0] == -3; # end-anchor |
1379
|
|
|
|
|
|
|
} |
1380
|
|
|
|
|
|
|
keys(%{$path_tr{$end}}) |
1381
|
|
|
|
|
|
|
) { |
1382
|
0
|
0
|
|
|
|
0
|
if (!exists($livings{$_})) { |
1383
|
0
|
|
|
|
|
0
|
push(@cur_livings, $livings{$_} = undef); |
1384
|
0
|
|
|
|
|
0
|
$$nfa[$_][0] = 1; |
1385
|
|
|
|
|
|
|
} |
1386
|
|
|
|
|
|
|
} |
1387
|
|
|
|
|
|
|
} |
1388
|
|
|
|
|
|
|
|
1389
|
10
|
|
|
|
|
11
|
my $accept_empty; |
1390
|
10
|
100
|
|
|
|
16
|
if (!($accept_empty = scalar(grep {$$nfa[$_][0]} keys(%begs)) ? 1 : 0)) { |
|
13
|
100
|
|
|
|
47
|
|
1391
|
|
|
|
|
|
|
# special case for $^ for and the like: empty string matches |
1392
|
9
|
|
|
|
|
10
|
my %begends; |
1393
|
9
|
|
|
|
|
18
|
my @todo = keys(%begs); |
1394
|
9
|
|
|
|
|
22
|
while (defined(my $begend = pop(@todo))) { |
1395
|
23
|
|
|
|
|
21
|
for ( |
1396
|
17
|
|
|
|
|
28
|
map { $$_[1] } # state_ind |
|
33
|
|
|
|
|
64
|
|
1397
|
23
|
|
|
|
|
34
|
grep { $$_[0][0][1] < 0 } # anchor |
1398
|
|
|
|
|
|
|
@{$$nfa[$begend][1]} |
1399
|
|
|
|
|
|
|
) { |
1400
|
17
|
50
|
66
|
|
|
57
|
if (!exists($begs{$_}) && !exists($begends{$_})) { |
1401
|
14
|
100
|
|
|
|
31
|
if ($$nfa[$_][0]) { |
1402
|
3
|
|
|
|
|
8
|
$accept_empty = 1; |
1403
|
3
|
|
|
|
|
6
|
@todo = (); |
1404
|
3
|
|
|
|
|
10
|
last; |
1405
|
|
|
|
|
|
|
} |
1406
|
11
|
|
|
|
|
12
|
$begends{$_} = undef; |
1407
|
11
|
|
|
|
|
33
|
push(@todo, $_); |
1408
|
|
|
|
|
|
|
} |
1409
|
|
|
|
|
|
|
} |
1410
|
|
|
|
|
|
|
} |
1411
|
|
|
|
|
|
|
} |
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
# remove anchors |
1414
|
10
|
|
|
|
|
24
|
for my $from_state_ind ( |
|
46
|
|
|
|
|
71
|
|
1415
|
|
|
|
|
|
|
grep { |
1416
|
38
|
|
|
|
|
50
|
grep { $$_[0][0][0] < 0 } # anchor |
1417
|
38
|
|
|
|
|
26
|
@{$$nfa[$_][1]} # transitions |
1418
|
|
|
|
|
|
|
} |
1419
|
|
|
|
|
|
|
(0..$#$nfa) |
1420
|
|
|
|
|
|
|
) { |
1421
|
20
|
|
|
|
|
24
|
my $state = $$nfa[$from_state_ind]; |
1422
|
|
|
|
|
|
|
$$state[1] = [ |
1423
|
|
|
|
|
|
|
map { |
1424
|
31
|
100
|
|
|
|
52
|
if ($$_[0][0][0] >= 0) { |
|
22
|
50
|
|
|
|
35
|
|
|
20
|
|
|
|
|
23
|
|
1425
|
9
|
|
|
|
|
30
|
$_; |
1426
|
|
|
|
|
|
|
} |
1427
|
|
|
|
|
|
|
elsif ( @{$$_[0]} == 1 ) { |
1428
|
22
|
|
|
|
|
32
|
delete($path_tr{$$_[1]}{$from_state_ind}); |
1429
|
22
|
|
|
|
|
45
|
(); |
1430
|
|
|
|
|
|
|
} |
1431
|
|
|
|
|
|
|
else { |
1432
|
0
|
|
|
|
|
0
|
$path_tr{$$_[1]}{$from_state_ind} |
1433
|
|
|
|
|
|
|
= $$_[0] |
1434
|
0
|
|
|
|
|
0
|
= interval_list_to_cc(@{$$_[0]}[1..$#{$$_[0]}]); |
|
0
|
|
|
|
|
0
|
|
1435
|
0
|
|
|
|
|
0
|
$_; |
1436
|
|
|
|
|
|
|
} |
1437
|
|
|
|
|
|
|
} |
1438
|
20
|
|
|
|
|
17
|
@{$$state[1]} # transitions |
1439
|
|
|
|
|
|
|
]; |
1440
|
|
|
|
|
|
|
} |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
# ensure that the initial state cannot be reached |
1443
|
10
|
100
|
|
|
|
12
|
if (@{$$nfa[0][1]}) { |
|
10
|
|
|
|
|
26
|
|
1444
|
|
|
|
|
|
|
# proper init transitions (clone of the initial state needed) |
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
# replace transitions to the initial state |
1447
|
|
|
|
|
|
|
# with transitions to the cloned initial state |
1448
|
8
|
|
|
|
|
10
|
my $new_state_ind = @$nfa; |
1449
|
8
|
|
|
|
|
8
|
my $clone_reachable; |
1450
|
8
|
|
|
|
|
22
|
for my $transition ( |
|
22
|
|
|
|
|
39
|
|
1451
|
33
|
|
|
|
|
40
|
grep { $$_[1] == 0 } # to initial state |
1452
|
33
|
|
|
|
|
26
|
map { @{$$_[1]} } # transitions |
1453
|
|
|
|
|
|
|
@$nfa |
1454
|
|
|
|
|
|
|
) { |
1455
|
8
|
|
|
|
|
13
|
$$transition[1] = $new_state_ind; |
1456
|
8
|
|
|
|
|
11
|
$clone_reachable = 1; |
1457
|
|
|
|
|
|
|
} |
1458
|
|
|
|
|
|
|
|
1459
|
8
|
50
|
|
|
|
19
|
if ($clone_reachable) { |
1460
|
8
|
|
|
|
|
19
|
my $new_state = [ |
1461
|
|
|
|
|
|
|
$$nfa[0][0] |
1462
|
8
|
|
|
|
|
13
|
, [@{$$nfa[0][1]}] |
1463
|
|
|
|
|
|
|
]; |
1464
|
8
|
|
|
|
|
11
|
push(@$nfa, $new_state); |
1465
|
8
|
|
|
|
|
13
|
$path_tr{$new_state_ind} = $path_tr{0}; |
1466
|
8
|
|
|
|
|
10
|
for (@{$$nfa[0][1]}) { |
|
8
|
|
|
|
|
16
|
|
1467
|
10
|
|
|
|
|
19
|
$path_tr{$$_[1]}{$new_state_ind} = $$_[0]; |
1468
|
|
|
|
|
|
|
} |
1469
|
8
|
50
|
|
|
|
23
|
if ($$nfa[0][0]) { |
1470
|
0
|
|
|
|
|
0
|
$livings{$new_state_ind} = undef; |
1471
|
|
|
|
|
|
|
} |
1472
|
|
|
|
|
|
|
} |
1473
|
|
|
|
|
|
|
} |
1474
|
|
|
|
|
|
|
else { |
1475
|
|
|
|
|
|
|
# no proper init transitions |
1476
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
# drop transitions to the initial state |
1478
|
2
|
|
|
|
|
4
|
for my $state (@$nfa) { |
1479
|
5
|
|
|
|
|
5
|
@{$$state[1]} = grep { $$_[1] != 0 } @{$$state[1]}; |
|
5
|
|
|
|
|
8
|
|
|
2
|
|
|
|
|
5
|
|
|
5
|
|
|
|
|
6
|
|
1480
|
|
|
|
|
|
|
} |
1481
|
|
|
|
|
|
|
} |
1482
|
10
|
|
|
|
|
15
|
delete($path_tr{0}); |
1483
|
|
|
|
|
|
|
|
1484
|
|
|
|
|
|
|
# extend initial state (merge all initial states of %begs) |
1485
|
10
|
100
|
|
|
|
25
|
if (keys(%begs) > 1) { |
1486
|
2
|
|
|
|
|
3
|
my %state_ind_to_char_classes; |
1487
|
2
|
|
|
|
|
2
|
for ( map { @{$$nfa[$_][1]} } keys(%begs) ) { |
|
5
|
|
|
|
|
3
|
|
|
5
|
|
|
|
|
12
|
|
1488
|
6
|
|
|
|
|
7
|
push(@{$state_ind_to_char_classes{$$_[1]}}, $$_[0]); |
|
6
|
|
|
|
|
12
|
|
1489
|
|
|
|
|
|
|
} |
1490
|
2
|
|
|
|
|
7
|
@{$$nfa[0][1]} |
|
6
|
|
|
|
|
13
|
|
1491
|
2
|
|
|
|
|
5
|
= map { [ |
1492
|
6
|
|
|
|
|
7
|
$path_tr{$_}{0} = cc_union(@{$state_ind_to_char_classes{$_}}) |
1493
|
|
|
|
|
|
|
, int($_) |
1494
|
|
|
|
|
|
|
] } |
1495
|
|
|
|
|
|
|
keys(%state_ind_to_char_classes) |
1496
|
|
|
|
|
|
|
; |
1497
|
|
|
|
|
|
|
} |
1498
|
10
|
100
|
|
|
|
25
|
if ($$nfa[0][0] = $accept_empty) { |
1499
|
4
|
|
|
|
|
7
|
$livings{0} = undef; |
1500
|
|
|
|
|
|
|
} |
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
# remove unreachable states |
1503
|
10
|
|
|
|
|
15
|
my @cur_reachables = (0); |
1504
|
10
|
|
|
|
|
17
|
my %reachables = (0 => 0); |
1505
|
10
|
|
|
|
|
31
|
while (@cur_reachables) { |
1506
|
24
|
|
|
|
|
27
|
my $from_state_ind = shift(@cur_reachables); |
1507
|
24
|
|
|
|
|
20
|
for ( |
1508
|
25
|
|
|
|
|
38
|
map { $$_[1] } |
|
24
|
|
|
|
|
36
|
|
1509
|
|
|
|
|
|
|
@{$$nfa[$from_state_ind][1]} |
1510
|
|
|
|
|
|
|
) { |
1511
|
25
|
100
|
|
|
|
58
|
if (!exists($reachables{$_})) { |
1512
|
14
|
|
|
|
|
35
|
push(@cur_reachables, $reachables{$_} = $_); |
1513
|
|
|
|
|
|
|
} |
1514
|
|
|
|
|
|
|
} |
1515
|
|
|
|
|
|
|
} |
1516
|
|
|
|
|
|
|
|
1517
|
|
|
|
|
|
|
# remove dead ends |
1518
|
10
|
|
|
|
|
17
|
delete(@livings{grep { !exists($reachables{$_}) } keys(%livings)}); |
|
13
|
|
|
|
|
22
|
|
1519
|
10
|
|
|
|
|
17
|
@cur_livings = keys(%livings); |
1520
|
10
|
|
|
|
|
23
|
while (@cur_livings) { |
1521
|
11
|
|
|
|
|
11
|
for ( |
1522
|
10
|
|
|
|
|
16
|
grep { exists($reachables{$_}) } |
|
11
|
|
|
|
|
140
|
|
1523
|
|
|
|
|
|
|
keys(%{$path_tr{pop(@cur_livings)}}) |
1524
|
|
|
|
|
|
|
) { |
1525
|
8
|
100
|
|
|
|
15
|
if (!exists($livings{$_})) { |
1526
|
5
|
|
|
|
|
6
|
push(@cur_livings, $_); |
1527
|
5
|
|
|
|
|
9
|
$livings{$_} = undef; |
1528
|
|
|
|
|
|
|
} |
1529
|
|
|
|
|
|
|
} |
1530
|
|
|
|
|
|
|
} |
1531
|
|
|
|
|
|
|
|
1532
|
10
|
100
|
|
|
|
34
|
if (keys(%livings) == 0) { |
|
|
50
|
|
|
|
|
|
1533
|
4
|
|
|
|
|
30
|
return [[0, []]]; |
1534
|
|
|
|
|
|
|
} |
1535
|
|
|
|
|
|
|
elsif (keys(%livings) == @$nfa) { |
1536
|
0
|
|
|
|
|
0
|
return $nfa; |
1537
|
|
|
|
|
|
|
} |
1538
|
|
|
|
|
|
|
|
1539
|
|
|
|
|
|
|
# compact renumbering |
1540
|
6
|
|
|
|
|
15
|
my @sorted_keys = sort { $a <=> $b } keys(%livings); |
|
6
|
|
|
|
|
9
|
|
1541
|
6
|
|
|
|
|
8
|
my $i = 0; |
1542
|
6
|
|
|
|
|
11
|
my %compact_map = map { ($_ => $i++) } @sorted_keys; |
|
11
|
|
|
|
|
20
|
|
1543
|
|
|
|
|
|
|
|
1544
|
|
|
|
|
|
|
return [ |
1545
|
11
|
|
|
|
|
15
|
map { |
1546
|
6
|
|
|
|
|
16
|
@{$$_[1]} |
|
8
|
|
|
|
|
10
|
|
1547
|
|
|
|
|
|
|
= map { |
1548
|
13
|
|
|
|
|
23
|
$$_[1] = $compact_map{$$_[1]}; |
1549
|
8
|
|
|
|
|
8
|
$_; |
1550
|
|
|
|
|
|
|
} |
1551
|
11
|
|
|
|
|
14
|
grep { exists($compact_map{$$_[1]}) } |
1552
|
11
|
|
|
|
|
12
|
@{$$_[1]} |
1553
|
|
|
|
|
|
|
; |
1554
|
11
|
|
|
|
|
53
|
$_; |
1555
|
|
|
|
|
|
|
} |
1556
|
|
|
|
|
|
|
@$nfa[@sorted_keys] |
1557
|
|
|
|
|
|
|
]; |
1558
|
|
|
|
|
|
|
} |
1559
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
=item C |
1561
|
|
|
|
|
|
|
|
1562
|
|
|
|
|
|
|
Returns true if and only if C<$in_nfa> accepts C<$str>. |
1563
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
=cut |
1565
|
|
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
sub nfa_match { |
1567
|
19
|
|
|
19
|
1
|
3360
|
my ($nfa, $str) = @_; |
1568
|
|
|
|
|
|
|
|
1569
|
19
|
|
|
|
|
36
|
my %state_inds = (0 => 0); |
1570
|
19
|
|
|
|
|
52
|
for my $c ( map { ord($_) } split('', $str) ) { |
|
119
|
|
|
|
|
114
|
|
1571
|
66
|
|
|
|
|
177
|
%state_inds |
1572
|
105
|
|
|
|
|
128
|
= map { $$_[1] => $$_[1] } |
1573
|
74
|
|
|
|
|
115
|
grep { cc_match($$_[0], $c) } # matching transition list |
1574
|
119
|
|
|
|
|
143
|
map { @{$$_[1]} } # all transition list |
|
74
|
|
|
|
|
51
|
|
1575
|
|
|
|
|
|
|
@$nfa[values(%state_inds)] # current states |
1576
|
|
|
|
|
|
|
; |
1577
|
|
|
|
|
|
|
} |
1578
|
|
|
|
|
|
|
|
1579
|
19
|
|
|
|
|
62
|
return grep { $$_[0] } @$nfa[values(%state_inds)]; |
|
11
|
|
|
|
|
60
|
|
1580
|
|
|
|
|
|
|
} |
1581
|
|
|
|
|
|
|
|
1582
|
|
|
|
|
|
|
sub nfa_dump { |
1583
|
0
|
|
|
0
|
0
|
0
|
my ($nfa) = @_; |
1584
|
0
|
|
|
|
|
0
|
my $dump = ''; |
1585
|
0
|
|
|
|
|
0
|
for my $i (0..$#$nfa) { |
1586
|
0
|
0
|
|
|
|
0
|
$dump |
1587
|
|
|
|
|
|
|
.= "$i:" |
1588
|
|
|
|
|
|
|
. ($$nfa[$i][0] ? " (accepting)" : "") |
1589
|
|
|
|
|
|
|
. "\n" |
1590
|
|
|
|
|
|
|
; |
1591
|
0
|
|
|
|
|
0
|
for my $transition (@{$$nfa[$i][1]}) { |
|
0
|
|
|
|
|
0
|
|
1592
|
0
|
|
|
|
|
0
|
$dump |
1593
|
|
|
|
|
|
|
.= " " |
1594
|
|
|
|
|
|
|
. cc_to_regex($$transition[0]) . " => $$transition[1]\n"; |
1595
|
|
|
|
|
|
|
} |
1596
|
|
|
|
|
|
|
} |
1597
|
0
|
|
|
|
|
0
|
return $dump; |
1598
|
|
|
|
|
|
|
} |
1599
|
|
|
|
|
|
|
|
1600
|
|
|
|
|
|
|
=item C |
1601
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
Returns true if and only if the labeled graphs represented by C<$nfa1> |
1603
|
|
|
|
|
|
|
and C<$nfa2> are isomorph. While isomorph C<$nfa>s accept the same language, |
1604
|
|
|
|
|
|
|
the converse is not true. |
1605
|
|
|
|
|
|
|
|
1606
|
|
|
|
|
|
|
=cut |
1607
|
|
|
|
|
|
|
|
1608
|
|
|
|
|
|
|
sub nfa_isomorph { |
1609
|
99
|
|
|
99
|
1
|
1606
|
my ($nfa1, $nfa2) = @_; |
1610
|
|
|
|
|
|
|
|
1611
|
99
|
|
|
|
|
220
|
my %nfa1_nfa2_indexes = (0 => 0); |
1612
|
99
|
|
|
|
|
167
|
my %nfa2_nfa1_indexes = (0 => 0); |
1613
|
99
|
|
|
|
|
162
|
my @nfa1_index_todo = (0); |
1614
|
|
|
|
|
|
|
|
1615
|
99
|
|
|
|
|
287
|
while (defined(my $nfa1_index = pop(@nfa1_index_todo))) { |
1616
|
|
|
|
|
|
|
|
1617
|
583
|
|
|
|
|
621
|
my $state1 = $$nfa1[$nfa1_index]; |
1618
|
583
|
|
|
|
|
577
|
my $state2 = $$nfa2[$nfa1_nfa2_indexes{$nfa1_index}]; |
1619
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
# accepting |
1621
|
583
|
50
|
|
|
|
969
|
if ($$state1[0] != $$state2[0]) { |
1622
|
0
|
|
|
|
|
0
|
return 0; |
1623
|
|
|
|
|
|
|
} |
1624
|
|
|
|
|
|
|
|
1625
|
|
|
|
|
|
|
# transitions |
1626
|
583
|
|
|
|
|
435
|
my $transitions1 = [sort { $$a[0] <=> $$b[0] } @{$$state1[1]}]; |
|
2079
|
|
|
|
|
2283
|
|
|
583
|
|
|
|
|
1026
|
|
1627
|
583
|
|
|
|
|
502
|
my $transitions2 = [sort { $$a[0] <=> $$b[0] } @{$$state2[1]}]; |
|
2085
|
|
|
|
|
1961
|
|
|
583
|
|
|
|
|
737
|
|
1628
|
583
|
50
|
|
|
|
912
|
if (@$transitions1 != @$transitions2) { |
1629
|
0
|
|
|
|
|
0
|
return 0; |
1630
|
|
|
|
|
|
|
} |
1631
|
583
|
|
|
|
|
765
|
for my $i (0..$#$transitions1) { |
1632
|
1662
|
|
|
|
|
1117
|
my ($cc1, $next_index1) = @{$$transitions1[$i]}; |
|
1662
|
|
|
|
|
1900
|
|
1633
|
1662
|
|
|
|
|
1229
|
my ($cc2, $next_index2) = @{$$transitions2[$i]}; |
|
1662
|
|
|
|
|
1557
|
|
1634
|
1662
|
50
|
|
|
|
2655
|
if ($cc1 ne $cc2) { |
1635
|
0
|
|
|
|
|
0
|
return 0; |
1636
|
|
|
|
|
|
|
} |
1637
|
1662
|
100
|
|
|
|
2224
|
if (exists($nfa1_nfa2_indexes{$next_index1})) { |
|
|
50
|
|
|
|
|
|
1638
|
1178
|
50
|
|
|
|
2983
|
if ($nfa1_nfa2_indexes{$next_index1} != $next_index2) { |
1639
|
0
|
|
|
|
|
0
|
return 0; |
1640
|
|
|
|
|
|
|
} |
1641
|
|
|
|
|
|
|
} |
1642
|
|
|
|
|
|
|
elsif (exists($nfa2_nfa1_indexes{$next_index2})) { |
1643
|
|
|
|
|
|
|
# $nfa2_nfa1_indexes{$next_index2} != $next_index1 |
1644
|
|
|
|
|
|
|
# because |
1645
|
|
|
|
|
|
|
# - !exists($nfa1_nfa2_indexes{$next_index1}) |
1646
|
|
|
|
|
|
|
# - $nfa1_nfa2_indexes and $nfa2_nfa1_indexes |
1647
|
|
|
|
|
|
|
# are reverse to each other by construction |
1648
|
0
|
|
|
|
|
0
|
return 0; |
1649
|
|
|
|
|
|
|
} |
1650
|
|
|
|
|
|
|
else { |
1651
|
484
|
|
|
|
|
565
|
$nfa1_nfa2_indexes{$next_index1} = $next_index2; |
1652
|
484
|
|
|
|
|
455
|
$nfa2_nfa1_indexes{$next_index2} = $next_index1; |
1653
|
484
|
|
|
|
|
862
|
push(@nfa1_index_todo, $next_index1); |
1654
|
|
|
|
|
|
|
} |
1655
|
|
|
|
|
|
|
} |
1656
|
|
|
|
|
|
|
} |
1657
|
99
|
|
|
|
|
573
|
return 1; |
1658
|
|
|
|
|
|
|
} |
1659
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
############################################################################## |
1662
|
|
|
|
|
|
|
# $dfa |
1663
|
|
|
|
|
|
|
############################################################################## |
1664
|
|
|
|
|
|
|
|
1665
|
|
|
|
|
|
|
# input X: |
1666
|
|
|
|
|
|
|
# Arbitrary list of intervals. |
1667
|
|
|
|
|
|
|
# output Y: |
1668
|
|
|
|
|
|
|
# List of pairwise disjoint intervals spanning the same subset such that |
1669
|
|
|
|
|
|
|
# for any intersections/unions of intervals of X |
1670
|
|
|
|
|
|
|
# an equal union of intervals of Y exists. |
1671
|
|
|
|
|
|
|
# In short, all boundaries of X are preserved. |
1672
|
|
|
|
|
|
|
# |
1673
|
|
|
|
|
|
|
# Motivation: |
1674
|
|
|
|
|
|
|
# nfas use character classes as alphabet (instead of single code points). |
1675
|
|
|
|
|
|
|
# dfa operations needs a common refinement of sets of character classes. |
1676
|
|
|
|
|
|
|
# |
1677
|
|
|
|
|
|
|
# Example: |
1678
|
|
|
|
|
|
|
# interval_cases( [ [0, 5], [2, 8] ] ) |
1679
|
|
|
|
|
|
|
# = [ [0, 1], [2, 5], [6, 8] ] |
1680
|
|
|
|
|
|
|
# |
1681
|
|
|
|
|
|
|
# X: |0 1 2 3 4 5| |
1682
|
|
|
|
|
|
|
# |2 3 4 5 6 7 8| |
1683
|
|
|
|
|
|
|
# Y: |0 1|2 3 4 5|6 7 8| |
1684
|
|
|
|
|
|
|
# |
1685
|
|
|
|
|
|
|
sub interval_cases { |
1686
|
1525
|
|
|
1525
|
0
|
1342
|
my ($interval_list) = @_; |
1687
|
|
|
|
|
|
|
my @sorted |
1688
|
45323
|
50
|
|
|
|
61168
|
= sort { |
1689
|
1525
|
|
|
|
|
2612
|
$$a[0] <=> $$b[0] |
1690
|
|
|
|
|
|
|
|| $$b[1] <=> $$a[1] |
1691
|
|
|
|
|
|
|
} |
1692
|
|
|
|
|
|
|
@$interval_list |
1693
|
|
|
|
|
|
|
; |
1694
|
1525
|
|
|
|
|
1267
|
my %los; |
1695
|
|
|
|
|
|
|
my %his; |
1696
|
1525
|
|
|
|
|
1331
|
my $i = 0; |
1697
|
1525
|
|
|
|
|
2497
|
while ($i < @sorted) { |
1698
|
4757
|
|
|
|
|
5938
|
$los{$sorted[$i][0]} = undef; |
1699
|
4757
|
|
|
|
|
4409
|
$his{$sorted[$i][1]} = undef; |
1700
|
4757
|
|
|
|
|
4062
|
my $j = $i + 1; |
1701
|
4757
|
|
100
|
|
|
17235
|
while ( |
|
|
|
100
|
|
|
|
|
1702
|
|
|
|
|
|
|
$j < @sorted |
1703
|
|
|
|
|
|
|
&& $sorted[$j][0] == $sorted[$i][0] |
1704
|
|
|
|
|
|
|
&& $sorted[$j][1] == $sorted[$i][1] |
1705
|
|
|
|
|
|
|
) { |
1706
|
|
|
|
|
|
|
# $sorted[$i] --------- |
1707
|
|
|
|
|
|
|
# $sorted[$j] --------- |
1708
|
2418
|
|
|
|
|
8733
|
$j++; |
1709
|
|
|
|
|
|
|
} |
1710
|
4757
|
|
100
|
|
|
14267
|
while ( |
|
|
|
66
|
|
|
|
|
1711
|
|
|
|
|
|
|
$j < @sorted |
1712
|
|
|
|
|
|
|
&& $sorted[$j][0] == $sorted[$i][0] |
1713
|
|
|
|
|
|
|
&& $sorted[$j][1] < $sorted[$i][1] |
1714
|
|
|
|
|
|
|
) { |
1715
|
|
|
|
|
|
|
# $sorted[$i] --------- |
1716
|
|
|
|
|
|
|
# $sorted[$j] ----- |
1717
|
1712
|
|
|
|
|
1695
|
$his{$sorted[$j][1]} = undef; |
1718
|
1712
|
|
|
|
|
1749
|
$los{$sorted[$j][1]+1} = undef; |
1719
|
1712
|
|
|
|
|
6933
|
$j++; |
1720
|
|
|
|
|
|
|
} |
1721
|
|
|
|
|
|
|
# $sorted[$j][0] > $sorted[$i][0] |
1722
|
4757
|
|
100
|
|
|
12853
|
while ( |
1723
|
|
|
|
|
|
|
$j < @sorted |
1724
|
|
|
|
|
|
|
&& $sorted[$j][1] < $sorted[$i][1] |
1725
|
|
|
|
|
|
|
) { |
1726
|
|
|
|
|
|
|
# $sorted[$i] --------- |
1727
|
|
|
|
|
|
|
# $sorted[$j] ----- |
1728
|
3625
|
|
|
|
|
3549
|
$his{$sorted[$j][0]-1} = undef; |
1729
|
3625
|
|
|
|
|
3229
|
$los{$sorted[$j][0]} = undef; |
1730
|
3625
|
|
|
|
|
3067
|
$his{$sorted[$j][1]} = undef; |
1731
|
3625
|
|
|
|
|
3279
|
$los{$sorted[$j][1]+1} = undef; |
1732
|
3625
|
|
|
|
|
10220
|
$j++; |
1733
|
|
|
|
|
|
|
} |
1734
|
4757
|
100
|
100
|
|
|
12280
|
if ( |
1735
|
|
|
|
|
|
|
$j < @sorted |
1736
|
|
|
|
|
|
|
&& $sorted[$j][0] <= $sorted[$i][1] |
1737
|
|
|
|
|
|
|
) { |
1738
|
|
|
|
|
|
|
# $sorted[$j][0] > $sorted[$i][0] |
1739
|
|
|
|
|
|
|
# && $sorted[$j][0] <= $sorted[$i][1] |
1740
|
|
|
|
|
|
|
# && $sorted[$j][1] >= $sorted[$i][1] |
1741
|
|
|
|
|
|
|
# |
1742
|
|
|
|
|
|
|
# $sorted[$i] --------- |
1743
|
|
|
|
|
|
|
# $sorted[$j] ----- |
1744
|
446
|
|
|
|
|
519
|
$his{$sorted[$j][0]-1} = undef; |
1745
|
446
|
50
|
|
|
|
815
|
if ($sorted[$i][1] != $sorted[$j][1]) { |
1746
|
0
|
|
|
|
|
0
|
$los{$sorted[$i][1]+1} = undef; |
1747
|
|
|
|
|
|
|
} |
1748
|
|
|
|
|
|
|
} |
1749
|
4757
|
|
|
|
|
7248
|
$i = $j; |
1750
|
|
|
|
|
|
|
} |
1751
|
1525
|
|
|
|
|
3055
|
my @sorted_los = sort( { $a <=> $b } keys(%los)); |
|
8876
|
|
|
|
|
8007
|
|
1752
|
1525
|
|
|
|
|
2597
|
my @sorted_his = sort( { $a <=> $b } keys(%his)); |
|
8796
|
|
|
|
|
7602
|
|
1753
|
1525
|
|
|
|
|
2446
|
return [ map { [$sorted_los[$_], $sorted_his[$_]] } (0..$#sorted_los) ]; |
|
5452
|
|
|
|
|
10348
|
|
1754
|
|
|
|
|
|
|
} |
1755
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
=item C |
1757
|
|
|
|
|
|
|
|
1758
|
|
|
|
|
|
|
Compute a deterministic finite automaton from C<$in_nfa> |
1759
|
|
|
|
|
|
|
(powerset construction). |
1760
|
|
|
|
|
|
|
|
1761
|
|
|
|
|
|
|
The data structure of a deterministic finite automaton (dfa) is |
1762
|
|
|
|
|
|
|
the same as that of a non-deterministic one, but it is further constrained: |
1763
|
|
|
|
|
|
|
For each state and each unicode character there exist exactly one transition |
1764
|
|
|
|
|
|
|
(i.e. a pair C<($char_class, $state_index)>) matching this character. |
1765
|
|
|
|
|
|
|
|
1766
|
|
|
|
|
|
|
Note that the following constraint hold for both a C<$dfa> and a C<$nfa>: |
1767
|
|
|
|
|
|
|
For each pair of state p1 and p2, there exists at most one transition |
1768
|
|
|
|
|
|
|
from p1 to p2 (artefact of this implementation). |
1769
|
|
|
|
|
|
|
|
1770
|
|
|
|
|
|
|
=cut |
1771
|
|
|
|
|
|
|
|
1772
|
|
|
|
|
|
|
sub nfa_to_dfa { |
1773
|
207
|
|
|
207
|
1
|
212
|
my ($nfa) = @_; |
1774
|
207
|
|
|
|
|
283
|
my $dfa = []; |
1775
|
207
|
50
|
|
|
|
419
|
if (!@$nfa) { |
1776
|
0
|
|
|
|
|
0
|
return [[0, [$cc_any, 0]]]; |
1777
|
|
|
|
|
|
|
} |
1778
|
207
|
|
|
|
|
217
|
my $trap_needed = 0; |
1779
|
207
|
|
|
|
|
176
|
my $dfa_size = 0; |
1780
|
207
|
|
|
|
|
473
|
my %dfa_indexes = ("0" => $dfa_size++); |
1781
|
207
|
|
|
|
|
365
|
my @todo = ([0]); |
1782
|
207
|
|
|
|
|
392
|
while (@todo) { |
1783
|
1326
|
|
|
|
|
1406
|
my $nfa_indexes = pop(@todo); |
1784
|
1326
|
|
|
|
|
2110
|
my $dfa_index = $dfa_indexes{join('.', @$nfa_indexes)}; |
1785
|
1326
|
|
|
|
|
2499
|
my @nfa_states = @$nfa[@$nfa_indexes]; |
1786
|
|
|
|
|
|
|
|
1787
|
|
|
|
|
|
|
# accepting |
1788
|
1326
|
100
|
|
|
|
1329
|
$$dfa[$dfa_index][0] = scalar(grep { $$_[0] } @nfa_states) ? 1 : 0; |
|
2189
|
|
|
|
|
4226
|
|
1789
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
# transitions |
1791
|
4986
|
|
|
|
|
7097
|
my $cases = interval_cases([ |
1792
|
4986
|
|
|
|
|
3273
|
map { @{$$_[0]} } |
|
2189
|
|
|
|
|
3351
|
|
1793
|
1326
|
|
|
|
|
1302
|
map { @{$$_[1]} } |
|
2189
|
|
|
|
|
1468
|
|
1794
|
|
|
|
|
|
|
@nfa_states |
1795
|
|
|
|
|
|
|
]); |
1796
|
1326
|
|
|
|
|
1944
|
my %dfa_index_to_intervals; |
1797
|
1326
|
|
|
|
|
1689
|
for my $interval (@$cases) { |
1798
|
|
|
|
|
|
|
my @next_nfa_indexes |
1799
|
4335
|
|
|
|
|
3146
|
= sort(keys(%{ { |
|
6234
|
|
|
|
|
15516
|
|
1800
|
28291
|
|
|
|
|
32787
|
map { ($$_[1] => undef) } |
1801
|
7122
|
|
|
|
|
10371
|
grep { cc_match($$_[0], $$interval[0]) } |
1802
|
4335
|
|
|
|
|
3968
|
map { @{$$_[1]} } |
|
7122
|
|
|
|
|
4474
|
|
1803
|
|
|
|
|
|
|
@nfa_states |
1804
|
|
|
|
|
|
|
} })) |
1805
|
|
|
|
|
|
|
; |
1806
|
4335
|
|
|
|
|
8436
|
my $next_index_key = join('.', @next_nfa_indexes); |
1807
|
4335
|
100
|
|
|
|
6497
|
if (!exists($dfa_indexes{$next_index_key})) { |
1808
|
1119
|
|
|
|
|
1444
|
$dfa_indexes{$next_index_key} = $dfa_size++; |
1809
|
1119
|
|
|
|
|
1218
|
push(@todo, \@next_nfa_indexes); |
1810
|
|
|
|
|
|
|
} |
1811
|
4335
|
|
|
|
|
3095
|
push(@{$dfa_index_to_intervals{$dfa_indexes{$next_index_key}}}, |
|
4335
|
|
|
|
|
8549
|
|
1812
|
|
|
|
|
|
|
$interval |
1813
|
|
|
|
|
|
|
); |
1814
|
|
|
|
|
|
|
} |
1815
|
|
|
|
|
|
|
|
1816
|
1326
|
|
|
|
|
1230
|
my @any_ccs; |
1817
|
3831
|
|
|
|
|
4772
|
$$dfa[$dfa_index][1] = [ |
1818
|
|
|
|
|
|
|
map { |
1819
|
1326
|
|
|
|
|
2375
|
my $cc = interval_list_to_cc($dfa_index_to_intervals{$_}); |
1820
|
3831
|
|
|
|
|
3544
|
push(@any_ccs, $cc); |
1821
|
3831
|
|
|
|
|
6003
|
[$cc, $_ ]; |
1822
|
|
|
|
|
|
|
} |
1823
|
|
|
|
|
|
|
sort(keys(%dfa_index_to_intervals)) |
1824
|
|
|
|
|
|
|
]; |
1825
|
1326
|
100
|
|
|
|
2130
|
if ((my $all_cc = cc_union(@any_ccs)) != $cc_any) { |
1826
|
1177
|
|
|
|
|
981
|
$trap_needed = 1; |
1827
|
1177
|
|
|
|
|
879
|
push(@{$$dfa[$dfa_index][1]}, |
|
1177
|
|
|
|
|
2056
|
|
1828
|
|
|
|
|
|
|
[ cc_neg($all_cc), -1 ] |
1829
|
|
|
|
|
|
|
); |
1830
|
|
|
|
|
|
|
} |
1831
|
|
|
|
|
|
|
} |
1832
|
|
|
|
|
|
|
|
1833
|
207
|
100
|
|
|
|
389
|
if ($trap_needed) { |
1834
|
195
|
|
|
|
|
280
|
for ( |
1835
|
4889
|
|
|
|
|
5534
|
grep { $$_[1] == -1 } |
|
1284
|
|
|
|
|
1910
|
|
1836
|
1284
|
|
|
|
|
800
|
map { @{$$_[1]} } |
1837
|
|
|
|
|
|
|
@$dfa |
1838
|
|
|
|
|
|
|
) { |
1839
|
1177
|
|
|
|
|
1075
|
$$_[1] = $dfa_size; |
1840
|
|
|
|
|
|
|
} |
1841
|
195
|
|
|
|
|
540
|
$$dfa[$dfa_size] = [0, [[$cc_any, $dfa_size]]]; |
1842
|
|
|
|
|
|
|
} |
1843
|
|
|
|
|
|
|
|
1844
|
207
|
|
|
|
|
775
|
return $dfa; |
1845
|
|
|
|
|
|
|
} |
1846
|
|
|
|
|
|
|
|
1847
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
=item C |
1849
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
|
1851
|
|
|
|
|
|
|
Computes a minimal deterministic C<$dfa> from the given C<$in_dfa> |
1852
|
|
|
|
|
|
|
(Hopcroft's algorithm). |
1853
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
Note that the given C<$in_dfa> must be a C<$dfa>, as |
1855
|
|
|
|
|
|
|
returned from C, and not a mere C<$nfa>. |
1856
|
|
|
|
|
|
|
|
1857
|
|
|
|
|
|
|
Myhill-Nerode theorem: two minimal dfa accepting |
1858
|
|
|
|
|
|
|
the same language are isomorph (i.e. C returns true). |
1859
|
|
|
|
|
|
|
|
1860
|
|
|
|
|
|
|
=cut |
1861
|
|
|
|
|
|
|
|
1862
|
|
|
|
|
|
|
sub dfa_to_min_dfa { |
1863
|
207
|
|
|
207
|
1
|
337
|
my ($dfa) = @_; |
1864
|
207
|
|
|
|
|
225
|
my @acceptings; |
1865
|
|
|
|
|
|
|
my @non_acceptings; |
1866
|
0
|
|
|
|
|
0
|
my @intervals; |
1867
|
207
|
|
|
|
|
423
|
for my $index (0..$#$dfa) { |
1868
|
1521
|
100
|
|
|
|
1905
|
if ($$dfa[$index][0]) { |
1869
|
275
|
|
|
|
|
311
|
push(@acceptings, $index); |
1870
|
|
|
|
|
|
|
} |
1871
|
|
|
|
|
|
|
else { |
1872
|
1246
|
|
|
|
|
1023
|
push(@non_acceptings, $index); |
1873
|
|
|
|
|
|
|
} |
1874
|
1521
|
|
|
|
|
1007
|
push(@intervals, map { @{$$_[0]} } @{$$dfa[$index][1]}) |
|
5203
|
|
|
|
|
3406
|
|
|
5203
|
|
|
|
|
6818
|
|
|
1521
|
|
|
|
|
1657
|
|
1875
|
|
|
|
|
|
|
} |
1876
|
207
|
|
|
|
|
238
|
my $partition; |
1877
|
207
|
100
|
|
|
|
326
|
if (@non_acceptings) { |
1878
|
199
|
|
|
|
|
416
|
$partition = [\@non_acceptings, \@acceptings]; |
1879
|
199
|
|
|
|
|
764
|
my %todo = (join('.', @non_acceptings) => \@non_acceptings); |
1880
|
199
|
|
|
|
|
337
|
my $cases = interval_cases(\@intervals); |
1881
|
199
|
|
|
|
|
611
|
while (my ($todo_key) = keys(%todo)) { |
1882
|
994
|
|
|
|
|
926
|
my %indexes = map { ($_ => undef) } @{delete($todo{$todo_key})}; |
|
2325
|
|
|
|
|
3626
|
|
|
994
|
|
|
|
|
1736
|
|
1883
|
994
|
|
|
|
|
1489
|
for my $interval (@$cases) { |
1884
|
21234
|
|
|
|
|
27236
|
my %prev_inds = ( |
1885
|
269076
|
|
|
|
|
175312
|
map { ($_ => undef) } |
1886
|
|
|
|
|
|
|
grep { |
1887
|
9909
|
|
|
|
|
18146
|
my $i = $_; |
1888
|
914283
|
100
|
|
|
|
1504678
|
grep { |
1889
|
269076
|
|
|
|
|
274494
|
exists($indexes{$$_[1]}) |
1890
|
|
|
|
|
|
|
&& cc_match($$_[0], $$interval[0]) |
1891
|
|
|
|
|
|
|
} |
1892
|
269076
|
|
|
|
|
170903
|
@{$$dfa[$i][1]} |
1893
|
|
|
|
|
|
|
} |
1894
|
|
|
|
|
|
|
(0..$#$dfa) |
1895
|
|
|
|
|
|
|
); |
1896
|
9909
|
|
|
|
|
13289
|
my $refined_partition; |
1897
|
9909
|
|
|
|
|
9813
|
for my $partition_indexes (@$partition) { |
1898
|
127676
|
|
|
|
|
77475
|
my (@inter, @diff); |
1899
|
127676
|
|
|
|
|
105996
|
for (@$partition_indexes) { |
1900
|
269076
|
100
|
|
|
|
269220
|
if (exists($prev_inds{$_})) { |
1901
|
21234
|
|
|
|
|
23006
|
push(@inter, $_); |
1902
|
|
|
|
|
|
|
} |
1903
|
|
|
|
|
|
|
else { |
1904
|
247842
|
|
|
|
|
258570
|
push(@diff, $_); |
1905
|
|
|
|
|
|
|
} |
1906
|
|
|
|
|
|
|
} |
1907
|
127676
|
100
|
100
|
|
|
217504
|
if (!@inter || !@diff) { |
1908
|
126881
|
|
|
|
|
158392
|
push(@$refined_partition, $partition_indexes); |
1909
|
|
|
|
|
|
|
} |
1910
|
|
|
|
|
|
|
else { |
1911
|
795
|
|
|
|
|
1306
|
push(@$refined_partition, \@inter, \@diff); |
1912
|
795
|
|
|
|
|
3328
|
my $prev_inds_key = join('.', sort(keys(%prev_inds))); |
1913
|
795
|
50
|
|
|
|
2078
|
if ($todo{$prev_inds_key}) { |
|
|
100
|
|
|
|
|
|
1914
|
0
|
|
|
|
|
0
|
delete($todo{$prev_inds_key}); |
1915
|
0
|
|
|
|
|
0
|
$todo{join('.', @diff)} = \@diff; |
1916
|
0
|
|
|
|
|
0
|
$todo{join('.', @inter)} = \@inter; |
1917
|
|
|
|
|
|
|
} |
1918
|
|
|
|
|
|
|
elsif (@diff < @inter) { |
1919
|
174
|
|
|
|
|
479
|
$todo{join('.', @diff)} = \@diff; |
1920
|
|
|
|
|
|
|
} |
1921
|
|
|
|
|
|
|
else { |
1922
|
621
|
|
|
|
|
1541
|
$todo{join('.', @inter)} = \@inter; |
1923
|
|
|
|
|
|
|
} |
1924
|
|
|
|
|
|
|
} |
1925
|
|
|
|
|
|
|
} |
1926
|
9909
|
|
|
|
|
21149
|
$partition = $refined_partition; |
1927
|
|
|
|
|
|
|
} |
1928
|
|
|
|
|
|
|
} |
1929
|
|
|
|
|
|
|
} |
1930
|
|
|
|
|
|
|
else { |
1931
|
8
|
|
|
|
|
15
|
$partition = [\@acceptings]; |
1932
|
|
|
|
|
|
|
} |
1933
|
207
|
|
|
|
|
237
|
my $state_ind_to_equiv; |
1934
|
207
|
|
|
|
|
302
|
for (grep { @$_ != 1 } @$partition) { |
|
1201
|
|
|
|
|
1449
|
|
1935
|
150
|
|
|
|
|
615
|
@$state_ind_to_equiv{@$_[1..$#$_]} = ($$_[0]) x $#$_; |
1936
|
|
|
|
|
|
|
} |
1937
|
207
|
|
|
|
|
487
|
return _nfa_shrink_equiv($dfa, $state_ind_to_equiv); |
1938
|
|
|
|
|
|
|
} |
1939
|
|
|
|
|
|
|
|
1940
|
|
|
|
|
|
|
|
1941
|
|
|
|
|
|
|
############################################################################## |
1942
|
|
|
|
|
|
|
# $tree |
1943
|
|
|
|
|
|
|
############################################################################## |
1944
|
|
|
|
|
|
|
|
1945
|
|
|
|
|
|
|
=back |
1946
|
|
|
|
|
|
|
|
1947
|
|
|
|
|
|
|
=head2 Tree |
1948
|
|
|
|
|
|
|
|
1949
|
|
|
|
|
|
|
$tree = [ $star, [ $alt_0, $alt_1, ... ] ] |
1950
|
|
|
|
|
|
|
or $char_class # ref($char_class) eq CHAR_CLASS |
1951
|
|
|
|
|
|
|
or undef # accepting nothing |
1952
|
|
|
|
|
|
|
$alt = [ $tree_0, $tree_1, ... ] |
1953
|
|
|
|
|
|
|
|
1954
|
|
|
|
|
|
|
A C<$tree> is a hierarchical data structure used as intermediate form for |
1955
|
|
|
|
|
|
|
regular expression generation routines. |
1956
|
|
|
|
|
|
|
|
1957
|
|
|
|
|
|
|
Similar to a parse tree, except that the C<$tree>s described here are not the |
1958
|
|
|
|
|
|
|
direct result of the parsing routines C; indeed, the parsing |
1959
|
|
|
|
|
|
|
routines generate a C<$nfa>, which then can be converted to a C<$tree>. |
1960
|
|
|
|
|
|
|
|
1961
|
|
|
|
|
|
|
A string is spanned by C<$tree = [$star, [ $alt_0, $alt_1, ... ] ]> if it is |
1962
|
|
|
|
|
|
|
spanned by one of the C<$alt_i> (if C<$star> is false) of a repetition thereof |
1963
|
|
|
|
|
|
|
(if C<$star> is true). |
1964
|
|
|
|
|
|
|
|
1965
|
|
|
|
|
|
|
A string is spanned by C<$alt = [ $tree_0, $tree_1, ...]> if it is the |
1966
|
|
|
|
|
|
|
concatenation of C<@substrings>, each C<$substrings[$i]> being spanned by |
1967
|
|
|
|
|
|
|
C<$$alt[$i]>. |
1968
|
|
|
|
|
|
|
|
1969
|
|
|
|
|
|
|
=over 4 |
1970
|
|
|
|
|
|
|
|
1971
|
|
|
|
|
|
|
=item C |
1972
|
|
|
|
|
|
|
|
1973
|
|
|
|
|
|
|
Converts a C<$nfa> to a C<$tree>. |
1974
|
|
|
|
|
|
|
Returns C if the C<$nfa> accepts nothing (not even the empty string). |
1975
|
|
|
|
|
|
|
|
1976
|
|
|
|
|
|
|
=cut |
1977
|
|
|
|
|
|
|
|
1978
|
|
|
|
|
|
|
sub nfa_to_tree { |
1979
|
131
|
|
|
131
|
1
|
183
|
my ($nfa) = @_; |
1980
|
|
|
|
|
|
|
|
1981
|
|
|
|
|
|
|
# Warshall algorithm (Kleene's theorem) |
1982
|
|
|
|
|
|
|
# with preliminary computations: |
1983
|
|
|
|
|
|
|
# - words-paths (unbranched paths) are shrunken |
1984
|
|
|
|
|
|
|
# - unique accepting state is ensured |
1985
|
|
|
|
|
|
|
# - branches (with single parent) are skipped |
1986
|
|
|
|
|
|
|
|
1987
|
131
|
|
|
|
|
184
|
my $path = {}; |
1988
|
131
|
|
|
|
|
159
|
my $path_tr = {}; |
1989
|
131
|
|
|
|
|
133
|
my %accepting_state_inds; |
1990
|
|
|
|
|
|
|
|
1991
|
|
|
|
|
|
|
# Initialization of the paths |
1992
|
|
|
|
|
|
|
|
1993
|
131
|
|
|
|
|
272
|
for my $i (0..$#$nfa) { |
1994
|
923
|
100
|
|
|
|
1440
|
if ($$nfa[$i][0]) { |
1995
|
153
|
|
|
|
|
273
|
$accepting_state_inds{$i} = $i; |
1996
|
|
|
|
|
|
|
} |
1997
|
923
|
|
|
|
|
633
|
for (@{$$nfa[$i][1]}) { |
|
923
|
|
|
|
|
1175
|
|
1998
|
1362
|
|
|
|
|
3841
|
$$path{$i}{$$_[1]} |
1999
|
|
|
|
|
|
|
= $$path_tr{$$_[1]}{$i} |
2000
|
|
|
|
|
|
|
= $$_[0]; |
2001
|
|
|
|
|
|
|
} |
2002
|
|
|
|
|
|
|
} |
2003
|
|
|
|
|
|
|
|
2004
|
131
|
|
|
|
|
144
|
if (TRACE_NFA_TO_TREE) { |
2005
|
|
|
|
|
|
|
print STDERR "before word shrink\n"; |
2006
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%$path))) { |
2007
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) { |
2008
|
|
|
|
|
|
|
print STDERR "$i $j: " . cc_to_regex($$path{$i}{$j}) . "\n"; |
2009
|
|
|
|
|
|
|
}} |
2010
|
|
|
|
|
|
|
} |
2011
|
|
|
|
|
|
|
|
2012
|
131
|
|
|
|
|
186
|
my @tree_list; |
2013
|
|
|
|
|
|
|
my @state_ind_path; |
2014
|
|
|
|
|
|
|
|
2015
|
|
|
|
|
|
|
# word-paths (unbranched paths) are shrunken |
2016
|
131
|
|
|
|
|
220
|
for my $first (0..$#$nfa) { |
2017
|
923
|
100
|
|
|
|
1427
|
if (!exists($$path{$first})) { next; } |
|
411
|
|
|
|
|
343
|
|
2018
|
|
|
|
|
|
|
my @todo |
2019
|
452
|
|
|
|
|
508
|
= sort { |
2020
|
452
|
50
|
|
|
|
328
|
keys(%{$$path_tr{$b}}) <=> keys(%{$$path_tr{$a}}) |
|
452
|
|
|
|
|
1223
|
|
|
956
|
|
|
|
|
1614
|
|
2021
|
|
|
|
|
|
|
|| $b <=> $a |
2022
|
|
|
|
|
|
|
} |
2023
|
512
|
|
|
|
|
920
|
grep { $_ != $first } |
2024
|
512
|
|
|
|
|
402
|
keys(%{$$path{$first}}) |
2025
|
|
|
|
|
|
|
; |
2026
|
512
|
|
|
|
|
522
|
my %todo_ctrl; |
2027
|
512
|
|
|
|
|
389
|
my $todo_sorted = 1; |
2028
|
512
|
|
100
|
|
|
1289
|
while ( |
|
|
|
66
|
|
|
|
|
2029
|
|
|
|
|
|
|
@todo |
2030
|
|
|
|
|
|
|
&& ( |
2031
|
|
|
|
|
|
|
!$todo_sorted |
2032
|
|
|
|
|
|
|
|| keys(%{$$path_tr{$todo[-1]}}) == 1 |
2033
|
|
|
|
|
|
|
) |
2034
|
|
|
|
|
|
|
) { |
2035
|
461
|
|
|
|
|
596
|
$todo_ctrl{my $i = pop(@todo)} = undef; |
2036
|
461
|
100
|
|
|
|
367
|
if (keys(%{$$path_tr{$i}}) != 1) { |
|
461
|
|
|
|
|
832
|
|
2037
|
165
|
100
|
33
|
|
|
839
|
if ($i != $first && !$todo_sorted && @todo) { |
|
|
|
66
|
|
|
|
|
2038
|
|
|
|
|
|
|
@todo |
2039
|
87
|
|
|
|
|
104
|
= sort { |
2040
|
87
|
50
|
|
|
|
71
|
keys(%{$$path_tr{$b}}) <=> keys(%{$$path_tr{$a}}) |
|
87
|
|
|
|
|
219
|
|
|
162
|
|
|
|
|
343
|
|
2041
|
|
|
|
|
|
|
|| $b <=> $a |
2042
|
|
|
|
|
|
|
} |
2043
|
69
|
|
|
|
|
77
|
keys(%{ { map { ($_ => undef) } (@todo, $i) } }) |
|
69
|
|
|
|
|
83
|
|
2044
|
|
|
|
|
|
|
; |
2045
|
69
|
|
|
|
|
138
|
$todo_sorted = 1; |
2046
|
|
|
|
|
|
|
} |
2047
|
165
|
|
|
|
|
449
|
next; |
2048
|
|
|
|
|
|
|
} |
2049
|
296
|
|
|
|
|
262
|
$todo_sorted = 0; |
2050
|
|
|
|
|
|
|
|
2051
|
296
|
|
|
|
|
420
|
my @tree_list = ($$path{$first}{$i}); |
2052
|
296
|
|
|
|
|
309
|
my @state_ind_path = ($i); |
2053
|
|
|
|
|
|
|
|
2054
|
296
|
|
66
|
|
|
214
|
while ( |
2055
|
595
|
|
|
|
|
1229
|
keys(%{$$path{$i}}) == 1 |
|
406
|
|
|
|
|
1104
|
|
2056
|
|
|
|
|
|
|
&& (my $j = (keys(%{$$path{$i}}))[0]) != $first |
2057
|
|
|
|
|
|
|
) { |
2058
|
406
|
|
|
|
|
429
|
push(@tree_list, $$path{$i}{$j}); |
2059
|
406
|
|
|
|
|
380
|
push(@state_ind_path, $i = $j); |
2060
|
406
|
100
|
|
|
|
243
|
if (keys(%{$$path_tr{$j}}) != 1) { |
|
406
|
|
|
|
|
823
|
|
2061
|
107
|
|
|
|
|
116
|
last; |
2062
|
|
|
|
|
|
|
} |
2063
|
|
|
|
|
|
|
} |
2064
|
|
|
|
|
|
|
|
2065
|
296
|
|
|
|
|
293
|
if (TRACE_NFA_TO_TREE) { |
2066
|
|
|
|
|
|
|
print STDERR "first, state_ind_path: $first, @state_ind_path\n"; |
2067
|
|
|
|
|
|
|
} |
2068
|
|
|
|
|
|
|
|
2069
|
296
|
100
|
|
|
|
795
|
if (@state_ind_path > 1) { |
2070
|
|
|
|
|
|
|
|
2071
|
137
|
|
|
|
|
116
|
if (TRACE_NFA_TO_TREE) { |
2072
|
|
|
|
|
|
|
print STDERR "delete head $first -> $state_ind_path[0]\n"; |
2073
|
|
|
|
|
|
|
} |
2074
|
137
|
|
|
|
|
229
|
delete($$path{$first}{$state_ind_path[0]}); |
2075
|
137
|
|
|
|
|
313
|
for (@state_ind_path[0..$#state_ind_path-1]) { |
2076
|
406
|
|
|
|
|
432
|
delete($$path{$_}); |
2077
|
406
|
|
|
|
|
370
|
delete($$path_tr{$_}); |
2078
|
406
|
|
|
|
|
380
|
if (TRACE_NFA_TO_TREE) { |
2079
|
|
|
|
|
|
|
print STDERR "delete path $_ -> *\n"; |
2080
|
|
|
|
|
|
|
print STDERR "delete path * <- $_\n"; |
2081
|
|
|
|
|
|
|
} |
2082
|
|
|
|
|
|
|
} |
2083
|
137
|
|
|
|
|
189
|
delete($$path_tr{$state_ind_path[-1]}{$state_ind_path[-2]}); |
2084
|
137
|
100
|
|
|
|
236
|
if (!exists($todo_ctrl{$state_ind_path[-1]})) { |
2085
|
122
|
|
|
|
|
140
|
$todo_ctrl{$state_ind_path[-1]} = undef; |
2086
|
122
|
|
|
|
|
131
|
push(@todo, $state_ind_path[-1]); |
2087
|
|
|
|
|
|
|
} |
2088
|
137
|
|
|
|
|
96
|
if (TRACE_NFA_TO_TREE) { |
2089
|
|
|
|
|
|
|
print STDERR "delete tail $state_ind_path[-1] <- $state_ind_path[-2]\n"; |
2090
|
|
|
|
|
|
|
} |
2091
|
|
|
|
|
|
|
|
2092
|
|
|
|
|
|
|
|
2093
|
|
|
|
|
|
|
# $first -> $last |
2094
|
137
|
|
|
|
|
139
|
my $last = $state_ind_path[-1]; |
2095
|
137
|
100
|
|
|
|
399
|
$$path{$first}{$last} |
2096
|
|
|
|
|
|
|
= $$path_tr{$last}{$first} |
2097
|
|
|
|
|
|
|
= exists($$path{$first}{$last}) |
2098
|
|
|
|
|
|
|
? tree_alt( |
2099
|
|
|
|
|
|
|
$$path{$first}{$last} |
2100
|
|
|
|
|
|
|
, tree_concat(@tree_list) |
2101
|
|
|
|
|
|
|
) |
2102
|
|
|
|
|
|
|
: tree_concat(@tree_list) |
2103
|
|
|
|
|
|
|
; |
2104
|
|
|
|
|
|
|
|
2105
|
137
|
|
|
|
|
193
|
if (TRACE_NFA_TO_TREE) { |
2106
|
|
|
|
|
|
|
print STDERR |
2107
|
|
|
|
|
|
|
"$first -> $last created (first ->last): " |
2108
|
|
|
|
|
|
|
. join('', map {_tree_to_regex($_)} @tree_list) . "\n"; |
2109
|
|
|
|
|
|
|
} |
2110
|
|
|
|
|
|
|
|
2111
|
137
|
|
|
|
|
227
|
for (0..$#state_ind_path-1) { |
2112
|
|
|
|
|
|
|
|
2113
|
|
|
|
|
|
|
# $first -> accepting |
2114
|
406
|
100
|
|
|
|
1159
|
if ($accepting_state_inds{ |
2115
|
|
|
|
|
|
|
my $state_ind = $state_ind_path[$_] |
2116
|
|
|
|
|
|
|
}) { |
2117
|
30
|
50
|
|
|
|
85
|
$$path{$first}{$state_ind} |
2118
|
|
|
|
|
|
|
= $$path_tr{$state_ind}{$first} |
2119
|
|
|
|
|
|
|
= exists($$path{$first}{$state_ind}) |
2120
|
|
|
|
|
|
|
? tree_alt( |
2121
|
|
|
|
|
|
|
$$path{$first}{$state_ind} |
2122
|
|
|
|
|
|
|
, tree_concat(@tree_list[0..$_]) |
2123
|
|
|
|
|
|
|
) |
2124
|
|
|
|
|
|
|
: tree_concat(@tree_list[0..$_]) |
2125
|
|
|
|
|
|
|
; |
2126
|
30
|
|
|
|
|
133
|
if (TRACE_NFA_TO_TREE) { |
2127
|
|
|
|
|
|
|
print STDERR |
2128
|
|
|
|
|
|
|
"$first -> $state_ind created (first -> accepting): " |
2129
|
|
|
|
|
|
|
. join('', map {_tree_to_regex($_)} @tree_list[0..$_]) . "\n"; |
2130
|
|
|
|
|
|
|
} |
2131
|
|
|
|
|
|
|
} |
2132
|
|
|
|
|
|
|
} |
2133
|
|
|
|
|
|
|
} |
2134
|
|
|
|
|
|
|
} |
2135
|
|
|
|
|
|
|
} |
2136
|
|
|
|
|
|
|
|
2137
|
131
|
|
|
|
|
140
|
if (TRACE_NFA_TO_TREE) { |
2138
|
|
|
|
|
|
|
print STDERR "after word shrink\n"; |
2139
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%$path))) { |
2140
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) { |
2141
|
|
|
|
|
|
|
print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n"; |
2142
|
|
|
|
|
|
|
}} |
2143
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%$path_tr))) { |
2144
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) { |
2145
|
|
|
|
|
|
|
print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n"; |
2146
|
|
|
|
|
|
|
}} |
2147
|
|
|
|
|
|
|
} |
2148
|
|
|
|
|
|
|
|
2149
|
|
|
|
|
|
|
# unique accepting state is ensured |
2150
|
|
|
|
|
|
|
# (pseudo-unique: the initial state may additionally be accepting) |
2151
|
131
|
|
|
|
|
194
|
my $unique_accepting_state_ind = @$nfa; |
2152
|
131
|
100
|
100
|
|
|
319
|
if ( |
|
|
100
|
|
|
|
|
|
2153
|
|
|
|
|
|
|
keys(%accepting_state_inds) == 1 |
2154
|
|
|
|
|
|
|
) { |
2155
|
114
|
|
|
|
|
252
|
$unique_accepting_state_ind = (keys(%accepting_state_inds))[0]; |
2156
|
|
|
|
|
|
|
} |
2157
|
|
|
|
|
|
|
elsif ( |
2158
|
|
|
|
|
|
|
keys(%accepting_state_inds) == 2 |
2159
|
|
|
|
|
|
|
&& exists($accepting_state_inds{0}) |
2160
|
|
|
|
|
|
|
) { |
2161
|
6
|
|
|
|
|
8
|
$unique_accepting_state_ind |
2162
|
3
|
|
|
|
|
7
|
= (grep {$_} keys(%accepting_state_inds))[0]; |
2163
|
|
|
|
|
|
|
} |
2164
|
|
|
|
|
|
|
else { |
2165
|
14
|
|
|
|
|
24
|
$unique_accepting_state_ind = @$nfa; |
2166
|
14
|
|
|
|
|
32
|
for my $to_state_ind (keys(%accepting_state_inds)) { |
2167
|
33
|
|
|
|
|
30
|
for my $from_state_ind (keys(%{$$path_tr{$to_state_ind}})) { |
|
33
|
|
|
|
|
51
|
|
2168
|
58
|
|
|
|
|
119
|
push( |
2169
|
58
|
|
|
|
|
41
|
@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}} |
2170
|
|
|
|
|
|
|
, $$path_tr{$to_state_ind}{$from_state_ind} |
2171
|
|
|
|
|
|
|
); |
2172
|
|
|
|
|
|
|
} |
2173
|
|
|
|
|
|
|
} |
2174
|
14
|
|
|
|
|
18
|
for my $from_state_ind ( |
|
14
|
|
|
|
|
39
|
|
2175
|
|
|
|
|
|
|
keys(%{$$path_tr{$unique_accepting_state_ind}}) |
2176
|
|
|
|
|
|
|
) { |
2177
|
50
|
|
|
|
|
92
|
$$path_tr{$unique_accepting_state_ind}{$from_state_ind} |
2178
|
|
|
|
|
|
|
= $$path{$from_state_ind}{$unique_accepting_state_ind} |
2179
|
|
|
|
|
|
|
= tree_alt( |
2180
|
50
|
|
|
|
|
44
|
@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}} |
2181
|
|
|
|
|
|
|
); |
2182
|
|
|
|
|
|
|
} |
2183
|
|
|
|
|
|
|
} |
2184
|
|
|
|
|
|
|
|
2185
|
131
|
|
|
|
|
121
|
if (TRACE_NFA_TO_TREE) { |
2186
|
|
|
|
|
|
|
print STDERR "after unique state addition\n"; |
2187
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%$path))) { |
2188
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) { |
2189
|
|
|
|
|
|
|
print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n"; |
2190
|
|
|
|
|
|
|
}} |
2191
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%$path_tr))) { |
2192
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) { |
2193
|
|
|
|
|
|
|
print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n"; |
2194
|
|
|
|
|
|
|
}} |
2195
|
|
|
|
|
|
|
} |
2196
|
|
|
|
|
|
|
|
2197
|
131
|
|
|
|
|
223
|
for my $reversed (0, 1) { |
2198
|
262
|
100
|
|
|
|
524
|
my ($tmp_path, $tmp_path_tr) |
2199
|
|
|
|
|
|
|
= $reversed |
2200
|
|
|
|
|
|
|
? ($path_tr, $path) |
2201
|
|
|
|
|
|
|
: ($path, $path_tr) |
2202
|
|
|
|
|
|
|
; |
2203
|
|
|
|
|
|
|
|
2204
|
|
|
|
|
|
|
# branches (with single parent) are skipped |
2205
|
|
|
|
|
|
|
my @branch_inds |
2206
|
480
|
|
|
|
|
510
|
= $reversed |
2207
|
781
|
|
|
|
|
811
|
? sort {$a <=> $b} (keys(%$tmp_path)) |
2208
|
262
|
100
|
|
|
|
837
|
: sort {$b <=> $a} (keys(%$tmp_path)) |
2209
|
|
|
|
|
|
|
; |
2210
|
262
|
|
|
|
|
483
|
while (@branch_inds) { |
2211
|
1150
|
|
|
|
|
996
|
my $branch = pop(@branch_inds); |
2212
|
1150
|
100
|
100
|
|
|
4847
|
if ( |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
2213
|
642
|
|
|
|
|
1616
|
!exists($$tmp_path{$branch}) |
2214
|
|
|
|
|
|
|
# root cannot be un-branched |
2215
|
|
|
|
|
|
|
|| $branch == 0 |
2216
|
|
|
|
|
|
|
# accepting states cannot be un-branched |
2217
|
|
|
|
|
|
|
|| $branch == $unique_accepting_state_ind |
2218
|
|
|
|
|
|
|
# single parent (non-root have one or more parents) |
2219
|
|
|
|
|
|
|
|| keys(%{$$tmp_path_tr{$branch}}) != 1 |
2220
|
|
|
|
|
|
|
) { |
2221
|
881
|
|
|
|
|
1400
|
next; |
2222
|
|
|
|
|
|
|
} |
2223
|
|
|
|
|
|
|
|
2224
|
269
|
|
|
|
|
209
|
if (TRACE_NFA_TO_TREE) { |
2225
|
|
|
|
|
|
|
print STDERR "branch at $branch\n"; |
2226
|
|
|
|
|
|
|
} |
2227
|
269
|
|
|
|
|
220
|
my ($parent) = keys(%{$$tmp_path_tr{$branch}}); # single parent |
|
269
|
|
|
|
|
387
|
|
2228
|
269
|
100
|
66
|
|
|
793
|
if ( |
|
|
|
66
|
|
|
|
|
2229
|
|
|
|
|
|
|
ref($$tmp_path{$parent}{$branch}) ne CHAR_CLASS |
2230
|
|
|
|
|
|
|
&& ( |
2231
|
|
|
|
|
|
|
# starified parent |
2232
|
|
|
|
|
|
|
$$tmp_path{$parent}{$branch}[0] |
2233
|
|
|
|
|
|
|
# parent containing several paths |
2234
|
|
|
|
|
|
|
|| @{$$tmp_path{$parent}{$branch}[1]} > 1 |
2235
|
|
|
|
|
|
|
) |
2236
|
|
|
|
|
|
|
) { |
2237
|
35
|
|
|
|
|
67
|
next; |
2238
|
|
|
|
|
|
|
} |
2239
|
|
|
|
|
|
|
|
2240
|
234
|
|
|
|
|
204
|
my (@children) = keys(%{$$tmp_path{$branch}}); |
|
234
|
|
|
|
|
453
|
|
2241
|
|
|
|
|
|
|
|
2242
|
234
|
|
|
|
|
290
|
for my $child (@children) { |
2243
|
533
|
100
|
|
|
|
1804
|
$$tmp_path{$parent}{$child} |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
2244
|
|
|
|
|
|
|
= $$tmp_path_tr{$child}{$parent} |
2245
|
|
|
|
|
|
|
= exists($$tmp_path{$parent}{$child}) |
2246
|
|
|
|
|
|
|
? tree_alt( |
2247
|
|
|
|
|
|
|
$$tmp_path{$parent}{$child} |
2248
|
|
|
|
|
|
|
, tree_concat2( |
2249
|
|
|
|
|
|
|
$reversed |
2250
|
|
|
|
|
|
|
? ( |
2251
|
|
|
|
|
|
|
$$tmp_path{$branch}{$child} |
2252
|
|
|
|
|
|
|
, $$tmp_path{$parent}{$branch} |
2253
|
|
|
|
|
|
|
) |
2254
|
|
|
|
|
|
|
: ( |
2255
|
|
|
|
|
|
|
$$tmp_path{$parent}{$branch} |
2256
|
|
|
|
|
|
|
, $$tmp_path{$branch}{$child} |
2257
|
|
|
|
|
|
|
) |
2258
|
|
|
|
|
|
|
) |
2259
|
|
|
|
|
|
|
) |
2260
|
|
|
|
|
|
|
: tree_concat2( |
2261
|
|
|
|
|
|
|
$reversed |
2262
|
|
|
|
|
|
|
? ( |
2263
|
|
|
|
|
|
|
$$tmp_path{$branch}{$child} |
2264
|
|
|
|
|
|
|
, $$tmp_path{$parent}{$branch} |
2265
|
|
|
|
|
|
|
) |
2266
|
|
|
|
|
|
|
: ( |
2267
|
|
|
|
|
|
|
$$tmp_path{$parent}{$branch} |
2268
|
|
|
|
|
|
|
, $$tmp_path{$branch}{$child} |
2269
|
|
|
|
|
|
|
) |
2270
|
|
|
|
|
|
|
) |
2271
|
|
|
|
|
|
|
; |
2272
|
533
|
|
|
|
|
849
|
delete($$tmp_path_tr{$child}{$branch}); |
2273
|
|
|
|
|
|
|
|
2274
|
533
|
|
|
|
|
568
|
if (TRACE_NFA_TO_TREE) { |
2275
|
|
|
|
|
|
|
print STDERR |
2276
|
|
|
|
|
|
|
"parent -> branch: " |
2277
|
|
|
|
|
|
|
. tree_dump($$tmp_path{$parent}{$branch}) . "\n"; |
2278
|
|
|
|
|
|
|
print STDERR |
2279
|
|
|
|
|
|
|
"branch -> child : " |
2280
|
|
|
|
|
|
|
. tree_dump($$tmp_path{$branch}{$child}) . "\n"; |
2281
|
|
|
|
|
|
|
print STDERR |
2282
|
|
|
|
|
|
|
"$parent -> $child created (un-branch): " |
2283
|
|
|
|
|
|
|
. tree_dump($$tmp_path{$parent}{$child}) |
2284
|
|
|
|
|
|
|
. ($reversed ? " (reversed)" : "" ) . "\n"; |
2285
|
|
|
|
|
|
|
print STDERR |
2286
|
|
|
|
|
|
|
"delete $child <- $branch\n"; |
2287
|
|
|
|
|
|
|
} |
2288
|
|
|
|
|
|
|
|
2289
|
|
|
|
|
|
|
} |
2290
|
234
|
|
|
|
|
305
|
delete($$tmp_path{$parent}{$branch}); |
2291
|
234
|
|
|
|
|
454
|
delete($$tmp_path{$branch}); |
2292
|
234
|
|
|
|
|
327
|
delete($$tmp_path_tr{$branch}); |
2293
|
|
|
|
|
|
|
|
2294
|
234
|
|
|
|
|
193
|
if (TRACE_NFA_TO_TREE) { |
2295
|
|
|
|
|
|
|
print STDERR "delete $parent -> $branch\n"; |
2296
|
|
|
|
|
|
|
print STDERR "delete $branch -> *\n"; |
2297
|
|
|
|
|
|
|
print STDERR "delete $branch <- *\n"; |
2298
|
|
|
|
|
|
|
} |
2299
|
|
|
|
|
|
|
|
2300
|
234
|
|
|
|
|
578
|
push(@branch_inds, $parent); |
2301
|
|
|
|
|
|
|
} |
2302
|
|
|
|
|
|
|
|
2303
|
262
|
|
|
|
|
400
|
if (TRACE_NFA_TO_TREE) { |
2304
|
|
|
|
|
|
|
print STDERR "after branch skip\n"; |
2305
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%$tmp_path))) { |
2306
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%{$$tmp_path{$i}}))) { |
2307
|
|
|
|
|
|
|
if ($reversed) { |
2308
|
|
|
|
|
|
|
print STDERR "$j $i: " . tree_dump($$tmp_path{$i}{$j}) . "\n"; |
2309
|
|
|
|
|
|
|
} |
2310
|
|
|
|
|
|
|
else { |
2311
|
|
|
|
|
|
|
print STDERR "$i $j: " . tree_dump($$tmp_path{$i}{$j}) . "\n"; |
2312
|
|
|
|
|
|
|
} |
2313
|
|
|
|
|
|
|
}} |
2314
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%$tmp_path_tr))) { |
2315
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%{$$tmp_path_tr{$j}}))) { |
2316
|
|
|
|
|
|
|
print STDERR |
2317
|
|
|
|
|
|
|
($reversed ? "$i <- $j: " : "$j <- $i:") |
2318
|
|
|
|
|
|
|
. tree_dump($$tmp_path_tr{$j}{$i}) . "\n"; |
2319
|
|
|
|
|
|
|
}} |
2320
|
|
|
|
|
|
|
} |
2321
|
|
|
|
|
|
|
|
2322
|
|
|
|
|
|
|
} |
2323
|
|
|
|
|
|
|
|
2324
|
|
|
|
|
|
|
|
2325
|
|
|
|
|
|
|
# starify diagonal |
2326
|
131
|
|
|
|
|
335
|
for (grep { exists($$path{$_}{$_}) } keys(%$path)) { |
|
300
|
|
|
|
|
511
|
|
2327
|
128
|
|
|
|
|
288
|
$$path{$_}{$_} |
2328
|
|
|
|
|
|
|
= $$path_tr{$_}{$_} |
2329
|
|
|
|
|
|
|
= tree_starify($$path{$_}{$_}); |
2330
|
|
|
|
|
|
|
} |
2331
|
|
|
|
|
|
|
|
2332
|
131
|
|
|
|
|
137
|
if (TRACE_NFA_TO_TREE) { |
2333
|
|
|
|
|
|
|
print STDERR "after diagonal starification\n"; |
2334
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%$path))) { |
2335
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) { |
2336
|
|
|
|
|
|
|
print STDERR "$i $j: "; |
2337
|
|
|
|
|
|
|
print STDERR tree_dump($$path{$i}{$j}) . "\n"; |
2338
|
|
|
|
|
|
|
}} |
2339
|
|
|
|
|
|
|
} |
2340
|
|
|
|
|
|
|
|
2341
|
|
|
|
|
|
|
# Warshall algorithm (Kleene's theorem) |
2342
|
131
|
|
|
|
|
131
|
my %updates; |
2343
|
300
|
|
|
|
|
269
|
my %weight = map { |
2344
|
131
|
|
|
|
|
203
|
my $w = 0; |
2345
|
300
|
|
|
|
|
212
|
for (values(%{$$path{$_}})) { $w += _tree_weight($_) } |
|
300
|
|
|
|
|
488
|
|
|
474
|
|
|
|
|
621
|
|
2346
|
300
|
|
|
|
|
534
|
($_ => $w); |
2347
|
|
|
|
|
|
|
} keys(%$path); |
2348
|
131
|
50
|
|
|
|
364
|
my @ks = sort { $weight{$a} <=> $weight{$b} || $a <=> $b } keys(%$path); |
|
302
|
|
|
|
|
607
|
|
2349
|
|
|
|
|
|
|
# note that keys(%$path_tr) are not additionally needed |
2350
|
|
|
|
|
|
|
# case i == k && k == j: nothing to do |
2351
|
|
|
|
|
|
|
# case i != k && k != j: $$path{$k}{$j} must exist |
2352
|
|
|
|
|
|
|
# case i == k && k != j: $$path{$k}{$k} must exist |
2353
|
|
|
|
|
|
|
# case i != k && k == j: $$path{$k}{$k} must exist |
2354
|
131
|
|
|
|
|
202
|
for my $k (@ks) { |
2355
|
300
|
|
|
|
|
265
|
for my $i (keys(%{$$path_tr{$k}})) { # i -> k |
|
300
|
|
|
|
|
545
|
|
2356
|
481
|
|
|
|
|
403
|
for my $j (keys(%{$$path{$k}})) { # k -> j |
|
481
|
|
|
|
|
793
|
|
2357
|
1336
|
100
|
100
|
|
|
3097
|
if ($i == $k && $k == $j) { next; } |
|
139
|
|
|
|
|
243
|
|
2358
|
1197
|
|
|
|
|
841
|
my @trees; |
2359
|
1197
|
100
|
100
|
|
|
4402
|
if ( |
|
|
|
66
|
|
|
|
|
2360
|
|
|
|
|
|
|
exists($$path{$i}{$j}) |
2361
|
|
|
|
|
|
|
&& ($i != $k && $k != $j) |
2362
|
|
|
|
|
|
|
) { |
2363
|
533
|
|
|
|
|
616
|
push(@trees, $$path{$i}{$j}); |
2364
|
|
|
|
|
|
|
} |
2365
|
1197
|
100
|
|
|
|
3462
|
my $new_tree |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
2366
|
|
|
|
|
|
|
= exists($$path{$k}{$k}) |
2367
|
|
|
|
|
|
|
? tree_concat( |
2368
|
|
|
|
|
|
|
( |
2369
|
|
|
|
|
|
|
$i != $k |
2370
|
|
|
|
|
|
|
? $$path{$i}{$k} |
2371
|
|
|
|
|
|
|
: () |
2372
|
|
|
|
|
|
|
) |
2373
|
|
|
|
|
|
|
, $$path{$k}{$k} |
2374
|
|
|
|
|
|
|
, ( |
2375
|
|
|
|
|
|
|
$k != $j |
2376
|
|
|
|
|
|
|
? $$path{$k}{$j} |
2377
|
|
|
|
|
|
|
: () |
2378
|
|
|
|
|
|
|
) |
2379
|
|
|
|
|
|
|
) |
2380
|
|
|
|
|
|
|
: tree_concat2($$path{$i}{$k}, $$path{$k}{$j}) |
2381
|
|
|
|
|
|
|
; |
2382
|
1197
|
100
|
|
|
|
1808
|
push(@trees, $i == $j ? tree_starify($new_tree) : $new_tree); |
2383
|
|
|
|
|
|
|
|
2384
|
1197
|
100
|
|
|
|
1454
|
if (@trees == 1) { |
2385
|
664
|
|
|
|
|
1470
|
$updates{$i}{$j} = $trees[0]; |
2386
|
|
|
|
|
|
|
} |
2387
|
|
|
|
|
|
|
else { |
2388
|
533
|
|
|
|
|
667
|
$updates{$i}{$j} = tree_alt(@trees); |
2389
|
|
|
|
|
|
|
} |
2390
|
|
|
|
|
|
|
} |
2391
|
|
|
|
|
|
|
} |
2392
|
300
|
|
|
|
|
541
|
for my $i (keys(%updates)) { |
2393
|
407
|
|
|
|
|
298
|
for my $j (keys(%{$updates{$i}})) { |
|
407
|
|
|
|
|
744
|
|
2394
|
1197
|
|
|
|
|
2046
|
$$path{$i}{$j} = $$path_tr{$j}{$i} = $updates{$i}{$j}; |
2395
|
|
|
|
|
|
|
} |
2396
|
|
|
|
|
|
|
} |
2397
|
|
|
|
|
|
|
|
2398
|
300
|
|
|
|
|
259
|
if (TRACE_NFA_TO_TREE) { |
2399
|
|
|
|
|
|
|
my $num_of_updates = map {keys(%{$updates{$_}})} keys(%updates); |
2400
|
|
|
|
|
|
|
print STDERR "k = $k ($num_of_updates updates)\n"; |
2401
|
|
|
|
|
|
|
if ($num_of_updates) { |
2402
|
|
|
|
|
|
|
for my $i (sort {$a <=> $b} (keys(%$path))) { |
2403
|
|
|
|
|
|
|
for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) { |
2404
|
|
|
|
|
|
|
print STDERR "$i $j: "; |
2405
|
|
|
|
|
|
|
print STDERR tree_dump($$path{$i}{$j}) . "\n"; |
2406
|
|
|
|
|
|
|
}} |
2407
|
|
|
|
|
|
|
} |
2408
|
|
|
|
|
|
|
} |
2409
|
|
|
|
|
|
|
|
2410
|
300
|
|
|
|
|
603
|
%updates = (); |
2411
|
|
|
|
|
|
|
} |
2412
|
|
|
|
|
|
|
|
2413
|
131
|
|
|
|
|
154
|
my $tree; |
2414
|
|
|
|
|
|
|
|
2415
|
|
|
|
|
|
|
# accepting empty init |
2416
|
131
|
100
|
|
|
|
284
|
if ($$nfa[0][0]) { |
2417
|
|
|
|
|
|
|
|
2418
|
53
|
100
|
|
|
|
142
|
my $path_0_0 = exists($$path{0}{0}) ? $$path{0}{0} : $cc_none; |
2419
|
|
|
|
|
|
|
|
2420
|
53
|
100
|
|
|
|
93
|
if ($unique_accepting_state_ind == 0) { |
2421
|
48
|
|
|
|
|
53
|
$tree = $path_0_0; |
2422
|
|
|
|
|
|
|
} |
2423
|
|
|
|
|
|
|
else { |
2424
|
5
|
|
|
|
|
9
|
my $path_0_end = $$path{0}{$unique_accepting_state_ind}; |
2425
|
|
|
|
|
|
|
|
2426
|
5
|
50
|
100
|
|
|
32
|
if ( |
|
|
|
66
|
|
|
|
|
2427
|
|
|
|
|
|
|
$path_0_0 == $cc_none |
2428
|
|
|
|
|
|
|
&& ref($path_0_end) ne CHAR_CLASS |
2429
|
|
|
|
|
|
|
&& $$path_0_end[0] |
2430
|
|
|
|
|
|
|
) { |
2431
|
|
|
|
|
|
|
# starified expression e* does not need (|e*) |
2432
|
0
|
|
|
|
|
0
|
$tree = $path_0_end; |
2433
|
|
|
|
|
|
|
} |
2434
|
|
|
|
|
|
|
else { |
2435
|
|
|
|
|
|
|
# non-starified expression e needs (|e) |
2436
|
5
|
|
|
|
|
7
|
$tree = tree_alt($path_0_0, $path_0_end); |
2437
|
|
|
|
|
|
|
} |
2438
|
|
|
|
|
|
|
} |
2439
|
|
|
|
|
|
|
} |
2440
|
|
|
|
|
|
|
else { |
2441
|
78
|
|
|
|
|
158
|
$tree = $$path{0}{$unique_accepting_state_ind}; |
2442
|
|
|
|
|
|
|
} |
2443
|
|
|
|
|
|
|
|
2444
|
131
|
|
|
|
|
107
|
if (TRACE_NFA_TO_TREE) { |
2445
|
|
|
|
|
|
|
print STDERR "tree: " . tree_dump($tree) . "\n"; |
2446
|
|
|
|
|
|
|
} |
2447
|
|
|
|
|
|
|
|
2448
|
131
|
|
|
|
|
275
|
_tree_factorize_fixes($tree); |
2449
|
|
|
|
|
|
|
|
2450
|
131
|
|
|
|
|
146
|
if (TRACE_NFA_TO_TREE) { |
2451
|
|
|
|
|
|
|
print STDERR "tree (after factorization): " . tree_dump($tree) . "\n"; |
2452
|
|
|
|
|
|
|
} |
2453
|
131
|
|
|
|
|
1124
|
return $tree; |
2454
|
|
|
|
|
|
|
} |
2455
|
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
|
2457
|
|
|
|
|
|
|
# Recursively (bottom up) factorizes prefixes and suffixes out from |
2458
|
|
|
|
|
|
|
# alternations if at least one of them contains a sub-tree. |
2459
|
|
|
|
|
|
|
# |
2460
|
|
|
|
|
|
|
# Example 1: (ab1cd|ab2cd|ab3*cd) -> ab(1|2|3*)cd |
2461
|
|
|
|
|
|
|
# Example 2: (ab1cd|ab2cd|ab3cd) remains the same (no sub-tree) |
2462
|
|
|
|
|
|
|
# |
2463
|
|
|
|
|
|
|
# Example 2 does not need to be factorized |
2464
|
|
|
|
|
|
|
# because it can be represented by a drop-down list, |
2465
|
|
|
|
|
|
|
# which is the primary purpose of this module; |
2466
|
|
|
|
|
|
|
# in this case, a factorization may lead to counter-intuitive results, |
2467
|
|
|
|
|
|
|
# like words cut in the middle. |
2468
|
|
|
|
|
|
|
# |
2469
|
|
|
|
|
|
|
# But example 1 (less common) could only be represented as mere free-text |
2470
|
|
|
|
|
|
|
# if the common pre- and suf-fixes were not factorized out, |
2471
|
|
|
|
|
|
|
# thus loosing information for the input helper (xxx_to_input_constraints). |
2472
|
|
|
|
|
|
|
# |
2473
|
|
|
|
|
|
|
# This behavior can be changed by setting our $FULL_FACTORIZE_FIXES = 1; |
2474
|
|
|
|
|
|
|
# in this case, Example 2 would produce ab(1|2|3)cd. |
2475
|
|
|
|
|
|
|
# |
2476
|
|
|
|
|
|
|
# Modifies $tree in place |
2477
|
|
|
|
|
|
|
# |
2478
|
|
|
|
|
|
|
sub _tree_factorize_fixes { |
2479
|
1199
|
|
|
1199
|
|
969
|
my ($tree) = @_; |
2480
|
1199
|
100
|
100
|
|
|
4262
|
if ( |
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
2481
|
628
|
|
|
|
|
2277
|
!defined($tree) |
2482
|
|
|
|
|
|
|
|| ref($tree) eq CHAR_CLASS |
2483
|
|
|
|
|
|
|
|| @{$$tree[1]} == 0 |
2484
|
|
|
|
|
|
|
|| !$FULL_FACTORIZE_FIXES |
2485
|
|
|
|
|
|
|
&& ( |
2486
|
|
|
|
|
|
|
@{$$tree[1]} == 1 |
2487
|
|
|
|
|
|
|
|| !grep { ref($_) ne CHAR_CLASS } map { @$_ } @{$$tree[1]} |
2488
|
|
|
|
|
|
|
) |
2489
|
|
|
|
|
|
|
) { |
2490
|
954
|
|
|
|
|
1318
|
return $tree; |
2491
|
|
|
|
|
|
|
} |
2492
|
|
|
|
|
|
|
else { |
2493
|
245
|
|
|
|
|
206
|
for (grep { grep { ref($_) ne CHAR_CLASS } @$_ } @{$$tree[1]} ) { |
|
564
|
|
|
|
|
507
|
|
|
1503
|
|
|
|
|
1762
|
|
|
245
|
|
|
|
|
310
|
|
2494
|
1068
|
|
|
|
|
1304
|
my $tmp_tree = |
2495
|
320
|
|
|
|
|
332
|
tree_concat(map { _tree_factorize_fixes($_) } @$_) |
2496
|
|
|
|
|
|
|
; |
2497
|
320
|
100
|
66
|
|
|
1177
|
if ( |
|
|
|
66
|
|
|
|
|
2498
|
319
|
|
|
|
|
689
|
ref($tmp_tree) eq CHAR_CLASS |
2499
|
|
|
|
|
|
|
|| $$tmp_tree[0] |
2500
|
|
|
|
|
|
|
|| @{$$tmp_tree[1]} > 1 |
2501
|
|
|
|
|
|
|
) { |
2502
|
1
|
|
|
|
|
3
|
$_ = [$tmp_tree]; |
2503
|
|
|
|
|
|
|
} |
2504
|
|
|
|
|
|
|
else { |
2505
|
319
|
|
|
|
|
560
|
$_ = $$tmp_tree[1][0]; |
2506
|
|
|
|
|
|
|
} |
2507
|
|
|
|
|
|
|
} |
2508
|
|
|
|
|
|
|
|
2509
|
|
|
|
|
|
|
# flatten |
2510
|
245
|
|
|
|
|
460
|
@{$$tree[1]} = map { |
2511
|
245
|
|
|
|
|
289
|
[map { |
2512
|
1598
|
|
|
|
|
1662
|
ref($_) ne CHAR_CLASS |
2513
|
1
|
|
|
|
|
3
|
&& !$$_[0] && @{$$_[1]} == 1 |
2514
|
|
|
|
|
|
|
# non-starified with single alternation |
2515
|
1598
|
100
|
100
|
|
|
4477
|
? @{$$_[1][0]} |
2516
|
|
|
|
|
|
|
: $_ |
2517
|
564
|
|
|
|
|
543
|
} grep { defined($_) } @$_] |
2518
|
245
|
|
|
|
|
236
|
} @{$$tree[1]}; |
2519
|
|
|
|
|
|
|
|
2520
|
245
|
100
|
|
|
|
212
|
if (@{$$tree[1]} == 1) { |
|
245
|
|
|
|
|
462
|
|
2521
|
4
|
|
|
|
|
9
|
return $tree; |
2522
|
|
|
|
|
|
|
} |
2523
|
|
|
|
|
|
|
|
2524
|
241
|
|
|
|
|
189
|
my $fst_len = @{$$tree[1][0]}; |
|
241
|
|
|
|
|
279
|
|
2525
|
241
|
|
|
|
|
230
|
my ($pre_len, $suf_len) = (0, 0); |
2526
|
241
|
|
|
|
|
255
|
for (1, 0) { |
2527
|
392
|
|
|
|
|
482
|
my ($len_ref, @range) |
2528
|
|
|
|
|
|
|
= $_ |
2529
|
|
|
|
|
|
|
? (\$pre_len, (0..$fst_len-1)) |
2530
|
482
|
100
|
|
|
|
893
|
: (\$suf_len, map {-$_} (1..$fst_len-$pre_len)) |
2531
|
|
|
|
|
|
|
; |
2532
|
482
|
|
|
|
|
504
|
for my $i (@range) { |
2533
|
532
|
100
|
|
|
|
451
|
if ( |
2534
|
713
|
100
|
100
|
|
|
3705
|
grep { |
2535
|
532
|
|
|
|
|
545
|
$i >= @$_ |
2536
|
|
|
|
|
|
|
|| ref($$_[$i]) ne CHAR_CLASS |
2537
|
|
|
|
|
|
|
|| $$tree[1][0][$i] != $$_[$i] |
2538
|
|
|
|
|
|
|
} |
2539
|
532
|
|
|
|
|
581
|
@{$$tree[1]}[1..$#{$$tree[1]}] |
2540
|
|
|
|
|
|
|
) { |
2541
|
415
|
|
|
|
|
629
|
last; |
2542
|
|
|
|
|
|
|
} |
2543
|
117
|
|
|
|
|
179
|
$$len_ref++; |
2544
|
|
|
|
|
|
|
} |
2545
|
|
|
|
|
|
|
} |
2546
|
241
|
100
|
100
|
|
|
810
|
if ($pre_len == 0 && $suf_len == 0) { |
2547
|
172
|
|
|
|
|
330
|
return $tree; |
2548
|
|
|
|
|
|
|
} |
2549
|
|
|
|
|
|
|
|
2550
|
69
|
|
|
|
|
78
|
my $empty_seen = 0; |
2551
|
|
|
|
|
|
|
my $mid_tree = [ |
2552
|
|
|
|
|
|
|
0 |
2553
|
|
|
|
|
|
|
, [ |
2554
|
|
|
|
|
|
|
map { |
2555
|
157
|
100
|
|
|
|
279
|
if ($pre_len <= $#$_ - $suf_len) { |
|
69
|
50
|
|
|
|
115
|
|
2556
|
104
|
|
|
|
|
257
|
[ @$_[$pre_len..$#$_-$suf_len] ]; |
2557
|
|
|
|
|
|
|
} |
2558
|
|
|
|
|
|
|
elsif (!$empty_seen++) { |
2559
|
53
|
|
|
|
|
77
|
[]; |
2560
|
|
|
|
|
|
|
} |
2561
|
|
|
|
|
|
|
else { |
2562
|
0
|
|
|
|
|
0
|
(); |
2563
|
|
|
|
|
|
|
} |
2564
|
|
|
|
|
|
|
} |
2565
|
69
|
|
|
|
|
95
|
@{$$tree[1]} |
2566
|
|
|
|
|
|
|
] |
2567
|
|
|
|
|
|
|
]; |
2568
|
69
|
|
|
|
|
98
|
$$tree[1] = [[ |
2569
|
69
|
|
|
|
|
165
|
@{$$tree[1][0]}[0..$pre_len-1] |
2570
|
69
|
|
|
|
|
160
|
, $empty_seen == @{$$tree[1]} ? () : $mid_tree |
2571
|
69
|
50
|
|
|
|
124
|
, @{$$tree[1][0]}[$fst_len-$suf_len..$fst_len-1] |
2572
|
|
|
|
|
|
|
]]; |
2573
|
69
|
|
|
|
|
172
|
return $tree; |
2574
|
|
|
|
|
|
|
} |
2575
|
|
|
|
|
|
|
} |
2576
|
|
|
|
|
|
|
|
2577
|
|
|
|
|
|
|
=item C |
2578
|
|
|
|
|
|
|
|
2579
|
|
|
|
|
|
|
Converts a C<$tree> to an C<$ere> (if C<$to_perlre> is false) |
2580
|
|
|
|
|
|
|
or to a C<$perlre> (if C<$to_perlre> is true). |
2581
|
|
|
|
|
|
|
|
2582
|
|
|
|
|
|
|
=cut |
2583
|
|
|
|
|
|
|
|
2584
|
|
|
|
|
|
|
sub tree_to_regex { |
2585
|
119
|
100
|
|
119
|
1
|
316
|
my $re = defined($_[0]) ? &_tree_to_regex : '$.'; |
2586
|
119
|
100
|
|
|
|
1662
|
return $_[1] ? qr/\A$re\z/ms : "^$re\$"; |
2587
|
|
|
|
|
|
|
} |
2588
|
|
|
|
|
|
|
|
2589
|
|
|
|
|
|
|
{ |
2590
|
|
|
|
|
|
|
my %cc_to_regex_cache; |
2591
|
|
|
|
|
|
|
|
2592
|
|
|
|
|
|
|
sub _tree_to_regex { |
2593
|
930
|
|
|
930
|
|
996
|
my ($tree, $to_perlre) = (@_, 0); |
2594
|
930
|
100
|
100
|
|
|
1259
|
if (ref($tree) eq CHAR_CLASS) { |
|
906
|
50
|
|
|
|
1265
|
|
|
|
100
|
|
|
|
|
|
2595
|
|
|
|
|
|
|
return |
2596
|
24
|
|
100
|
|
|
108
|
$cc_to_regex_cache{$tree.$to_perlre} |
2597
|
|
|
|
|
|
|
||= cc_to_regex($tree, $to_perlre) |
2598
|
|
|
|
|
|
|
; |
2599
|
|
|
|
|
|
|
} |
2600
|
906
|
|
|
|
|
1739
|
elsif (@{$$tree[1]} == 0) { |
2601
|
0
|
|
|
|
|
0
|
return ''; |
2602
|
|
|
|
|
|
|
} |
2603
|
|
|
|
|
|
|
elsif ( |
2604
|
487
|
|
|
|
|
1155
|
@{$$tree[1]} == 1 # single alteration |
2605
|
|
|
|
|
|
|
&& @{$$tree[1][0]} == 1 # single atom |
2606
|
|
|
|
|
|
|
) { |
2607
|
312
|
|
|
|
|
333
|
my $atom = $$tree[1][0][0]; |
2608
|
312
|
50
|
|
|
|
416
|
if (ref($atom) eq CHAR_CLASS) { |
2609
|
312
|
100
|
100
|
|
|
1374
|
return join('', |
2610
|
|
|
|
|
|
|
$cc_to_regex_cache{$atom.$to_perlre} |
2611
|
|
|
|
|
|
|
||= cc_to_regex($atom, $to_perlre) |
2612
|
|
|
|
|
|
|
, $$tree[0] ? '*' : () |
2613
|
|
|
|
|
|
|
); |
2614
|
|
|
|
|
|
|
} |
2615
|
|
|
|
|
|
|
else { |
2616
|
0
|
|
0
|
|
|
0
|
return _tree_to_regex( |
2617
|
|
|
|
|
|
|
[$$tree[0] || $$atom[0], $$atom[1]] |
2618
|
|
|
|
|
|
|
, $to_perlre |
2619
|
|
|
|
|
|
|
); |
2620
|
|
|
|
|
|
|
} |
2621
|
|
|
|
|
|
|
} |
2622
|
|
|
|
|
|
|
else { |
2623
|
|
|
|
|
|
|
my $needs_parenthesis |
2624
|
|
|
|
|
|
|
= @{$$tree[1]} > 1 # (a|...) |
2625
|
594
|
|
66
|
|
|
421
|
|| $$tree[0] && @{$$tree[1][0]} > 1 # (ab...)* |
2626
|
|
|
|
|
|
|
; |
2627
|
|
|
|
|
|
|
|
2628
|
2878
|
100
|
100
|
|
|
8682
|
return join('' |
2629
|
|
|
|
|
|
|
, ($needs_parenthesis ? ($to_perlre ? '(?:' : '(') : ()) |
2630
|
|
|
|
|
|
|
, ( |
2631
|
|
|
|
|
|
|
join('|', |
2632
|
|
|
|
|
|
|
map { |
2633
|
594
|
|
|
|
|
635
|
join('', |
2634
|
|
|
|
|
|
|
map { |
2635
|
1127
|
|
|
|
|
1161
|
ref($_) eq CHAR_CLASS |
2636
|
|
|
|
|
|
|
? $cc_to_regex_cache{$_.$to_perlre} |
2637
|
|
|
|
|
|
|
||= cc_to_regex($_, $to_perlre) |
2638
|
|
|
|
|
|
|
: _tree_to_regex($_, $to_perlre) |
2639
|
|
|
|
|
|
|
} |
2640
|
|
|
|
|
|
|
@$_ # alternation |
2641
|
|
|
|
|
|
|
) |
2642
|
|
|
|
|
|
|
} |
2643
|
594
|
100
|
|
|
|
984
|
@{$$tree[1]} |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
2644
|
|
|
|
|
|
|
) |
2645
|
|
|
|
|
|
|
) |
2646
|
|
|
|
|
|
|
, ($needs_parenthesis ? ')' : ()) |
2647
|
|
|
|
|
|
|
, ($$tree[0] ? '*' : ()) |
2648
|
|
|
|
|
|
|
); |
2649
|
|
|
|
|
|
|
} |
2650
|
|
|
|
|
|
|
} |
2651
|
|
|
|
|
|
|
} |
2652
|
|
|
|
|
|
|
|
2653
|
|
|
|
|
|
|
# starification (regex)* |
2654
|
|
|
|
|
|
|
sub tree_starify { |
2655
|
285
|
|
|
285
|
0
|
286
|
my ($tree) = @_; |
2656
|
285
|
100
|
|
|
|
456
|
if (ref($tree) eq CHAR_CLASS) { |
2657
|
96
|
|
|
|
|
314
|
return [1, [[$tree]]]; |
2658
|
|
|
|
|
|
|
} |
2659
|
|
|
|
|
|
|
else { |
2660
|
189
|
|
|
|
|
354
|
return [1, $$tree[1]]; |
2661
|
|
|
|
|
|
|
} |
2662
|
|
|
|
|
|
|
} |
2663
|
|
|
|
|
|
|
|
2664
|
|
|
|
|
|
|
# The behavior of tree_concat2 can be altered |
2665
|
|
|
|
|
|
|
# by setting $TREE_CONCAT_FULL_EXPAND = 1; |
2666
|
|
|
|
|
|
|
sub tree_concat2 { |
2667
|
2995
|
|
|
2995
|
0
|
8407
|
my ($tree_0, $tree_1) = @_; |
2668
|
2995
|
|
|
|
|
2094
|
my $concat; |
2669
|
|
|
|
|
|
|
|
2670
|
|
|
|
|
|
|
# main criteria: |
2671
|
|
|
|
|
|
|
# CHAR_CLASS |
2672
|
|
|
|
|
|
|
# @{$$tree_n[1]} == 0 |
2673
|
|
|
|
|
|
|
# $$tree_n[0] |
2674
|
|
|
|
|
|
|
# @{$$tree_n[1]} == 1 |
2675
|
|
|
|
|
|
|
|
2676
|
2995
|
100
|
|
|
|
3732
|
if (ref($tree_0) eq CHAR_CLASS) { |
|
2155
|
100
|
|
|
|
4170
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
2677
|
840
|
100
|
|
|
|
1592
|
if (@$tree_0 == 0) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
2678
|
5
|
100
|
100
|
|
|
14
|
if ( |
2679
|
3
|
|
|
|
|
12
|
ref($tree_1) ne CHAR_CLASS |
2680
|
|
|
|
|
|
|
&& @{$$tree_1[1]} == 0 |
2681
|
|
|
|
|
|
|
) { |
2682
|
|
|
|
|
|
|
# () -> empty |
2683
|
1
|
|
|
|
|
1
|
$concat = $cc_none; |
2684
|
|
|
|
|
|
|
} |
2685
|
|
|
|
|
|
|
else { |
2686
|
|
|
|
|
|
|
# -> |
2687
|
4
|
|
|
|
|
5
|
$concat = $tree_1; |
2688
|
|
|
|
|
|
|
} |
2689
|
|
|
|
|
|
|
} |
2690
|
509
|
|
|
|
|
1075
|
elsif (ref($tree_1) eq CHAR_CLASS) { |
2691
|
326
|
100
|
|
|
|
425
|
if (@$tree_1 == 0) { |
2692
|
|
|
|
|
|
|
# a -> a |
2693
|
1
|
|
|
|
|
2
|
$concat = $tree_0; |
2694
|
|
|
|
|
|
|
} |
2695
|
|
|
|
|
|
|
else { |
2696
|
|
|
|
|
|
|
# a b -> (ab) |
2697
|
325
|
|
|
|
|
608
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2698
|
|
|
|
|
|
|
} |
2699
|
|
|
|
|
|
|
} |
2700
|
|
|
|
|
|
|
elsif (@{$$tree_1[1]} == 0) { |
2701
|
|
|
|
|
|
|
# a () -> a |
2702
|
1
|
|
|
|
|
2
|
$concat = $tree_0; |
2703
|
|
|
|
|
|
|
} |
2704
|
|
|
|
|
|
|
elsif ($$tree_1[0]) { |
2705
|
|
|
|
|
|
|
# a (b)* -> (a(b)*) |
2706
|
373
|
|
|
|
|
754
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2707
|
|
|
|
|
|
|
} |
2708
|
|
|
|
|
|
|
else { |
2709
|
135
|
100
|
100
|
|
|
266
|
if ( |
2710
|
519
|
100
|
|
|
|
1128
|
$FULL_FACTORIZE_FIXES |
2711
|
236
|
|
|
|
|
319
|
|| grep { ref($_) ne CHAR_CLASS && $$_[0] } |
2712
|
133
|
|
|
|
|
171
|
map {@$_} @{$$tree_1[1]} |
2713
|
|
|
|
|
|
|
) { |
2714
|
|
|
|
|
|
|
# a (bc|de) -> (a(bc|de)) |
2715
|
|
|
|
|
|
|
# one of bcde is starified |
2716
|
42
|
|
|
|
|
80
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2717
|
|
|
|
|
|
|
} |
2718
|
|
|
|
|
|
|
else { |
2719
|
|
|
|
|
|
|
# a (bc|de) -> (abc|ade) |
2720
|
|
|
|
|
|
|
# none of bcde is starified |
2721
|
161
|
|
|
|
|
373
|
$concat = [ |
2722
|
|
|
|
|
|
|
0 |
2723
|
93
|
|
|
|
|
99
|
, [ map { [ $tree_0, @$_ ] } @{$$tree_1[1]} ] |
|
93
|
|
|
|
|
136
|
|
2724
|
|
|
|
|
|
|
]; |
2725
|
|
|
|
|
|
|
} |
2726
|
|
|
|
|
|
|
} |
2727
|
|
|
|
|
|
|
} |
2728
|
|
|
|
|
|
|
elsif (@{$$tree_0[1]} == 0) { |
2729
|
5
|
100
|
100
|
|
|
14
|
if ( |
2730
|
3
|
|
|
|
|
9
|
ref($tree_1) ne CHAR_CLASS |
2731
|
|
|
|
|
|
|
&& @{$$tree_1[1]} == 0 |
2732
|
|
|
|
|
|
|
) { |
2733
|
|
|
|
|
|
|
# () () -> empty |
2734
|
1
|
|
|
|
|
2
|
$concat = $cc_none; |
2735
|
|
|
|
|
|
|
} |
2736
|
|
|
|
|
|
|
else { |
2737
|
|
|
|
|
|
|
# () -> |
2738
|
4
|
|
|
|
|
2
|
$concat = $tree_1; |
2739
|
|
|
|
|
|
|
} |
2740
|
|
|
|
|
|
|
} |
2741
|
1884
|
|
|
|
|
2394
|
elsif ($$tree_0[0]) { |
2742
|
266
|
100
|
|
|
|
392
|
if (ref($tree_1) eq CHAR_CLASS) { |
|
176
|
100
|
|
|
|
379
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
2743
|
90
|
100
|
|
|
|
144
|
if (@$tree_1 == 0) { |
2744
|
|
|
|
|
|
|
# (a)* -> (a)* |
2745
|
1
|
|
|
|
|
2
|
$concat = $tree_0; |
2746
|
|
|
|
|
|
|
} |
2747
|
|
|
|
|
|
|
else { |
2748
|
|
|
|
|
|
|
# (a)* b -> ((a)*b) |
2749
|
89
|
|
|
|
|
222
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2750
|
|
|
|
|
|
|
} |
2751
|
|
|
|
|
|
|
} |
2752
|
|
|
|
|
|
|
elsif (@{$$tree_1[1]} == 0) { |
2753
|
|
|
|
|
|
|
# (a)* () -> (a)* |
2754
|
1
|
|
|
|
|
2
|
$concat = $tree_0; |
2755
|
|
|
|
|
|
|
} |
2756
|
174
|
|
|
|
|
268
|
elsif ($$tree_1[0]) { |
2757
|
|
|
|
|
|
|
# (a)* (b)* -> ((a)*(b)*) |
2758
|
1
|
|
|
|
|
2
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2759
|
|
|
|
|
|
|
} |
2760
|
|
|
|
|
|
|
elsif (@{$$tree_1[1]} == 1) { |
2761
|
|
|
|
|
|
|
# (a)* (bcd) -> ((a)*bcd) |
2762
|
100
|
|
|
|
|
228
|
$concat = [ |
2763
|
|
|
|
|
|
|
0 |
2764
|
100
|
|
|
|
|
158
|
, [[ $tree_0, @{$$tree_1[1][0]} ]] |
2765
|
|
|
|
|
|
|
]; |
2766
|
|
|
|
|
|
|
} |
2767
|
|
|
|
|
|
|
else { |
2768
|
|
|
|
|
|
|
# (a)* (b|c) -> ((a)*(b|c)) |
2769
|
74
|
|
|
|
|
149
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2770
|
|
|
|
|
|
|
} |
2771
|
|
|
|
|
|
|
} |
2772
|
|
|
|
|
|
|
elsif (@{$$tree_0[1]} == 1) { |
2773
|
1473
|
100
|
|
|
|
1716
|
if (ref($tree_1) eq CHAR_CLASS) { |
|
1075
|
100
|
|
|
|
1809
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
2774
|
398
|
100
|
|
|
|
495
|
if (@$tree_1 == 0) { |
2775
|
|
|
|
|
|
|
# (ab) -> (ab) |
2776
|
1
|
|
|
|
|
2
|
$concat = $tree_0; |
2777
|
|
|
|
|
|
|
} |
2778
|
|
|
|
|
|
|
else { |
2779
|
|
|
|
|
|
|
# (ab) c -> (abc) |
2780
|
397
|
|
|
|
|
872
|
$concat = [ |
2781
|
|
|
|
|
|
|
0 |
2782
|
397
|
|
|
|
|
312
|
, [[ @{$$tree_0[1][0]}, $tree_1 ]] |
2783
|
|
|
|
|
|
|
]; |
2784
|
|
|
|
|
|
|
} |
2785
|
|
|
|
|
|
|
} |
2786
|
|
|
|
|
|
|
elsif (@{$$tree_1[1]} == 0) { |
2787
|
|
|
|
|
|
|
# (ab) () -> (ab) |
2788
|
1
|
|
|
|
|
2
|
$concat = $tree_0; |
2789
|
|
|
|
|
|
|
} |
2790
|
658
|
|
|
|
|
968
|
elsif ($$tree_1[0]) { |
2791
|
|
|
|
|
|
|
# (ab) (c)* -> (ab(c)*) |
2792
|
416
|
|
|
|
|
360
|
$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]]; |
|
416
|
|
|
|
|
871
|
|
2793
|
|
|
|
|
|
|
} |
2794
|
1092
|
|
|
|
|
1458
|
elsif (@{$$tree_1[1]} == 1) { |
2795
|
|
|
|
|
|
|
# (ab) (cd) -> (abcd) |
2796
|
298
|
|
|
|
|
344
|
$concat = [ |
2797
|
|
|
|
|
|
|
0 |
2798
|
298
|
|
|
|
|
276
|
, [[ @{$$tree_0[1][0]}, @{$$tree_1[1][0]} ]] |
|
298
|
|
|
|
|
716
|
|
2799
|
|
|
|
|
|
|
]; |
2800
|
|
|
|
|
|
|
} |
2801
|
|
|
|
|
|
|
elsif ( |
2802
|
360
|
|
|
|
|
431
|
!grep { ref($_) ne CHAR_CLASS } @{$$tree_0[1][0]} |
2803
|
|
|
|
|
|
|
) { |
2804
|
16
|
100
|
66
|
|
|
41
|
if ( |
2805
|
217
|
100
|
|
|
|
351
|
$FULL_FACTORIZE_FIXES |
2806
|
48
|
|
|
|
|
62
|
|| grep { ref($_) ne CHAR_CLASS && $$_[0] } |
2807
|
16
|
|
|
|
|
22
|
map {@$_} @{$$tree_1[1]} |
2808
|
|
|
|
|
|
|
) { |
2809
|
|
|
|
|
|
|
# (ab) (cd|ef) -> (ab(cd|ef)) |
2810
|
|
|
|
|
|
|
# neither a nor b is a tree |
2811
|
|
|
|
|
|
|
# one of cdef is starified |
2812
|
5
|
|
|
|
|
7
|
$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]]; |
|
5
|
|
|
|
|
11
|
|
2813
|
|
|
|
|
|
|
} |
2814
|
|
|
|
|
|
|
else { |
2815
|
|
|
|
|
|
|
# (ab) (cd|ef) -> (abcd|abef) |
2816
|
|
|
|
|
|
|
# neither a nor b is a tree |
2817
|
|
|
|
|
|
|
# none of cdef is starified |
2818
|
34
|
|
|
|
|
79
|
$concat = [ |
2819
|
|
|
|
|
|
|
0 |
2820
|
11
|
|
|
|
|
15
|
, [ map { [ @{$$tree_0[1][0]}, @$_ ] } @{$$tree_1[1]} ] |
|
34
|
|
|
|
|
25
|
|
|
11
|
|
|
|
|
18
|
|
2821
|
|
|
|
|
|
|
]; |
2822
|
|
|
|
|
|
|
} |
2823
|
|
|
|
|
|
|
} |
2824
|
|
|
|
|
|
|
else { |
2825
|
|
|
|
|
|
|
# (ab) (cd|ef) -> (ab(cd|ef)) |
2826
|
|
|
|
|
|
|
# a or b is a tree |
2827
|
344
|
|
|
|
|
291
|
$concat = [0, [[@{$$tree_0[1][0]} , $tree_1 ]]]; |
|
344
|
|
|
|
|
927
|
|
2828
|
|
|
|
|
|
|
} |
2829
|
|
|
|
|
|
|
} |
2830
|
|
|
|
|
|
|
else { |
2831
|
411
|
100
|
|
|
|
557
|
if (ref($tree_1) eq CHAR_CLASS) { |
|
352
|
50
|
|
|
|
650
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
2832
|
59
|
50
|
|
|
|
97
|
if (@$tree_1 == 0) { |
2833
|
|
|
|
|
|
|
# (ab|cd) -> (ab|cd) |
2834
|
0
|
|
|
|
|
0
|
$concat = $tree_0; |
2835
|
|
|
|
|
|
|
} |
2836
|
|
|
|
|
|
|
else { |
2837
|
59
|
100
|
100
|
|
|
135
|
if ( |
2838
|
371
|
100
|
|
|
|
678
|
$FULL_FACTORIZE_FIXES |
2839
|
148
|
|
|
|
|
189
|
|| grep { ref($_) ne CHAR_CLASS && $$_[0] } |
2840
|
52
|
|
|
|
|
69
|
map {@$_} @{$$tree_0[1]} |
2841
|
|
|
|
|
|
|
) { |
2842
|
|
|
|
|
|
|
# (ab|cd) e -> ((ab|cd)e) |
2843
|
|
|
|
|
|
|
# one of abcd is starified |
2844
|
12
|
|
|
|
|
31
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2845
|
|
|
|
|
|
|
} |
2846
|
|
|
|
|
|
|
else { |
2847
|
|
|
|
|
|
|
# (ab|cd) e -> (abe|cde) |
2848
|
|
|
|
|
|
|
# none of abcd is starified |
2849
|
138
|
|
|
|
|
233
|
$concat = [ |
2850
|
|
|
|
|
|
|
0 |
2851
|
47
|
|
|
|
|
57
|
, [ map { [@$_, $tree_1] } @{$$tree_0[1]} ] |
|
47
|
|
|
|
|
62
|
|
2852
|
|
|
|
|
|
|
]; |
2853
|
|
|
|
|
|
|
} |
2854
|
|
|
|
|
|
|
} |
2855
|
|
|
|
|
|
|
} |
2856
|
|
|
|
|
|
|
elsif (@{$$tree_1[1]} == 0) { |
2857
|
|
|
|
|
|
|
# (ab|cd) () -> (ab|cd) |
2858
|
0
|
|
|
|
|
0
|
$concat = $tree_0; |
2859
|
|
|
|
|
|
|
} |
2860
|
85
|
|
|
|
|
164
|
elsif ($$tree_1[0]) { |
2861
|
|
|
|
|
|
|
# (ab|cd) (e)* -> ((ab|cd)(e)*) |
2862
|
267
|
|
|
|
|
463
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2863
|
|
|
|
|
|
|
} |
2864
|
|
|
|
|
|
|
elsif ( |
2865
|
|
|
|
|
|
|
@{$$tree_1[1]} == 1 |
2866
|
|
|
|
|
|
|
) { |
2867
|
13
|
100
|
|
|
|
18
|
if (!grep { ref($_) ne CHAR_CLASS } @{$$tree_1[1][0]}) { |
|
50
|
|
|
|
|
79
|
|
|
13
|
|
|
|
|
18
|
|
2868
|
11
|
100
|
66
|
|
|
39
|
if ( |
2869
|
145
|
100
|
|
|
|
236
|
$FULL_FACTORIZE_FIXES |
2870
|
30
|
|
|
|
|
45
|
|| grep { ref($_) ne CHAR_CLASS && $$_[0] } |
2871
|
9
|
|
|
|
|
24
|
map {@$_} @{$$tree_0[1]} |
2872
|
|
|
|
|
|
|
) { |
2873
|
|
|
|
|
|
|
# (ab|cd) (ef) -> ((ab|cd)ef) |
2874
|
|
|
|
|
|
|
# e and f both CHAR_CLASS |
2875
|
|
|
|
|
|
|
# one of abcd is starified |
2876
|
2
|
|
|
|
|
3
|
$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]]; |
|
2
|
|
|
|
|
7
|
|
2877
|
|
|
|
|
|
|
} |
2878
|
|
|
|
|
|
|
else { |
2879
|
|
|
|
|
|
|
# (ab|cd) (ef) -> (acef|cdef) |
2880
|
|
|
|
|
|
|
# e and f both CHAR_CLASS |
2881
|
|
|
|
|
|
|
# none of abcd is starified |
2882
|
30
|
|
|
|
|
71
|
$concat = [ |
2883
|
|
|
|
|
|
|
0 |
2884
|
9
|
|
|
|
|
11
|
, [ map { [@$_, @{$$tree_1[1][0]}] } @{$$tree_0[1]} ] |
|
30
|
|
|
|
|
55
|
|
|
9
|
|
|
|
|
17
|
|
2885
|
|
|
|
|
|
|
]; |
2886
|
|
|
|
|
|
|
} |
2887
|
|
|
|
|
|
|
} |
2888
|
|
|
|
|
|
|
else { |
2889
|
|
|
|
|
|
|
# (ab|cd) (ef) -> ((ab|cd)ef) |
2890
|
|
|
|
|
|
|
# e or f is a tree |
2891
|
2
|
|
|
|
|
4
|
$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]]; |
|
2
|
|
|
|
|
7
|
|
2892
|
|
|
|
|
|
|
} |
2893
|
|
|
|
|
|
|
} |
2894
|
|
|
|
|
|
|
elsif ($TREE_CONCAT_FULL_EXPAND) { |
2895
|
|
|
|
|
|
|
# (ab|cd) (ef|gh) -> (abef|abgh|cdef|cdgh) |
2896
|
0
|
|
|
|
|
0
|
$concat = [ |
2897
|
|
|
|
|
|
|
0 |
2898
|
|
|
|
|
|
|
, [ |
2899
|
|
|
|
|
|
|
map { |
2900
|
0
|
|
|
|
|
0
|
my $alt_0 = $_; |
2901
|
0
|
|
|
|
|
0
|
map { [@$alt_0, @$_] } |
|
0
|
|
|
|
|
0
|
|
2902
|
0
|
|
|
|
|
0
|
@{$$tree_1[1]} |
2903
|
|
|
|
|
|
|
} |
2904
|
0
|
|
|
|
|
0
|
@{$$tree_0[1]} |
2905
|
|
|
|
|
|
|
] |
2906
|
|
|
|
|
|
|
]; |
2907
|
|
|
|
|
|
|
} |
2908
|
|
|
|
|
|
|
else { |
2909
|
|
|
|
|
|
|
# (ab|cd) (ef|gh) -> ((ab|cd)(ef|gh)) |
2910
|
72
|
|
|
|
|
134
|
$concat = [0, [[ $tree_0, $tree_1 ]]]; |
2911
|
|
|
|
|
|
|
} |
2912
|
|
|
|
|
|
|
} |
2913
|
2995
|
|
|
|
|
5373
|
return $concat; |
2914
|
|
|
|
|
|
|
} |
2915
|
|
|
|
|
|
|
|
2916
|
|
|
|
|
|
|
# concatenation regex0regex1... |
2917
|
|
|
|
|
|
|
sub tree_concat { |
2918
|
1476
|
50
|
|
1476
|
0
|
2869
|
if (@_ == 0) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
2919
|
0
|
|
|
|
|
0
|
return $cc_none; # neutral element: accepting empty string |
2920
|
|
|
|
|
|
|
} |
2921
|
4156
|
|
|
|
|
5359
|
elsif (@_ == 1) { |
2922
|
31
|
|
|
|
|
85
|
return $_[0]; |
2923
|
|
|
|
|
|
|
} |
2924
|
|
|
|
|
|
|
elsif (grep {!defined($_)} @_) { |
2925
|
0
|
|
|
|
|
0
|
return undef; # one accepting nothing -> concat accepting nothing |
2926
|
|
|
|
|
|
|
} |
2927
|
|
|
|
|
|
|
|
2928
|
|
|
|
|
|
|
# resolve words first |
2929
|
1445
|
|
|
|
|
1043
|
my @word; |
2930
|
|
|
|
|
|
|
my @trees; |
2931
|
1445
|
|
|
|
|
1520
|
for (@_) { |
2932
|
4156
|
100
|
|
|
|
4822
|
if (ref($_) eq CHAR_CLASS) { |
2933
|
1452
|
|
|
|
|
1514
|
push(@word, $_); |
2934
|
|
|
|
|
|
|
} |
2935
|
|
|
|
|
|
|
else { |
2936
|
2704
|
100
|
|
|
|
4521
|
if (@word > 1) { |
|
|
100
|
|
|
|
|
|
2937
|
30
|
|
|
|
|
66
|
push(@trees, [0, [[ @word ]] ] ); |
2938
|
30
|
|
|
|
|
41
|
@word = (); |
2939
|
|
|
|
|
|
|
} |
2940
|
|
|
|
|
|
|
elsif (@word) { |
2941
|
438
|
|
|
|
|
392
|
push(@trees, $word[0]); |
2942
|
438
|
|
|
|
|
492
|
@word = (); |
2943
|
|
|
|
|
|
|
} |
2944
|
2704
|
|
|
|
|
3004
|
push(@trees, $_); |
2945
|
|
|
|
|
|
|
} |
2946
|
|
|
|
|
|
|
} |
2947
|
1445
|
100
|
|
|
|
2645
|
if (@word > 1) { |
|
|
100
|
|
|
|
|
|
2948
|
151
|
|
|
|
|
299
|
push(@trees, [0, [[ @word ]] ] ); |
2949
|
|
|
|
|
|
|
} |
2950
|
|
|
|
|
|
|
elsif (@word) { |
2951
|
351
|
|
|
|
|
371
|
push(@trees, $word[0]); |
2952
|
|
|
|
|
|
|
} |
2953
|
|
|
|
|
|
|
|
2954
|
1445
|
|
|
|
|
1267
|
my $concat = $trees[0]; |
2955
|
1445
|
|
|
|
|
1829
|
for my $tree (@trees[1..$#trees]) { |
2956
|
2229
|
|
|
|
|
2471
|
$concat = tree_concat2($concat, $tree); |
2957
|
|
|
|
|
|
|
} |
2958
|
|
|
|
|
|
|
|
2959
|
1445
|
|
|
|
|
2118
|
return $concat; |
2960
|
|
|
|
|
|
|
} |
2961
|
|
|
|
|
|
|
|
2962
|
|
|
|
|
|
|
# alternation regex0|regex1|... |
2963
|
|
|
|
|
|
|
sub tree_alt { |
2964
|
1007
|
|
|
1007
|
0
|
843
|
my @starified_alts; |
2965
|
|
|
|
|
|
|
my @non_starified_alts; |
2966
|
0
|
|
|
|
|
0
|
my $has_empty; |
2967
|
|
|
|
|
|
|
|
2968
|
1007
|
|
|
|
|
1021
|
for (grep { defined($_) } @_) { |
|
1972
|
|
|
|
|
2451
|
|
2969
|
1972
|
100
|
|
|
|
2400
|
if (ref($_) eq CHAR_CLASS) { |
|
1547
|
50
|
|
|
|
2822
|
|
|
|
100
|
|
|
|
|
|
2970
|
425
|
|
|
|
|
598
|
push(@non_starified_alts, [$_]); |
2971
|
|
|
|
|
|
|
} |
2972
|
|
|
|
|
|
|
elsif (!@{$$_[1]}) { |
2973
|
0
|
|
|
|
|
0
|
$has_empty = 1; |
2974
|
|
|
|
|
|
|
} |
2975
|
|
|
|
|
|
|
elsif ($$_[0]) { |
2976
|
265
|
|
|
|
|
189
|
push(@starified_alts, @{$$_[1]}); |
|
265
|
|
|
|
|
398
|
|
2977
|
|
|
|
|
|
|
} |
2978
|
|
|
|
|
|
|
else { |
2979
|
1282
|
|
|
|
|
832
|
push(@non_starified_alts, @{$$_[1]}); |
|
1282
|
|
|
|
|
1745
|
|
2980
|
|
|
|
|
|
|
} |
2981
|
|
|
|
|
|
|
} |
2982
|
|
|
|
|
|
|
|
2983
|
1007
|
100
|
|
|
|
1455
|
if (!@starified_alts) { |
|
|
100
|
|
|
|
|
|
2984
|
874
|
100
|
66
|
|
|
2101
|
if ( |
|
|
50
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
2985
|
43
|
|
|
|
|
140
|
@non_starified_alts > 1 |
2986
|
|
|
|
|
|
|
|| $has_empty |
2987
|
|
|
|
|
|
|
|| @non_starified_alts && @{$non_starified_alts[0]} > 1 |
2988
|
|
|
|
|
|
|
) { |
2989
|
|
|
|
|
|
|
return [ |
2990
|
835
|
50
|
|
|
|
2899
|
0 |
2991
|
|
|
|
|
|
|
, [ |
2992
|
|
|
|
|
|
|
@non_starified_alts |
2993
|
|
|
|
|
|
|
, ($has_empty ? [[0, []]] : ()) |
2994
|
|
|
|
|
|
|
] |
2995
|
|
|
|
|
|
|
]; |
2996
|
|
|
|
|
|
|
} |
2997
|
|
|
|
|
|
|
elsif (!@non_starified_alts) { |
2998
|
0
|
|
|
|
|
0
|
return undef; # neutral element: accepting nothing |
2999
|
|
|
|
|
|
|
} |
3000
|
|
|
|
|
|
|
else { |
3001
|
39
|
|
|
|
|
123
|
return $non_starified_alts[0][0]; |
3002
|
|
|
|
|
|
|
} |
3003
|
|
|
|
|
|
|
|
3004
|
|
|
|
|
|
|
} |
3005
|
|
|
|
|
|
|
elsif (!@non_starified_alts) { |
3006
|
132
|
|
|
|
|
506
|
return [1, \@starified_alts]; |
3007
|
|
|
|
|
|
|
} |
3008
|
|
|
|
|
|
|
else { |
3009
|
|
|
|
|
|
|
return [ |
3010
|
1
|
|
|
|
|
4
|
0 |
3011
|
|
|
|
|
|
|
, [ |
3012
|
|
|
|
|
|
|
@non_starified_alts |
3013
|
|
|
|
|
|
|
, [[1, \@starified_alts]] |
3014
|
|
|
|
|
|
|
] |
3015
|
|
|
|
|
|
|
]; |
3016
|
|
|
|
|
|
|
} |
3017
|
|
|
|
|
|
|
} |
3018
|
|
|
|
|
|
|
|
3019
|
|
|
|
|
|
|
|
3020
|
|
|
|
|
|
|
# returns an unanchored $ere having exactly the same structure |
3021
|
|
|
|
|
|
|
# as the given $tree. Intended for tracing/debugging. |
3022
|
|
|
|
|
|
|
sub tree_dump { |
3023
|
131
|
|
|
131
|
0
|
209
|
my ($tree) = @_; |
3024
|
131
|
50
|
|
|
|
205
|
if (!defined($_[0])) { |
3025
|
|
|
|
|
|
|
# nothing accepted (not even the empty string) |
3026
|
0
|
|
|
|
|
0
|
return '$.'; |
3027
|
|
|
|
|
|
|
} |
3028
|
131
|
100
|
|
|
|
192
|
if (ref($tree) eq CHAR_CLASS) { |
|
83
|
100
|
|
|
|
131
|
|
3029
|
48
|
|
|
|
|
59
|
return cc_to_regex($tree); |
3030
|
|
|
|
|
|
|
} |
3031
|
|
|
|
|
|
|
elsif (@{$$tree[1]} == 0) { |
3032
|
20
|
|
|
|
|
54
|
return '()'; |
3033
|
|
|
|
|
|
|
} |
3034
|
|
|
|
|
|
|
else { |
3035
|
63
|
|
|
|
|
58
|
return join('' |
3036
|
|
|
|
|
|
|
, '(' |
3037
|
|
|
|
|
|
|
, ( |
3038
|
|
|
|
|
|
|
join('|', |
3039
|
|
|
|
|
|
|
map { |
3040
|
63
|
|
|
|
|
76
|
my $alt = $_; |
3041
|
132
|
|
|
|
|
90
|
join('', |
3042
|
|
|
|
|
|
|
map { |
3043
|
63
|
|
|
|
|
55
|
my $atom = $_; |
3044
|
132
|
100
|
|
|
|
178
|
if (ref($atom) eq CHAR_CLASS) { |
3045
|
126
|
|
|
|
|
136
|
cc_to_regex($atom); |
3046
|
|
|
|
|
|
|
} |
3047
|
|
|
|
|
|
|
else { |
3048
|
6
|
|
|
|
|
10
|
tree_dump($atom); |
3049
|
|
|
|
|
|
|
} |
3050
|
|
|
|
|
|
|
} |
3051
|
|
|
|
|
|
|
@$alt |
3052
|
|
|
|
|
|
|
) |
3053
|
|
|
|
|
|
|
} |
3054
|
63
|
100
|
|
|
|
66
|
@{$$tree[1]} |
3055
|
|
|
|
|
|
|
) |
3056
|
|
|
|
|
|
|
) |
3057
|
|
|
|
|
|
|
, ')' |
3058
|
|
|
|
|
|
|
, ($$tree[0] ? '*' : ()) |
3059
|
|
|
|
|
|
|
); |
3060
|
|
|
|
|
|
|
} |
3061
|
|
|
|
|
|
|
} |
3062
|
|
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
# Heuristic weight function for the processing order of the warshall algorithm |
3064
|
|
|
|
|
|
|
sub _tree_weight { |
3065
|
1741
|
|
|
1741
|
|
1261
|
my ($tree) = @_; |
3066
|
1741
|
|
|
|
|
1240
|
my $weight = 0; |
3067
|
1741
|
100
|
|
|
|
2245
|
if (ref($tree) eq CHAR_CLASS) { |
|
|
50
|
|
|
|
|
|
3068
|
1476
|
|
|
|
|
1333
|
for (@$tree) { |
3069
|
1692
|
100
|
|
|
|
2695
|
$weight += ($$_[0] == $$_[1] ? 1 : 2); |
3070
|
|
|
|
|
|
|
} |
3071
|
|
|
|
|
|
|
} |
3072
|
|
|
|
|
|
|
elsif (defined($tree)) { |
3073
|
265
|
|
|
|
|
215
|
for (map { @$_ } @{$$tree[1]}) { |
|
347
|
|
|
|
|
501
|
|
|
265
|
|
|
|
|
359
|
|
3074
|
1267
|
|
|
|
|
1473
|
$weight += _tree_weight($_); |
3075
|
|
|
|
|
|
|
} |
3076
|
|
|
|
|
|
|
} |
3077
|
1741
|
|
|
|
|
2068
|
return $weight; |
3078
|
|
|
|
|
|
|
} |
3079
|
|
|
|
|
|
|
|
3080
|
|
|
|
|
|
|
|
3081
|
|
|
|
|
|
|
############################################################################## |
3082
|
|
|
|
|
|
|
# $input_constraints |
3083
|
|
|
|
|
|
|
############################################################################## |
3084
|
|
|
|
|
|
|
|
3085
|
|
|
|
|
|
|
use constant { |
3086
|
8
|
|
|
|
|
26526
|
FREE_TEXT => 'free text' |
3087
|
8
|
|
|
8
|
|
75560
|
}; |
|
8
|
|
|
|
|
14
|
|
3088
|
|
|
|
|
|
|
|
3089
|
|
|
|
|
|
|
=back |
3090
|
|
|
|
|
|
|
|
3091
|
|
|
|
|
|
|
=head2 Input constraints |
3092
|
|
|
|
|
|
|
|
3093
|
|
|
|
|
|
|
$input_constraints = [ $input_constraint_0, $input_constraint_1, ... ] |
3094
|
|
|
|
|
|
|
$input_constraint = [ 'word_0', 'word_1', ... ] (drop down) |
3095
|
|
|
|
|
|
|
or 'free_text' (free text) |
3096
|
|
|
|
|
|
|
|
3097
|
|
|
|
|
|
|
|
3098
|
|
|
|
|
|
|
=over 4 |
3099
|
|
|
|
|
|
|
|
3100
|
|
|
|
|
|
|
=item C |
3101
|
|
|
|
|
|
|
|
3102
|
|
|
|
|
|
|
Converts a C<$tree> to a pair C<($input_constraints, $split_str)>. |
3103
|
|
|
|
|
|
|
|
3104
|
|
|
|
|
|
|
C<$split_perlre> is a compiled perl regular expression splitting a string |
3105
|
|
|
|
|
|
|
accordingly to C<$input_constraints>. This C<$perlre> matches if and only if |
3106
|
|
|
|
|
|
|
each drop down can be assigned a value; then C<$str =~ $perlre> in list |
3107
|
|
|
|
|
|
|
context returns as many values as C<@$input_constraints>. |
3108
|
|
|
|
|
|
|
|
3109
|
|
|
|
|
|
|
=cut |
3110
|
|
|
|
|
|
|
|
3111
|
|
|
|
|
|
|
sub tree_to_input_constraints { |
3112
|
12
|
|
|
12
|
1
|
20
|
my ($input_constraints, $perlres) = &_tree_to_input_constraints; |
3113
|
|
|
|
|
|
|
|
3114
|
|
|
|
|
|
|
# concatenate free texts and stronger underlying regexs |
3115
|
12
|
|
|
|
|
15
|
my @previous_undefs; |
3116
|
|
|
|
|
|
|
my @kept; |
3117
|
12
|
|
|
|
|
28
|
for my $i (0..$#$input_constraints) { |
3118
|
36
|
100
|
|
|
|
59
|
if ($$input_constraints[$i] eq FREE_TEXT) { |
3119
|
15
|
|
|
|
|
21
|
push(@previous_undefs, $i); |
3120
|
|
|
|
|
|
|
} |
3121
|
|
|
|
|
|
|
else { |
3122
|
21
|
100
|
|
|
|
42
|
if (@previous_undefs) { |
3123
|
9
|
|
|
|
|
12
|
push(@kept, $i-1); |
3124
|
9
|
100
|
|
|
|
22
|
if (@previous_undefs > 1) { |
3125
|
6
|
|
|
|
|
18
|
$$perlres[$i-1] = join('', |
3126
|
2
|
|
|
|
|
3
|
map { '(?:' . $$perlres[$_] . ')' } |
3127
|
|
|
|
|
|
|
@previous_undefs |
3128
|
|
|
|
|
|
|
); |
3129
|
|
|
|
|
|
|
} |
3130
|
9
|
|
|
|
|
16
|
@previous_undefs = (); |
3131
|
|
|
|
|
|
|
} |
3132
|
21
|
|
|
|
|
40
|
push(@kept, $i); |
3133
|
|
|
|
|
|
|
} |
3134
|
|
|
|
|
|
|
} |
3135
|
12
|
100
|
|
|
|
24
|
if (@previous_undefs) { |
3136
|
1
|
|
|
|
|
2
|
push(@kept, $#$input_constraints); |
3137
|
1
|
50
|
|
|
|
3
|
if (@previous_undefs > 1) { |
3138
|
2
|
|
|
|
|
8
|
$$perlres[$#$input_constraints] = join('', |
3139
|
1
|
|
|
|
|
2
|
map { '(?:' . $$perlres[$_] . ')' } |
3140
|
|
|
|
|
|
|
@previous_undefs |
3141
|
|
|
|
|
|
|
); |
3142
|
|
|
|
|
|
|
} |
3143
|
|
|
|
|
|
|
} |
3144
|
12
|
|
|
|
|
41
|
@$input_constraints = @$input_constraints[@kept]; |
3145
|
12
|
|
|
|
|
34
|
@$perlres = @$perlres[@kept]; |
3146
|
|
|
|
|
|
|
|
3147
|
|
|
|
|
|
|
# sort words, remove duplicates |
3148
|
12
|
|
|
|
|
20
|
for (grep { $_ ne FREE_TEXT } @$input_constraints) { |
|
31
|
|
|
|
|
55
|
|
3149
|
21
|
|
|
|
|
21
|
$_ = [ sort(keys(%{ { map { ($_ => $_) } @$_ } })) ]; |
|
21
|
|
|
|
|
24
|
|
|
37
|
|
|
|
|
147
|
|
3150
|
|
|
|
|
|
|
} |
3151
|
|
|
|
|
|
|
|
3152
|
|
|
|
|
|
|
# remove empty words |
3153
|
|
|
|
|
|
|
# concatenate single words |
3154
|
12
|
|
|
|
|
14
|
my @previous_singles; |
3155
|
12
|
|
|
|
|
15
|
@kept = (); |
3156
|
12
|
|
|
|
|
23
|
for my $i (0..$#$input_constraints) { |
3157
|
31
|
100
|
100
|
|
|
66
|
if ( |
|
|
50
|
33
|
|
|
|
|
3158
|
21
|
|
|
|
|
57
|
$$input_constraints[$i] eq FREE_TEXT |
3159
|
|
|
|
|
|
|
|| @{$$input_constraints[$i]} > 1 |
3160
|
11
|
|
|
|
|
50
|
) { |
3161
|
20
|
100
|
|
|
|
31
|
if (@previous_singles) { |
3162
|
6
|
|
|
|
|
8
|
push(@kept, $i-1); |
3163
|
6
|
50
|
|
|
|
70
|
if (@previous_singles > 1) { |
3164
|
0
|
|
|
|
|
0
|
$$perlres[$i-1] = join('', |
3165
|
0
|
|
|
|
|
0
|
map { $$perlres[$_] } |
3166
|
|
|
|
|
|
|
@previous_singles |
3167
|
|
|
|
|
|
|
); |
3168
|
0
|
|
|
|
|
0
|
$$input_constraints[$i-1] = join('', |
3169
|
0
|
|
|
|
|
0
|
map { $$input_constraints[$_][0] } |
3170
|
|
|
|
|
|
|
@previous_singles |
3171
|
|
|
|
|
|
|
); |
3172
|
|
|
|
|
|
|
} |
3173
|
6
|
|
|
|
|
11
|
@previous_singles = (); |
3174
|
|
|
|
|
|
|
} |
3175
|
20
|
|
|
|
|
29
|
push(@kept, $i); |
3176
|
|
|
|
|
|
|
} |
3177
|
|
|
|
|
|
|
elsif ( |
3178
|
|
|
|
|
|
|
@{$$input_constraints[$i]} == 1 |
3179
|
|
|
|
|
|
|
&& length($$input_constraints[$i][0]) |
3180
|
|
|
|
|
|
|
) { |
3181
|
11
|
|
|
|
|
23
|
push(@previous_singles, $i); |
3182
|
|
|
|
|
|
|
} |
3183
|
|
|
|
|
|
|
} |
3184
|
12
|
100
|
|
|
|
24
|
if (@previous_singles) { |
3185
|
5
|
|
|
|
|
11
|
push(@kept, $#$input_constraints); |
3186
|
5
|
50
|
|
|
|
18
|
if (@previous_singles > 1) { |
3187
|
0
|
|
|
|
|
0
|
$$perlres[$#$input_constraints] = join('', |
3188
|
0
|
|
|
|
|
0
|
map { $$perlres[$_] } |
3189
|
|
|
|
|
|
|
@previous_singles |
3190
|
|
|
|
|
|
|
); |
3191
|
0
|
|
|
|
|
0
|
$$input_constraints[$#$input_constraints] = join('', |
3192
|
0
|
|
|
|
|
0
|
map { $$input_constraints[$_][0] } |
3193
|
|
|
|
|
|
|
@previous_singles |
3194
|
|
|
|
|
|
|
); |
3195
|
|
|
|
|
|
|
} |
3196
|
|
|
|
|
|
|
} |
3197
|
12
|
|
|
|
|
26
|
@$input_constraints = @$input_constraints[@kept]; |
3198
|
12
|
|
|
|
|
28
|
@$perlres = @$perlres[@kept]; |
3199
|
|
|
|
|
|
|
|
3200
|
12
|
50
|
|
|
|
27
|
if (!@$input_constraints) { |
3201
|
0
|
|
|
|
|
0
|
@$input_constraints = (['']); |
3202
|
0
|
|
|
|
|
0
|
@$perlres = (''); |
3203
|
|
|
|
|
|
|
} |
3204
|
|
|
|
|
|
|
|
3205
|
31
|
100
|
|
|
|
97
|
my $split_perlre |
3206
|
|
|
|
|
|
|
= join('', |
3207
|
|
|
|
|
|
|
map { |
3208
|
12
|
|
|
|
|
26
|
$$input_constraints[$_] eq FREE_TEXT |
3209
|
|
|
|
|
|
|
? "($$perlres[$_]|.*?)" |
3210
|
|
|
|
|
|
|
: "($$perlres[$_])" |
3211
|
|
|
|
|
|
|
} |
3212
|
|
|
|
|
|
|
(0..$#$perlres) |
3213
|
|
|
|
|
|
|
) |
3214
|
|
|
|
|
|
|
; |
3215
|
12
|
|
|
|
|
743
|
return ($input_constraints, qr/\A$split_perlre\z/ms); |
3216
|
|
|
|
|
|
|
} |
3217
|
|
|
|
|
|
|
|
3218
|
|
|
|
|
|
|
{ |
3219
|
|
|
|
|
|
|
|
3220
|
|
|
|
|
|
|
my %cc_to_input_constraint_cache; |
3221
|
|
|
|
|
|
|
|
3222
|
|
|
|
|
|
|
# returns ($input_constraints, $perlres) |
3223
|
|
|
|
|
|
|
# two references to arrays of the same size. |
3224
|
|
|
|
|
|
|
sub _tree_to_input_constraints { |
3225
|
38
|
|
|
38
|
|
41
|
my ($tree) = @_; |
3226
|
38
|
|
|
|
|
36
|
my $input_constraints; |
3227
|
|
|
|
|
|
|
my $perlres; |
3228
|
38
|
50
|
|
|
|
85
|
if (!defined($tree)) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
3229
|
|
|
|
|
|
|
# regex accepting nothing -> free text (always rejected) |
3230
|
|
|
|
|
|
|
|
3231
|
0
|
|
|
|
|
0
|
$input_constraints = [FREE_TEXT]; |
3232
|
0
|
|
|
|
|
0
|
$perlres = ['$.']; |
3233
|
|
|
|
|
|
|
} |
3234
|
35
|
|
|
|
|
83
|
elsif (ref($tree) eq CHAR_CLASS) { |
3235
|
|
|
|
|
|
|
# single character class -> drop down |
3236
|
|
|
|
|
|
|
|
3237
|
3
|
|
66
|
|
|
13
|
$input_constraints = [ |
3238
|
|
|
|
|
|
|
$cc_to_input_constraint_cache{$tree} |
3239
|
|
|
|
|
|
|
||= cc_to_input_constraint($tree) |
3240
|
|
|
|
|
|
|
]; |
3241
|
3
|
|
|
|
|
7
|
$perlres = [_tree_to_regex($tree, 1)]; |
3242
|
|
|
|
|
|
|
} |
3243
|
|
|
|
|
|
|
elsif (@{$$tree[1]} == 0) { |
3244
|
|
|
|
|
|
|
# no top-level alternation |
3245
|
|
|
|
|
|
|
|
3246
|
0
|
|
|
|
|
0
|
$input_constraints = [['']]; |
3247
|
0
|
|
|
|
|
0
|
$perlres = [_tree_to_regex($tree, 1)]; |
3248
|
|
|
|
|
|
|
} |
3249
|
29
|
|
|
|
|
50
|
elsif ($$tree[0]) { |
3250
|
|
|
|
|
|
|
# starified regex -> free text |
3251
|
|
|
|
|
|
|
|
3252
|
6
|
|
|
|
|
11
|
$input_constraints = [FREE_TEXT]; |
3253
|
6
|
|
|
|
|
13
|
$perlres = [_tree_to_regex($tree, 1)]; |
3254
|
|
|
|
|
|
|
} |
3255
|
|
|
|
|
|
|
elsif (@{$$tree[1]} == 1) { |
3256
|
|
|
|
|
|
|
# single top-level alternation -> mixed results |
3257
|
|
|
|
|
|
|
# example: ab*c(d|e)f |
3258
|
|
|
|
|
|
|
|
3259
|
14
|
|
|
|
|
13
|
$input_constraints = []; |
3260
|
14
|
|
|
|
|
16
|
$perlres = []; |
3261
|
|
|
|
|
|
|
|
3262
|
14
|
|
|
|
|
14
|
my $i = 0; |
3263
|
14
|
|
|
|
|
16
|
while ($i != @{$$tree[1][0]}) { |
|
43
|
|
|
|
|
90
|
|
3264
|
29
|
|
|
|
|
27
|
my $beg = $i; |
3265
|
29
|
|
|
|
|
43
|
my @expanded_words = (''); |
3266
|
29
|
|
|
|
|
25
|
my $cc; |
3267
|
29
|
|
100
|
|
|
27
|
while ( |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
3268
|
85
|
|
|
|
|
487
|
$i != @{$$tree[1][0]} |
3269
|
|
|
|
|
|
|
&& ref($cc = $$tree[1][0][$i]) eq CHAR_CLASS |
3270
|
|
|
|
|
|
|
&& (!@$cc || $$cc[-1][1] != MAX_CHAR) |
3271
|
|
|
|
|
|
|
) { |
3272
|
56
|
|
66
|
|
|
137
|
my $input_constraint |
3273
|
|
|
|
|
|
|
= $cc_to_input_constraint_cache{$cc} |
3274
|
|
|
|
|
|
|
||= cc_to_input_constraint($cc) |
3275
|
|
|
|
|
|
|
; |
3276
|
|
|
|
|
|
|
|
3277
|
|
|
|
|
|
|
@expanded_words |
3278
|
57
|
|
|
|
|
50
|
= map { |
3279
|
56
|
|
|
|
|
60
|
my $letter = $_; |
3280
|
57
|
|
|
|
|
48
|
map { $_ . $letter } |
|
61
|
|
|
|
|
129
|
|
3281
|
|
|
|
|
|
|
@expanded_words |
3282
|
|
|
|
|
|
|
} |
3283
|
|
|
|
|
|
|
@$input_constraint |
3284
|
|
|
|
|
|
|
; |
3285
|
56
|
|
|
|
|
60
|
$i++; |
3286
|
|
|
|
|
|
|
} |
3287
|
29
|
100
|
66
|
|
|
94
|
if ($beg < $i && length($expanded_words[0])) { |
3288
|
12
|
|
|
|
|
38
|
my $wrd_perlre = _tree_to_regex( |
3289
|
|
|
|
|
|
|
[ |
3290
|
|
|
|
|
|
|
0 |
3291
|
12
|
|
|
|
|
23
|
, [[ @{$$tree[1][0]}[$beg..$i-1] ]] |
3292
|
|
|
|
|
|
|
] |
3293
|
|
|
|
|
|
|
, 1 |
3294
|
|
|
|
|
|
|
); |
3295
|
12
|
|
|
|
|
34
|
push(@$input_constraints, \@expanded_words); |
3296
|
12
|
|
|
|
|
18
|
push(@$perlres, $wrd_perlre); |
3297
|
|
|
|
|
|
|
} |
3298
|
29
|
100
|
|
|
|
70
|
if ($i < @{$$tree[1][0]}) { |
|
29
|
|
|
|
|
67
|
|
3299
|
23
|
|
|
|
|
69
|
my ($sub_input_constraints, $sub_perlres) |
3300
|
|
|
|
|
|
|
= _tree_to_input_constraints($$tree[1][0][$i]); |
3301
|
23
|
50
|
66
|
|
|
102
|
if ( |
|
|
|
33
|
|
|
|
|
3302
|
|
|
|
|
|
|
@$sub_input_constraints |
3303
|
|
|
|
|
|
|
&& ( |
3304
|
|
|
|
|
|
|
$$sub_input_constraints[0] eq FREE_TEXT |
3305
|
|
|
|
|
|
|
|| grep { length($_) } @{$$sub_input_constraints[0]} |
3306
|
|
|
|
|
|
|
) |
3307
|
|
|
|
|
|
|
) { |
3308
|
23
|
|
|
|
|
33
|
push(@$input_constraints, @$sub_input_constraints); |
3309
|
23
|
|
|
|
|
28
|
push(@$perlres, @$sub_perlres); |
3310
|
|
|
|
|
|
|
} |
3311
|
23
|
|
|
|
|
45
|
$i++; |
3312
|
|
|
|
|
|
|
} |
3313
|
|
|
|
|
|
|
} |
3314
|
|
|
|
|
|
|
} |
3315
|
|
|
|
|
|
|
else { |
3316
|
|
|
|
|
|
|
# multiple top-level alternations |
3317
|
|
|
|
|
|
|
|
3318
|
15
|
100
|
|
|
|
13
|
if ( |
3319
|
279
|
100
|
66
|
|
|
948
|
grep { grep { |
|
15
|
|
|
|
|
24
|
|
3320
|
38
|
|
|
|
|
41
|
ref($_) ne CHAR_CLASS |
3321
|
|
|
|
|
|
|
|| (@$_ && $$_[$#$_][1] == MAX_CHAR) |
3322
|
|
|
|
|
|
|
} @$_ } |
3323
|
|
|
|
|
|
|
@{$$tree[1]} |
3324
|
|
|
|
|
|
|
) { |
3325
|
|
|
|
|
|
|
# some alternation contains a sub-tree -> mixed results |
3326
|
|
|
|
|
|
|
# example: abd|ab*d |
3327
|
|
|
|
|
|
|
# common pre/suf-fixes are factorized out |
3328
|
|
|
|
|
|
|
# example: a(bd|b*)d |
3329
|
|
|
|
|
|
|
|
3330
|
6
|
|
|
|
|
6
|
my $fst_len = @{$$tree[1][0]}; |
|
6
|
|
|
|
|
11
|
|
3331
|
6
|
|
|
|
|
9
|
my ($pre_len, $suf_len) = (0, 0); |
3332
|
6
|
|
|
|
|
10
|
for (1, 0) { |
3333
|
16
|
|
|
|
|
14
|
my ($len_ref, @range) |
3334
|
|
|
|
|
|
|
= $_ |
3335
|
|
|
|
|
|
|
? (\$pre_len, (0..$fst_len-1)) |
3336
|
12
|
100
|
|
|
|
29
|
: (\$suf_len, map {-$_} (1..$fst_len-$pre_len)) |
3337
|
|
|
|
|
|
|
; |
3338
|
12
|
|
|
|
|
22
|
for my $i (@range) { |
3339
|
13
|
100
|
|
|
|
12
|
if ( |
3340
|
28
|
100
|
66
|
|
|
134
|
grep { |
3341
|
13
|
|
|
|
|
16
|
$i >= @$_ |
3342
|
|
|
|
|
|
|
|| ref($$_[$i]) ne CHAR_CLASS |
3343
|
|
|
|
|
|
|
|| $$tree[1][0][$i] != $$_[$i] |
3344
|
|
|
|
|
|
|
} |
3345
|
13
|
|
|
|
|
15
|
@{$$tree[1]}[0..$#{$$tree[1]}] |
3346
|
|
|
|
|
|
|
) { |
3347
|
3
|
|
|
|
|
7
|
last; |
3348
|
|
|
|
|
|
|
} |
3349
|
10
|
|
|
|
|
16
|
$$len_ref++; |
3350
|
|
|
|
|
|
|
} |
3351
|
|
|
|
|
|
|
} |
3352
|
6
|
100
|
|
|
|
12
|
if ($pre_len) { |
3353
|
2
|
|
|
|
|
14
|
my ($pre_input_constraints, $pre_perlres) |
3354
|
|
|
|
|
|
|
= _tree_to_input_constraints( |
3355
|
|
|
|
|
|
|
[ |
3356
|
|
|
|
|
|
|
0 |
3357
|
2
|
|
|
|
|
5
|
, [[ @{$$tree[1][0]}[0..$pre_len-1] ]] |
3358
|
|
|
|
|
|
|
] |
3359
|
|
|
|
|
|
|
); |
3360
|
2
|
|
|
|
|
5
|
push(@$input_constraints, @$pre_input_constraints); |
3361
|
2
|
|
|
|
|
3
|
push(@$perlres, @$pre_perlres); |
3362
|
|
|
|
|
|
|
} |
3363
|
|
|
|
|
|
|
|
3364
|
6
|
50
|
|
|
|
9
|
if ( |
3365
|
|
|
|
|
|
|
my @mid_alts |
3366
|
14
|
|
|
|
|
37
|
= map { [ @$_[$pre_len..$#$_-$suf_len] ] } |
|
6
|
|
|
|
|
11
|
|
3367
|
|
|
|
|
|
|
@{$$tree[1]} |
3368
|
|
|
|
|
|
|
) { |
3369
|
6
|
|
|
|
|
12
|
push(@$input_constraints, FREE_TEXT); |
3370
|
6
|
|
|
|
|
14
|
push(@$perlres, _tree_to_regex([ 0, \@mid_alts ] , 1)); |
3371
|
|
|
|
|
|
|
} |
3372
|
|
|
|
|
|
|
|
3373
|
6
|
100
|
|
|
|
19
|
if ($suf_len) { |
3374
|
1
|
|
|
|
|
4
|
my ($suf_input_constraints, $suf_perlres) |
3375
|
|
|
|
|
|
|
= _tree_to_input_constraints( |
3376
|
|
|
|
|
|
|
[ |
3377
|
|
|
|
|
|
|
0 |
3378
|
|
|
|
|
|
|
, [[ |
3379
|
1
|
|
|
|
|
3
|
@{$$tree[1][0]} |
3380
|
|
|
|
|
|
|
[$fst_len-$suf_len..$fst_len-1] |
3381
|
|
|
|
|
|
|
]] |
3382
|
|
|
|
|
|
|
] |
3383
|
|
|
|
|
|
|
); |
3384
|
1
|
|
|
|
|
3
|
push(@$input_constraints, @$suf_input_constraints); |
3385
|
1
|
|
|
|
|
2
|
push(@$perlres, @$suf_perlres); |
3386
|
|
|
|
|
|
|
} |
3387
|
|
|
|
|
|
|
} |
3388
|
|
|
|
|
|
|
else { |
3389
|
|
|
|
|
|
|
# each alternation contains only non negated char classes |
3390
|
|
|
|
|
|
|
# -> drop down |
3391
|
|
|
|
|
|
|
|
3392
|
9
|
|
|
|
|
16
|
$perlres = [_tree_to_regex($tree, 1)]; |
3393
|
9
|
|
|
|
|
15
|
for my $word (@{$$tree[1]}) { |
|
9
|
|
|
|
|
81
|
|
3394
|
24
|
|
|
|
|
32
|
my @expanded_words = (''); |
3395
|
24
|
|
66
|
|
|
25
|
for my $input_constraint ( |
|
224
|
|
|
|
|
427
|
|
3396
|
|
|
|
|
|
|
map { |
3397
|
|
|
|
|
|
|
$cc_to_input_constraint_cache{$_} |
3398
|
|
|
|
|
|
|
||= cc_to_input_constraint($_); |
3399
|
|
|
|
|
|
|
} |
3400
|
|
|
|
|
|
|
@$word |
3401
|
|
|
|
|
|
|
) { |
3402
|
224
|
50
|
|
|
|
249
|
if (@$input_constraint == 1) { |
3403
|
224
|
|
|
|
|
185
|
for (@expanded_words) { |
3404
|
224
|
|
|
|
|
331
|
$_ .= $$input_constraint[0]; |
3405
|
|
|
|
|
|
|
} |
3406
|
|
|
|
|
|
|
} |
3407
|
|
|
|
|
|
|
else { |
3408
|
|
|
|
|
|
|
@expanded_words |
3409
|
0
|
|
|
|
|
0
|
= map { |
3410
|
0
|
|
|
|
|
0
|
my $letter = $_; |
3411
|
0
|
|
|
|
|
0
|
map { $_ . $letter } |
|
0
|
|
|
|
|
0
|
|
3412
|
|
|
|
|
|
|
@expanded_words |
3413
|
|
|
|
|
|
|
} |
3414
|
|
|
|
|
|
|
@$input_constraint |
3415
|
|
|
|
|
|
|
; |
3416
|
|
|
|
|
|
|
} |
3417
|
|
|
|
|
|
|
} |
3418
|
24
|
|
|
|
|
23
|
push(@{$$input_constraints[0]}, @expanded_words); |
|
24
|
|
|
|
|
55
|
|
3419
|
|
|
|
|
|
|
} |
3420
|
|
|
|
|
|
|
} |
3421
|
|
|
|
|
|
|
} |
3422
|
38
|
|
|
|
|
65
|
return ($input_constraints, $perlres); |
3423
|
|
|
|
|
|
|
} |
3424
|
|
|
|
|
|
|
} |
3425
|
|
|
|
|
|
|
|
3426
|
|
|
|
|
|
|
sub cc_to_input_constraint { |
3427
|
28
|
|
|
28
|
0
|
29
|
my ($cc) = @_; |
3428
|
28
|
50
|
|
|
|
66
|
if (@$cc == 0) { |
|
|
100
|
|
|
|
|
|
3429
|
0
|
|
|
|
|
0
|
return ['']; |
3430
|
|
|
|
|
|
|
} |
3431
|
|
|
|
|
|
|
elsif ($$cc[$#$cc][1] == MAX_CHAR) { |
3432
|
1
|
|
|
|
|
4
|
return FREE_TEXT; |
3433
|
|
|
|
|
|
|
} |
3434
|
|
|
|
|
|
|
else { |
3435
|
|
|
|
|
|
|
return [ |
3436
|
27
|
|
|
|
|
40
|
map { map { chr($_) } ($$_[0]..$$_[1]) } |
|
27
|
|
|
|
|
39
|
|
|
28
|
|
|
|
|
293
|
|
3437
|
|
|
|
|
|
|
@$cc |
3438
|
|
|
|
|
|
|
]; |
3439
|
|
|
|
|
|
|
} |
3440
|
|
|
|
|
|
|
} |
3441
|
|
|
|
|
|
|
|
3442
|
|
|
|
|
|
|
|
3443
|
|
|
|
|
|
|
############################################################################## |
3444
|
|
|
|
|
|
|
# $ere |
3445
|
|
|
|
|
|
|
############################################################################## |
3446
|
|
|
|
|
|
|
|
3447
|
|
|
|
|
|
|
=back |
3448
|
|
|
|
|
|
|
|
3449
|
|
|
|
|
|
|
=head2 Ere |
3450
|
|
|
|
|
|
|
|
3451
|
|
|
|
|
|
|
An C<$ere> is a perl string. |
3452
|
|
|
|
|
|
|
|
3453
|
|
|
|
|
|
|
The syntax an C<$ere> is assumed to follow is based on POSIX ERE |
3454
|
|
|
|
|
|
|
(else the C routines will C). |
3455
|
|
|
|
|
|
|
|
3456
|
|
|
|
|
|
|
Unsupported POSIX features: |
3457
|
|
|
|
|
|
|
back-references, |
3458
|
|
|
|
|
|
|
equivalence classes C<[[=a=]]>, |
3459
|
|
|
|
|
|
|
character class C<[[:digit:]]>, |
3460
|
|
|
|
|
|
|
collating symbols C<[[.ch.]]>. |
3461
|
|
|
|
|
|
|
|
3462
|
|
|
|
|
|
|
C<)> is always a special character. POSIX says that C<)> is a normal |
3463
|
|
|
|
|
|
|
character if there is no matching C<(>. |
3464
|
|
|
|
|
|
|
|
3465
|
|
|
|
|
|
|
There is no escape sequences such as C<\t> for tab or C<\n> for line feed. |
3466
|
|
|
|
|
|
|
POSIX does not specify such escape sequences neither. |
3467
|
|
|
|
|
|
|
|
3468
|
|
|
|
|
|
|
C<\> before a non-special character is ignored |
3469
|
|
|
|
|
|
|
(except in bracket expressions). POSIX does not allow it. |
3470
|
|
|
|
|
|
|
|
3471
|
|
|
|
|
|
|
The empty string is legal in alternations (C<(|a)> is equivalent to C<(a?)>). |
3472
|
|
|
|
|
|
|
POSIX does not allow it. |
3473
|
|
|
|
|
|
|
The C<(|a)> form is generated by the C routines |
3474
|
|
|
|
|
|
|
(avoiding quantifiers other than C<*>). |
3475
|
|
|
|
|
|
|
|
3476
|
|
|
|
|
|
|
C<[a-l-z]> is interpreted as C<([a-l] | - | z)> (but it is discouraged to |
3477
|
|
|
|
|
|
|
rely upon this implementation artefact). POSIX says that the interpretation |
3478
|
|
|
|
|
|
|
of this construct is undefined. |
3479
|
|
|
|
|
|
|
|
3480
|
|
|
|
|
|
|
In bracket expressions, C<\> is a normal character, |
3481
|
|
|
|
|
|
|
thus C<]> as character must occur first, or second after a C<^> |
3482
|
|
|
|
|
|
|
(POSIX compliant, but possibly surprising for perl programmers). |
3483
|
|
|
|
|
|
|
|
3484
|
|
|
|
|
|
|
All unicode characters supported by perl are allowed as literal characters. |
3485
|
|
|
|
|
|
|
|
3486
|
|
|
|
|
|
|
=over 4 |
3487
|
|
|
|
|
|
|
|
3488
|
|
|
|
|
|
|
=item C |
3489
|
|
|
|
|
|
|
|
3490
|
|
|
|
|
|
|
Parses an C<$ere> to a C<$nfa>. |
3491
|
|
|
|
|
|
|
|
3492
|
|
|
|
|
|
|
WARNING: the parsing routines, in particular C, |
3493
|
|
|
|
|
|
|
C on syntax errors; thus the caller may want to eval-trap such errors. |
3494
|
|
|
|
|
|
|
|
3495
|
|
|
|
|
|
|
=cut |
3496
|
|
|
|
|
|
|
|
3497
|
|
|
|
|
|
|
sub ere_to_nfa { |
3498
|
215
|
|
|
215
|
1
|
29574
|
my ($ere, $has_anchor_ref) = @_; |
3499
|
|
|
|
|
|
|
|
3500
|
|
|
|
|
|
|
# optimize very first and very last anchors |
3501
|
215
|
|
|
|
|
999
|
my $has_beg_anchor = $ere =~ s/^\^+//; |
3502
|
215
|
|
|
|
|
655
|
my $has_end_anchor = $ere =~ s/\$+$//; |
3503
|
|
|
|
|
|
|
|
3504
|
215
|
|
|
|
|
279
|
$$has_anchor_ref = 0; |
3505
|
215
|
|
|
|
|
196
|
my @alternation_nfas; |
3506
|
215
|
|
|
|
|
251
|
do { |
3507
|
215
|
|
|
|
|
432
|
push(@alternation_nfas, parse_alternation(\$ere, $has_anchor_ref)); |
3508
|
|
|
|
|
|
|
} while($ere =~ /\G \| /xmsgc); |
3509
|
|
|
|
|
|
|
|
3510
|
215
|
50
|
100
|
|
|
683
|
if ((pos($ere) || 0) != length($ere)) { |
3511
|
0
|
|
|
|
|
0
|
parse_die("unexpected character", \$ere); |
3512
|
|
|
|
|
|
|
} |
3513
|
|
|
|
|
|
|
|
3514
|
215
|
|
|
|
|
190
|
my $nfa; |
3515
|
215
|
100
|
100
|
|
|
516
|
if (!$has_beg_anchor && !$has_end_anchor) { |
3516
|
|
|
|
|
|
|
# a|b|c => ^.*(a|b|c).*$ |
3517
|
|
|
|
|
|
|
|
3518
|
10
|
50
|
|
|
|
66
|
$nfa = nfa_concat( |
3519
|
|
|
|
|
|
|
[[1, [[$cc_any, 0]]]] |
3520
|
|
|
|
|
|
|
, @alternation_nfas == 1 |
3521
|
|
|
|
|
|
|
? $alternation_nfas[0] |
3522
|
|
|
|
|
|
|
: nfa_union(@alternation_nfas) |
3523
|
|
|
|
|
|
|
, [[1, [[$cc_any, 0]]]] |
3524
|
|
|
|
|
|
|
); |
3525
|
|
|
|
|
|
|
} |
3526
|
|
|
|
|
|
|
else { |
3527
|
205
|
|
|
|
|
505
|
for my $alternation_nfa (@alternation_nfas[1..$#alternation_nfas-1]) { |
3528
|
0
|
|
|
|
|
0
|
$alternation_nfa = nfa_concat( |
3529
|
|
|
|
|
|
|
[[1, [[$cc_any, 0]]]] |
3530
|
|
|
|
|
|
|
, $alternation_nfa |
3531
|
|
|
|
|
|
|
, [[1, [[$cc_any, 0]]]] |
3532
|
|
|
|
|
|
|
); |
3533
|
|
|
|
|
|
|
} |
3534
|
205
|
100
|
66
|
|
|
839
|
if (!$has_beg_anchor || @alternation_nfas > 1) { |
3535
|
7
|
50
|
|
|
|
41
|
$alternation_nfas[0] = nfa_concat( |
|
|
50
|
|
|
|
|
|
3536
|
|
|
|
|
|
|
!$has_beg_anchor ? [[1, [[$cc_any, 0]]]] : () |
3537
|
|
|
|
|
|
|
, $alternation_nfas[0] |
3538
|
|
|
|
|
|
|
, @alternation_nfas > 1 ? [[1, [[$cc_any, 0]]]] : () |
3539
|
|
|
|
|
|
|
); |
3540
|
|
|
|
|
|
|
} |
3541
|
205
|
100
|
66
|
|
|
726
|
if (!$has_end_anchor || @alternation_nfas > 1) { |
3542
|
2
|
50
|
|
|
|
14
|
$alternation_nfas[-1] = nfa_concat( |
|
|
50
|
|
|
|
|
|
3543
|
|
|
|
|
|
|
@alternation_nfas > 1 ? [[1, [[$cc_any, 0]]]] : () |
3544
|
|
|
|
|
|
|
, $alternation_nfas[-1] |
3545
|
|
|
|
|
|
|
, !$has_end_anchor ? [[1, [[$cc_any, 0]]]] : () |
3546
|
|
|
|
|
|
|
); |
3547
|
|
|
|
|
|
|
} |
3548
|
|
|
|
|
|
|
$nfa |
3549
|
205
|
50
|
|
|
|
384
|
= @alternation_nfas == 1 |
3550
|
|
|
|
|
|
|
? $alternation_nfas[0] |
3551
|
|
|
|
|
|
|
: nfa_union(@alternation_nfas) |
3552
|
|
|
|
|
|
|
; |
3553
|
|
|
|
|
|
|
} |
3554
|
|
|
|
|
|
|
|
3555
|
215
|
100
|
|
|
|
774
|
return $$has_anchor_ref ? nfa_resolve_anchors($nfa) : $nfa; |
3556
|
|
|
|
|
|
|
} |
3557
|
|
|
|
|
|
|
|
3558
|
|
|
|
|
|
|
sub _ere_to_nfa { |
3559
|
238
|
|
|
238
|
|
240
|
my ($str_ref, $has_anchor_ref) = @_; |
3560
|
|
|
|
|
|
|
|
3561
|
238
|
|
|
|
|
219
|
my @alternation_nfas; |
3562
|
238
|
|
|
|
|
202
|
do { |
3563
|
385
|
|
|
|
|
591
|
push(@alternation_nfas, parse_alternation($str_ref, $has_anchor_ref)); |
3564
|
|
|
|
|
|
|
} while($$str_ref =~ /\G \| /xmsgc); |
3565
|
|
|
|
|
|
|
|
3566
|
|
|
|
|
|
|
return |
3567
|
238
|
100
|
|
|
|
602
|
@alternation_nfas == 1 |
3568
|
|
|
|
|
|
|
? $alternation_nfas[0] |
3569
|
|
|
|
|
|
|
: nfa_union(@alternation_nfas) |
3570
|
|
|
|
|
|
|
; |
3571
|
|
|
|
|
|
|
} |
3572
|
|
|
|
|
|
|
|
3573
|
|
|
|
|
|
|
sub bracket_expression_to_cc { |
3574
|
102
|
|
|
102
|
0
|
95
|
my ($str_ref) = @_; |
3575
|
102
|
|
|
|
|
162
|
my $neg = $$str_ref =~ /\G \^/xmsgc; |
3576
|
102
|
|
|
|
|
99
|
my $interval_list = []; |
3577
|
|
|
|
|
|
|
|
3578
|
|
|
|
|
|
|
# anything is allowed a first char, in particular ']' and '-' |
3579
|
102
|
100
|
|
|
|
347
|
if ($$str_ref =~ /\G (.) - ([^]]) /xmsgc) { |
|
|
50
|
|
|
|
|
|
3580
|
16
|
|
|
|
|
39
|
push(@$interval_list, [ord($1), ord($2)]); |
3581
|
|
|
|
|
|
|
} |
3582
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G (.) /xmsgc) { |
3583
|
86
|
|
|
|
|
209
|
push(@$interval_list, [ord($1), ord($1)]); |
3584
|
|
|
|
|
|
|
} |
3585
|
|
|
|
|
|
|
|
3586
|
102
|
|
|
|
|
112
|
my $loop = 1; |
3587
|
102
|
|
|
|
|
392
|
while ($loop) { |
3588
|
177
|
50
|
|
|
|
420
|
if ($$str_ref =~ /\G ([^]]) - ([^]]) /xmsgc) { |
|
|
100
|
|
|
|
|
|
3589
|
0
|
|
|
|
|
0
|
push(@$interval_list, [ord($1), ord($2)]); |
3590
|
|
|
|
|
|
|
} |
3591
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G ([^]]) /xmsgc) { |
3592
|
75
|
|
|
|
|
177
|
push(@$interval_list, [ord($1), ord($1)]); |
3593
|
|
|
|
|
|
|
} |
3594
|
|
|
|
|
|
|
else { |
3595
|
102
|
|
|
|
|
188
|
$loop = 0; |
3596
|
|
|
|
|
|
|
} |
3597
|
|
|
|
|
|
|
} |
3598
|
|
|
|
|
|
|
|
3599
|
|
|
|
|
|
|
return |
3600
|
102
|
100
|
|
|
|
249
|
$neg |
3601
|
|
|
|
|
|
|
? cc_neg(interval_list_to_cc($interval_list)) |
3602
|
|
|
|
|
|
|
: interval_list_to_cc($interval_list) |
3603
|
|
|
|
|
|
|
; |
3604
|
|
|
|
|
|
|
} |
3605
|
|
|
|
|
|
|
|
3606
|
|
|
|
|
|
|
# Returns: |
3607
|
|
|
|
|
|
|
# - the empty list iff no quantification has been parsed |
3608
|
|
|
|
|
|
|
# - a 2-tuple ($min, $max) |
3609
|
|
|
|
|
|
|
# either $max is the empty string |
3610
|
|
|
|
|
|
|
# or $min <= $max |
3611
|
|
|
|
|
|
|
sub parse_quant { |
3612
|
271
|
|
|
271
|
0
|
279
|
my ($str_ref) = @_; |
3613
|
271
|
100
|
|
|
|
723
|
if ($$str_ref =~ /\G \* /xmsgc) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
3614
|
238
|
|
|
|
|
623
|
return (0, ''); |
3615
|
|
|
|
|
|
|
} |
3616
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G \+ /xmsgc) { |
3617
|
16
|
|
|
|
|
38
|
return (1, ''); |
3618
|
|
|
|
|
|
|
} |
3619
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G \? /xmsgc) { |
3620
|
8
|
|
|
|
|
20
|
return (0, 1); |
3621
|
|
|
|
|
|
|
} |
3622
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G \{ /xmsgc) { |
3623
|
9
|
|
|
|
|
12
|
my ($min, $max); |
3624
|
9
|
50
|
|
|
|
26
|
if ($$str_ref =~ /\G ( [0-9]+ ) /xmsgc) { |
3625
|
9
|
|
|
|
|
18
|
$min = $1; |
3626
|
9
|
100
|
|
|
|
26
|
if ($$str_ref =~ /\G , ([0-9]*) /xmsgc) { |
3627
|
8
|
|
|
|
|
12
|
$max = $1; # may be '' |
3628
|
8
|
50
|
66
|
|
|
44
|
if (length($max) && $min > $max) { |
3629
|
0
|
|
|
|
|
0
|
parse_die("$min > $max", $str_ref); |
3630
|
|
|
|
|
|
|
} |
3631
|
|
|
|
|
|
|
} |
3632
|
|
|
|
|
|
|
else { |
3633
|
1
|
|
|
|
|
2
|
$max = $min; |
3634
|
|
|
|
|
|
|
} |
3635
|
|
|
|
|
|
|
} |
3636
|
|
|
|
|
|
|
else { |
3637
|
0
|
|
|
|
|
0
|
parse_die('number expected', $str_ref); |
3638
|
|
|
|
|
|
|
} |
3639
|
|
|
|
|
|
|
|
3640
|
9
|
50
|
|
|
|
33
|
if ($$str_ref !~ /\G \} /xmsgc) { |
3641
|
0
|
|
|
|
|
0
|
parse_die('} expected', $str_ref); |
3642
|
|
|
|
|
|
|
} |
3643
|
9
|
|
|
|
|
25
|
return ($min, $max); |
3644
|
|
|
|
|
|
|
} |
3645
|
|
|
|
|
|
|
else { |
3646
|
0
|
|
|
|
|
0
|
return; |
3647
|
|
|
|
|
|
|
} |
3648
|
|
|
|
|
|
|
} |
3649
|
|
|
|
|
|
|
|
3650
|
|
|
|
|
|
|
=item quote($string) |
3651
|
|
|
|
|
|
|
|
3652
|
|
|
|
|
|
|
Returns $string with escaped special characters. |
3653
|
|
|
|
|
|
|
|
3654
|
|
|
|
|
|
|
=cut |
3655
|
|
|
|
|
|
|
|
3656
|
|
|
|
|
|
|
sub quote { |
3657
|
0
|
|
|
0
|
1
|
0
|
my ($str) = @_; |
3658
|
0
|
|
|
|
|
0
|
$str =~ s/([.\[\\(*+?{|^\$])/\\$1/xsmg; |
3659
|
0
|
|
|
|
|
0
|
return $str; |
3660
|
|
|
|
|
|
|
} |
3661
|
|
|
|
|
|
|
|
3662
|
|
|
|
|
|
|
{ |
3663
|
|
|
|
|
|
|
my %char_to_cc_cache; |
3664
|
|
|
|
|
|
|
sub parse_alternation { |
3665
|
600
|
|
|
600
|
0
|
557
|
my ($str_ref, $has_anchor_ref) = @_; |
3666
|
600
|
|
|
|
|
478
|
my @all_nfas; |
3667
|
|
|
|
|
|
|
my $loop; |
3668
|
0
|
|
|
|
|
0
|
my @quants; |
3669
|
600
|
|
|
|
|
482
|
do { |
3670
|
1005
|
|
|
|
|
861
|
$loop = 0; |
3671
|
1005
|
|
|
|
|
1031
|
my $nfa = []; |
3672
|
1005
|
|
|
|
|
856
|
my $next_state_index = 1; |
3673
|
1005
|
|
|
|
|
735
|
while (1) { |
3674
|
1631
|
100
|
|
|
|
6900
|
if ($$str_ref =~ /\G ( $ERE_literal + ) /xmsogc) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
3675
|
1146
|
|
66
|
|
|
3906
|
push(@$nfa, |
3676
|
|
|
|
|
|
|
map { |
3677
|
437
|
|
|
|
|
1237
|
[ 0, [[ |
3678
|
|
|
|
|
|
|
$char_to_cc_cache{$_} ||= char_to_cc($_) |
3679
|
|
|
|
|
|
|
, $next_state_index++ |
3680
|
|
|
|
|
|
|
]]] |
3681
|
|
|
|
|
|
|
} |
3682
|
|
|
|
|
|
|
split('', $1) |
3683
|
|
|
|
|
|
|
); |
3684
|
|
|
|
|
|
|
} |
3685
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G ( \. + ) /xmsgc) { |
3686
|
53
|
|
|
|
|
159
|
push(@$nfa, |
3687
|
|
|
|
|
|
|
map { |
3688
|
52
|
|
|
|
|
144
|
[ 0, [[ |
3689
|
|
|
|
|
|
|
$cc_any |
3690
|
|
|
|
|
|
|
, $next_state_index++ |
3691
|
|
|
|
|
|
|
]]] |
3692
|
|
|
|
|
|
|
} |
3693
|
|
|
|
|
|
|
(1..length($1)) |
3694
|
|
|
|
|
|
|
); |
3695
|
|
|
|
|
|
|
} |
3696
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G ( \[ ) /xmsgc) { |
3697
|
102
|
|
|
|
|
185
|
push(@$nfa, |
3698
|
|
|
|
|
|
|
[ 0, [[ |
3699
|
|
|
|
|
|
|
bracket_expression_to_cc($str_ref) |
3700
|
|
|
|
|
|
|
, $next_state_index++ |
3701
|
|
|
|
|
|
|
]]] |
3702
|
|
|
|
|
|
|
); |
3703
|
102
|
50
|
|
|
|
355
|
if ($$str_ref !~ /\G ] /xmsgc) { |
3704
|
0
|
|
|
|
|
0
|
parse_die('] expected', $str_ref); |
3705
|
|
|
|
|
|
|
} |
3706
|
|
|
|
|
|
|
} |
3707
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G \\ (.) /xmsgc) { |
3708
|
17
|
|
66
|
|
|
83
|
push(@$nfa, |
3709
|
|
|
|
|
|
|
[ 0, [[ |
3710
|
|
|
|
|
|
|
$char_to_cc_cache{$1} ||= char_to_cc($1) |
3711
|
|
|
|
|
|
|
, $next_state_index++ |
3712
|
|
|
|
|
|
|
]]] |
3713
|
|
|
|
|
|
|
); |
3714
|
|
|
|
|
|
|
} |
3715
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G \^ /xmsgc) { |
3716
|
9
|
|
|
|
|
25
|
push(@$nfa, |
3717
|
|
|
|
|
|
|
[ 0, [[ |
3718
|
|
|
|
|
|
|
$cc_beg |
3719
|
|
|
|
|
|
|
, $next_state_index++ |
3720
|
|
|
|
|
|
|
]]] |
3721
|
|
|
|
|
|
|
); |
3722
|
9
|
|
100
|
|
|
26
|
$$has_anchor_ref ||= 1; |
3723
|
|
|
|
|
|
|
} |
3724
|
|
|
|
|
|
|
elsif ($$str_ref =~ /\G \$ /xmsgc) { |
3725
|
9
|
|
|
|
|
28
|
push(@$nfa, |
3726
|
|
|
|
|
|
|
[ 0, [[ |
3727
|
|
|
|
|
|
|
$cc_end |
3728
|
|
|
|
|
|
|
, $next_state_index++ |
3729
|
|
|
|
|
|
|
]]] |
3730
|
|
|
|
|
|
|
); |
3731
|
9
|
|
100
|
|
|
35
|
$$has_anchor_ref ||= 1; |
3732
|
|
|
|
|
|
|
} |
3733
|
|
|
|
|
|
|
else { |
3734
|
1005
|
|
|
|
|
967
|
last; |
3735
|
|
|
|
|
|
|
} |
3736
|
|
|
|
|
|
|
} |
3737
|
|
|
|
|
|
|
|
3738
|
1005
|
100
|
|
|
|
1577
|
if (@$nfa) { |
3739
|
500
|
100
|
|
|
|
1025
|
if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) { |
3740
|
175
|
|
|
|
|
339
|
my $last_char_class = $$nfa[$#$nfa][1][0][0]; |
3741
|
175
|
100
|
|
|
|
331
|
if (@$nfa > 1) { |
3742
|
82
|
|
|
|
|
92
|
@{$$nfa[$#$nfa]} = (1, []); |
|
82
|
|
|
|
|
188
|
|
3743
|
82
|
|
|
|
|
108
|
push(@all_nfas, $nfa); |
3744
|
|
|
|
|
|
|
} |
3745
|
175
|
|
|
|
|
339
|
push(@quants, [scalar(@all_nfas), parse_quant($str_ref)]); |
3746
|
175
|
|
|
|
|
484
|
push(@all_nfas, [[0, [[$last_char_class, 1 ]]], [1, []]]); |
3747
|
175
|
|
|
|
|
218
|
$loop = 1; |
3748
|
|
|
|
|
|
|
} |
3749
|
|
|
|
|
|
|
else { |
3750
|
325
|
|
|
|
|
545
|
push(@$nfa, [1, []]); |
3751
|
325
|
|
|
|
|
364
|
push(@all_nfas, $nfa); |
3752
|
|
|
|
|
|
|
} |
3753
|
|
|
|
|
|
|
} |
3754
|
|
|
|
|
|
|
|
3755
|
1005
|
100
|
|
|
|
2633
|
if ($$str_ref =~ /\G \( /xmsgc) { |
3756
|
238
|
|
|
|
|
422
|
$nfa = _ere_to_nfa($str_ref, $has_anchor_ref); |
3757
|
238
|
50
|
|
|
|
836
|
if ($$str_ref !~ /\G \) /xmsgc) { |
3758
|
0
|
|
|
|
|
0
|
parse_die(') expected', $str_ref); |
3759
|
|
|
|
|
|
|
} |
3760
|
238
|
100
|
|
|
|
600
|
if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) { |
3761
|
96
|
|
|
|
|
191
|
push(@quants, [scalar(@all_nfas), parse_quant($str_ref)]); |
3762
|
|
|
|
|
|
|
} |
3763
|
238
|
|
|
|
|
277
|
push(@all_nfas, $nfa); |
3764
|
238
|
|
|
|
|
456
|
$loop = 1; |
3765
|
|
|
|
|
|
|
} |
3766
|
|
|
|
|
|
|
} while ($loop); |
3767
|
|
|
|
|
|
|
|
3768
|
600
|
|
|
|
|
791
|
for (@quants) { |
3769
|
271
|
|
|
|
|
406
|
my ($i, $min, $max) = @$_; |
3770
|
271
|
|
66
|
|
|
1355
|
$all_nfas[$i] = nfa_quant( |
|
|
|
66
|
|
|
|
|
3771
|
|
|
|
|
|
|
$all_nfas[$i] |
3772
|
|
|
|
|
|
|
, $min, $max |
3773
|
|
|
|
|
|
|
, $min && $i != 0 && _nfa_has_suffix($all_nfas[$i-1]) |
3774
|
|
|
|
|
|
|
, $min && $i != $#all_nfas && _nfa_has_prefix($all_nfas[$i+1]) |
3775
|
|
|
|
|
|
|
); |
3776
|
|
|
|
|
|
|
} |
3777
|
|
|
|
|
|
|
|
3778
|
600
|
100
|
|
|
|
1060
|
if (@all_nfas > 1) { |
|
|
100
|
|
|
|
|
|
3779
|
209
|
|
|
|
|
352
|
return nfa_concat(@all_nfas); |
3780
|
|
|
|
|
|
|
} |
3781
|
|
|
|
|
|
|
elsif (@all_nfas) { |
3782
|
310
|
|
|
|
|
979
|
return $all_nfas[0]; |
3783
|
|
|
|
|
|
|
} |
3784
|
|
|
|
|
|
|
else { |
3785
|
81
|
|
|
|
|
290
|
return [[1, []]]; |
3786
|
|
|
|
|
|
|
} |
3787
|
|
|
|
|
|
|
} |
3788
|
|
|
|
|
|
|
} |
3789
|
|
|
|
|
|
|
|
3790
|
|
|
|
|
|
|
sub _nfa_has_prefix { |
3791
|
19
|
|
|
19
|
|
56
|
my ($nfa) = @_; |
3792
|
|
|
|
|
|
|
# initial state non-accepting or no loop back to it |
3793
|
19
|
|
33
|
|
|
99
|
!$$nfa[0][0] || !grep { $$_[1] == 0 } map { @{$$_[1]} } @$nfa; |
3794
|
|
|
|
|
|
|
} |
3795
|
|
|
|
|
|
|
|
3796
|
|
|
|
|
|
|
sub _nfa_has_suffix { |
3797
|
23
|
|
|
23
|
|
20
|
my ($nfa) = @_; |
3798
|
|
|
|
|
|
|
# all accepting states are final |
3799
|
23
|
100
|
|
|
|
28
|
!grep { $$_[0] && @{$$_[1]} } @$nfa |
|
55
|
|
|
|
|
108
|
|
|
23
|
|
|
|
|
167
|
|
3800
|
|
|
|
|
|
|
} |
3801
|
|
|
|
|
|
|
|
3802
|
|
|
|
|
|
|
sub parse_die { |
3803
|
0
|
|
|
0
|
0
|
0
|
my ($msg, $str_ref) = @_; |
3804
|
0
|
|
0
|
|
|
0
|
die("malformed regex: $msg at " |
3805
|
|
|
|
|
|
|
. (pos($$str_ref) || 0) . " in $$str_ref"); |
3806
|
|
|
|
|
|
|
} |
3807
|
|
|
|
|
|
|
|
3808
|
|
|
|
|
|
|
|
3809
|
|
|
|
|
|
|
############################################################################## |
3810
|
|
|
|
|
|
|
# Shorthands |
3811
|
|
|
|
|
|
|
############################################################################## |
3812
|
|
|
|
|
|
|
|
3813
|
|
|
|
|
|
|
=back |
3814
|
|
|
|
|
|
|
|
3815
|
|
|
|
|
|
|
=head2 Shorthands |
3816
|
|
|
|
|
|
|
|
3817
|
|
|
|
|
|
|
=over 4 |
3818
|
|
|
|
|
|
|
|
3819
|
|
|
|
|
|
|
=item C |
3820
|
|
|
|
|
|
|
:= C |
3821
|
|
|
|
|
|
|
|
3822
|
|
|
|
|
|
|
=cut |
3823
|
|
|
|
|
|
|
|
3824
|
|
|
|
|
|
|
sub ere_to_tree { |
3825
|
0
|
|
|
0
|
1
|
0
|
my ($ere) = @_; |
3826
|
0
|
|
|
|
|
0
|
return nfa_to_tree(ere_to_nfa($ere)); |
3827
|
|
|
|
|
|
|
} |
3828
|
|
|
|
|
|
|
|
3829
|
|
|
|
|
|
|
=item C |
3830
|
|
|
|
|
|
|
:= C |
3831
|
|
|
|
|
|
|
|
3832
|
|
|
|
|
|
|
=cut |
3833
|
|
|
|
|
|
|
|
3834
|
|
|
|
|
|
|
sub ere_to_regex { |
3835
|
0
|
|
|
0
|
1
|
0
|
my ($ere, $to_perlre) = (@_, 0); |
3836
|
0
|
|
|
|
|
0
|
return tree_to_regex(ere_to_tree($ere), $to_perlre); |
3837
|
|
|
|
|
|
|
} |
3838
|
|
|
|
|
|
|
|
3839
|
|
|
|
|
|
|
=item C |
3840
|
|
|
|
|
|
|
:= C |
3841
|
|
|
|
|
|
|
|
3842
|
|
|
|
|
|
|
=cut |
3843
|
|
|
|
|
|
|
|
3844
|
|
|
|
|
|
|
sub nfa_to_regex { |
3845
|
119
|
|
|
119
|
1
|
298
|
my ($nfa, $to_perlre) = (@_, 0); |
3846
|
119
|
|
|
|
|
265
|
return tree_to_regex(nfa_to_tree($nfa), $to_perlre); |
3847
|
|
|
|
|
|
|
} |
3848
|
|
|
|
|
|
|
|
3849
|
|
|
|
|
|
|
=item C |
3850
|
|
|
|
|
|
|
:= C |
3851
|
|
|
|
|
|
|
|
3852
|
|
|
|
|
|
|
=cut |
3853
|
|
|
|
|
|
|
|
3854
|
|
|
|
|
|
|
sub ere_to_input_constraints { |
3855
|
0
|
|
|
0
|
1
|
0
|
my ($ere) = @_; |
3856
|
0
|
|
|
|
|
0
|
return tree_to_input_constraints(ere_to_tree($ere)); |
3857
|
|
|
|
|
|
|
} |
3858
|
|
|
|
|
|
|
|
3859
|
|
|
|
|
|
|
=item C |
3860
|
|
|
|
|
|
|
:= C |
3861
|
|
|
|
|
|
|
|
3862
|
|
|
|
|
|
|
=cut |
3863
|
|
|
|
|
|
|
|
3864
|
|
|
|
|
|
|
sub nfa_to_input_constraints { |
3865
|
12
|
|
|
12
|
1
|
81
|
my ($nfa) = @_; |
3866
|
12
|
|
|
|
|
27
|
return tree_to_input_constraints(nfa_to_tree($nfa)); |
3867
|
|
|
|
|
|
|
} |
3868
|
|
|
|
|
|
|
|
3869
|
|
|
|
|
|
|
=item C |
3870
|
|
|
|
|
|
|
:= C |
3871
|
|
|
|
|
|
|
|
3872
|
|
|
|
|
|
|
=cut |
3873
|
|
|
|
|
|
|
|
3874
|
|
|
|
|
|
|
sub nfa_to_min_dfa { |
3875
|
207
|
|
|
207
|
1
|
28396
|
my ($nfa) = @_; |
3876
|
207
|
|
|
|
|
426
|
return dfa_to_min_dfa(nfa_to_dfa($nfa)); |
3877
|
|
|
|
|
|
|
} |
3878
|
|
|
|
|
|
|
|
3879
|
|
|
|
|
|
|
1; |
3880
|
|
|
|
|
|
|
|
3881
|
|
|
|
|
|
|
=back |
3882
|
|
|
|
|
|
|
|
3883
|
|
|
|
|
|
|
=head1 AUTHOR |
3884
|
|
|
|
|
|
|
|
3885
|
|
|
|
|
|
|
Loïc Jonas Etienne |
3886
|
|
|
|
|
|
|
|
3887
|
|
|
|
|
|
|
=head1 COPYRIGHT and LICENSE |
3888
|
|
|
|
|
|
|
|
3889
|
|
|
|
|
|
|
Artistic License 2.0 |
3890
|
|
|
|
|
|
|
http://www.perlfoundation.org/artistic_license_2_0 |