File Coverage

blib/lib/Regexp/ERE.pm

Criterion	Covered	Total	%
statement	1296	1412	91.7
branch	541	632	85.6
condition	258	323	79.8
subroutine	59	65	90.7
pod	23	42	54.7
total	2177	2474	88.0

line	stmt	bran	cond	sub	pod	time	code
1	8			8		5982	use 5.008008;
	8					19
	8					269
2	8			8		78	use strict;
	8					10
	8					194
3	8			8		23	use warnings;
	8					11
	8					160
4	8			8		4134	use integer;
	8					64
	8					29
5
6							package Regexp::ERE;
7							our $VERSION = '0.04';
8
9							BEGIN {
10	8			8		398	use Exporter ();
	8					11
	8					468
11	8			8		14	our (@ISA, @EXPORT_OK);
12	8					71	@ISA = qw(Exporter);
13	8					529	@EXPORT_OK = qw(
14							&ere_to_nfa
15							&ere_to_tree
16							&ere_to_regex
17							&ere_to_input_constraints
18							&nfa_to_tree
19							&nfa_to_regex
20							&nfa_to_input_constraints
21							&nfa_clone
22							&nfa_concat
23							&nfa_union
24							&nfa_inter
25							&nfa_match
26							&nfa_quant
27							&nfa_isomorph
28							&nfa_to_dfa
29							&dfa_to_min_dfa
30							&nfa_to_min_dfa
31							&tree_to_regex
32							&tree_to_input_constraints
33							&char_to_cc
34							&interval_list_to_cc
35							&cc_union
36							"e
37							);
38							}
39
40							=encoding utf8
41
42							=head1 NAME
43
44							Regexp::ERE - extended regular expressions and finite automata
45
46							=head1 SYNOPSIS
47
48							use Regexp::ERE qw(
49							&ere_to_nfa
50							&nfa_inter
51							&nfa_to_regex
52							&nfa_to_input_constraints
53							&nfa_to_dfa
54							&dfa_to_min_dfa
55							);
56
57							# condition 1: begins with abc or def
58							my $nfa1 = ere_to_nfa('^(abc\|def)');
59
60							# condition 2: ends with 123 or 456
61							my $nfa2 = ere_to_nfa('(123\|456)$');
62
63							# condition 1 and condition 2
64							my $inter_nfa = nfa_inter($nfa1, $nfa2);
65
66							# compute extended regular expression (string)
67							my $ere = nfa_to_regex($inter_nfa);
68
69							# compute perl regular expression
70							my $perlre = nfa_to_regex($inter_nfa, 1);
71
72							# compute weaker input constraints suitable for widgets
73							my ($input_constraints, $split_perlre)
74							= nfa_to_input_constraints($inter_nfa);
75
76							# minimal dfa (simpler regular expression happens to result)
77							my $nfa3 = ere_to_nfa('^(a\|ab\|b)*$');
78							my $dfa3 = nfa_to_dfa($nfa3);
79							my $min_dfa3 = dfa_to_min_dfa($dfa3);
80							my $ere3 = nfa_to_regex($min_dfa3);
81
82							=head1 DESCRIPTION
83
84							Pure-perl module for:
85
86							=over 4
87
88							=item *
89
90							Parsing POSIX Extended Regular Expressions (C<$ere>) into
91							Non-Deterministic Finite Automata (C<$nfa>)
92
93							=item *
94
95							Manipulating C<$nfa>s (concatenating, or-ing, and-ing)
96
97							=item *
98
99							Computing Deterministic Finite Automata (C<$dfa>s) from C<$nfa>s
100							(powerset construction)
101
102							=item *
103
104							Computing minimal C<$dfa>s from C<$dfa>s (Hopcroft's algorithm)
105
106							=item *
107
108							Computing C<$ere>s or Perl Regular Expressions from C<$nfa> or C<$dfa>
109							(Warshall algorithm)
110
111							=item *
112
113							Heuristically deriving (possibly weaker) constraints from a C<$nfa> or C<$dfa>
114							suitable for display in a graphical user interface,
115							i.e. a sequence of widgets of type 'free text' and 'drop down';
116
117							Example: C<'^(abc\|def)'> => C<$nfa> => C<[['abc', 'def'], 'free text']>
118
119							=back
120
121							=head1 GLOSSARY AND CONVERSIONS OVERVIEW
122
123							=head2 Conversions overview
124
125							$ere -> $nfa -> $tree -> $regex ($ere or $perlre)
126							-> $input_constraints
127
128							The second argument of -> $regex conversions is an optional boolean,
129							true : conversion to a compiled perl regular expression
130							false: conversion to an ere string
131
132							The -> $input_constraints conversions return a pair (
133							$input_constraints: aref as described at tree_to_input_constraints()
134							$split_perlre : a compiled perl regular expression
135							)
136
137
138							=head2 Glossary
139
140							=over 4
141
142							=item $char_class
143
144							A set of unicode characters.
145
146							=item $ere
147
148							Extended regular expression (string).
149							See C for the exact syntax.
150
151							=item $perlre
152
153							Perl regular expression
154
155							=item $nfa
156
157							Non-deterministic finite automaton
158
159							=item $dfa
160
161							Deterministic finite automaton (special case of C<$nfa>)
162
163							=item $tree
164
165							Intermediate hierarchical representation of a regular expression
166							(which still can be manipulated before stringification),
167							similar to a parse tree (but used for generating, not for parsing).
168
169							=item $input_constraints
170
171							Ad-hoc data structure representing a list of gui-widgets
172							(free text fields and drop-down lists),
173							a helper for entering inputs
174							conforming to a given C<$nfa>.
175
176							=back
177
178							=cut
179
180
181							##############################################################################
182							# Config
183							##############################################################################
184
185							# If true, nfa_to_tree() always expands concatenated alternations.
186							# Example: (ab\|cd) (ef\|gh) -> (abef\|abgh\|cdef\|cdgh)
187							our $TREE_CONCAT_FULL_EXPAND = 0;
188
189							# If true, prefixes and suffixes are factorized out even for
190							# trees with a single alternation.
191							# Example: (a1b\|a2b) -> a(1\|2)b
192							our $FULL_FACTORIZE_FIXES = 0;
193
194							# Should be 0. Else, traces nfa_to_tree() on STDERR.
195							use constant {
196	8					577	TRACE_NFA_TO_TREE => 0
197	8			8		35	};
	8					13
198
199							use constant {
200	8					1559	MAX_CHAR => 0x10FFFF
201							, CHAR_CLASS => 'cc' # for blessing $char_classes (label only, no methods)
202	8			8		32	};
	8					8
203
204
205							=head1 DATA STRUCTURES AND SUBROUTINES
206
207							Each of the documented subroutines can be imported,
208							for instance C.
209
210							=cut
211
212
213							##############################################################################
214							# $char_class
215							##############################################################################
216
217							=head2 Character class
218
219
220							WARNING: C<$char_class>es must be created exclusively by
221							C
222							or C
223							for equivalent character classes to be always the same array reference.
224							For the same reason, C<$char_class>es must never be mutated.
225
226							In this implementation, the state transitions of a C<$nfa> are based upon
227							character classes (not single characters). A character class is an ordered
228							list of disjoint, non-mergeable intervals (over unicode code points,
229							i.e. positive integers).
230
231							$char_class = [
232							[ $low_0, $high_0 ] # $interval_0
233							, [ $low_1, $high_1 ] # $interval_1
234							, ...
235							]
236
237
238							Constraints:
239
240							1: 0 <= $$char_class[$i][0] (0 <= low)
241							2: $$char_class[$i][1] <= MAX_CHAR (high <= MAX_CHAR)
242							3: $$char_class[$i][0] <= $$char_class[$i][1] (low <= high)
243							4: $$char_class[$i][1] + 1 < $$char_class[$i+1][0] (non mergeable)
244
245
246							Exceptions (anchors used only in the parsing phase only):
247
248							begin : [ -2, -1 ]
249							end : [ -3, -2 ]
250							begin or end : [ -3, -1 ]
251
252							Immediately after parsing, such pseudo-character classes
253							are removed by C (internal subroutine).
254
255							=over 4
256
257							=cut
258
259							our $ERE_literal = qr/ [^.[\\()*+?{\|^\$] /xms;
260							our $PERLRE_char_class_special = qr/ [\[\]\\\^\-] /xms;
261
262							our $cc_any = bless([[ 0, MAX_CHAR ]], CHAR_CLASS);
263							our $cc_none = bless([], CHAR_CLASS);
264							our $cc_beg = bless([[ -2, -1]], CHAR_CLASS);
265							our $cc_end = bless([[ -3, -2]], CHAR_CLASS);
266							{
267
268	8			8		38	no warnings qw(utf8); # in particular for 0x10FFFF
	8					9
	8					15499
269
270							my %cc_cache;
271							# keys: join(',',1,map{@$_}@{$char_class})
272
273							for ($cc_any, $cc_none, $cc_beg, $cc_end) {
274							$cc_cache{ join(',', 1, map {@$_} @$_) } = $_;
275							}
276
277							=item char_to_cc($c)
278
279							Returns the unique C<$char_class> equivalent to C<[[ord($c), ord($c)]]>.
280
281							=cut
282
283							sub char_to_cc {
284	857		100	857	1	192470	return $cc_cache{ join(',', 1, (ord($_[0])) x 2) }
285							\|\|= bless([[ord($_[0]), ord($_[0])]], CHAR_CLASS);
286							}
287
288							# $interval_list is the same data structure as $char_class.
289							# Constraints 1, 2 are assumed.
290							# Constraints 3, 4 are enforced.
291
292							=item interval_list_to_cc($interval_list)
293
294							C<$interval_list> is an arbitrary list of intervals.
295							Returns the unique C<$char_class> whose reunion of intervals
296							is the same set as the reunion of the intervals of C<$interval_list>.
297
298							Example:
299
300							interval_list_to_cc([[102, 112], [65, 90], [97, 102], [113, 122]])
301							returns [[65, 90], [97, 122]]
302							(i.e [f-p]\|[A-Z]\|[a-f]\|[q-z] => [A-Z]\|[a-z])
303
304							Note that both C<$interval_list> and C<$char_class> are lists of intervals,
305							but only C<$char_class> obeys the constraints above,
306							while C<$interval_list> does not.
307
308							Remark also that C is the identity
309							(returns the same reference as given) on C<$char_class>es returned
310							by either C or C.
311
312							=cut
313
314							sub interval_list_to_cc {
315	6031			6031	1	4883	my ($interval_list) = @_;
316							my @sorted
317	7353					7380	= sort { $$a[0] <=> $$b[0] }
	10004					15651
318	6031					5656	grep { $$_[0] <= $$_[1] }
319							@$interval_list
320							;
321	6031					7313	my $char_class = bless([], CHAR_CLASS);
322	6031					4879	my $i = 0;
323	6031					9038	while ($i != @sorted) {
324	6939					5528	my $interval = $sorted[$i];
325	6939					4794	$i++;
326	6939		100			14390	while ($i != @sorted && $$interval[1] + 1 >= $sorted[$i][0]) {
327	3065	100				4458	if ($$interval[1] < $sorted[$i][1]) {
328	3064					3118	$$interval[1] = $sorted[$i][1];
329							}
330	3065					7705	$i++;
331							}
332	6939					12168	push(@$char_class, $interval);
333							}
334	6031		66			6165	return $cc_cache{ join(',', 1, map {@$_} @$char_class) }
	6939					23469
335							\|\|= $char_class;
336							}
337
338							sub cc_neg {
339	1260			1260	0	1125	my ($char_class) = @_;
340
341	1260	100				2295	if (!@$char_class) { return $cc_any; }
	85					328
342
343	1175					1494	my $neg = bless([], CHAR_CLASS);
344	1175	100				2205	if ($$char_class[0][0] != 0) {
345	1107					1967	push(@$neg, [0, $$char_class[0][0] - 1]);
346							}
347	1175					973	my $i = 0;
348	1175					2151	while ($i != $#$char_class) {
349	194					434	push(@$neg, [$$char_class[$i][1] + 1, $$char_class[$i+1][0] - 1]);
350	194					324	$i++;
351							}
352	1175	100				1958	if ($$char_class[$i][1] != MAX_CHAR) {
353	1107					1764	push(@$neg, [$$char_class[$i][1] + 1, MAX_CHAR]);
354							}
355	1175		66			1402	return $cc_cache{ join(',', 1, map{@$_} @$neg) } \|\|= $neg;
	2408					11433
356							}
357
358							sub cc_inter2 {
359	440			440	0	385	my ($char_class_0, $char_class_1) = @_;
360
361	440					617	my $inter = bless([], CHAR_CLASS);
362	440					326	my $i_0 = 0;
363	440					343	my $i_1 = 0;
364	440		100			1342	while ($i_0 < @$char_class_0 && $i_1 < @$char_class_1) {
365
366							# skip interval_0 if interval_0 < interval_1
367	543		66			2526	while (
			100
368							$i_0 < @$char_class_0
369							&& $i_1 < @$char_class_1
370							&& $$char_class_0[$i_0][1] < $$char_class_1[$i_1][0]
371							) {
372	367					1395	$i_0++;
373							}
374
375							# skip interval_1 if interval_1 < interval_0
376	543		100			2240	while (
			100
377							$i_0 < @$char_class_0
378							&& $i_1 < @$char_class_1
379							&& $$char_class_1[$i_1][1] < $$char_class_0[$i_0][0]
380							) {
381	192					637	$i_1++;
382							}
383
384							# Check that the exit condition of the first while still holds.
385	543	100	100			2376	if (
			100
386							$i_0 < @$char_class_0
387							&& $i_1 < @$char_class_1
388							&& $$char_class_1[$i_1][0] <= $$char_class_0[$i_0][1]
389							) {
390							# The exit conditions of both whiles hold:
391							#
392							# $$char_class_0[$i_0][1] >= $$char_class_1[$i_1][0]
393							# && $$char_class_1[$i_1][1] >= $$char_class_0[$i_0][0]
394							#
395							# short:
396							# high_0 >= low_1
397							# high_1 >= low_0
398							#
399							# furthermore:
400							# high_0 >= low_0
401							# high_1 >= low_1
402							#
403							# with:
404							# min_high := min(high_0, high_1)
405							# max_low := max(low_0, low_1)
406							#
407							# holds:
408							# min_high >= max_low_0
409
410	291					209	my ($interval_0_done, $interval_1_done);
411
412	291	100				518	my $max_low =
413							$$char_class_0[$i_0][0] > $$char_class_1[$i_1][0]
414							? $$char_class_0[$i_0][0]
415							: $$char_class_1[$i_1][0]
416							;
417
418	291					184	my $min_high;
419	291	100				477	if ($$char_class_0[$i_0][1] <= $$char_class_1[$i_1][1]) {
420	186					152	$min_high = $$char_class_0[$i_0][1];
421							# interval_0 < next interval_1
422	186					159	$interval_0_done = 1;
423							}
424	291	100				499	if ($$char_class_1[$i_1][1] <= $$char_class_0[$i_0][1]) {
425	208					193	$min_high = $$char_class_1[$i_1][1];
426							# interval_1 < next interval_0
427	208					149	$interval_1_done = 1;
428							}
429	291	100				391	if ($interval_0_done) { $i_0++; }
	186					133
430	291	100				352	if ($interval_1_done) { $i_1++; }
	208					145
431
432	291					1057	push(@$inter, [$max_low, $min_high]);
433							}
434							}
435	440		33			1256	return $cc_cache{ join(',', 1, map{@$_} @$inter) } \|\|=$inter;
	291					1379
436							}
437							}
438
439							sub cc_match {
440	117555			117555	0	91132	my ($char_class, $c) = @_;
441	117555					99276	for my $interval (@$char_class) {
442	138519	100				228247	if ($c < $$interval[0]) {
		100
443	46706					84021	return 0;
444							}
445							elsif ($c <= $$interval[1]) {
446	27591					52211	return 1;
447							}
448							}
449	43258					75823	return 0;
450							}
451
452							=item cc_union(@char_classes)
453
454							Returns the unique C<$char_class> containing all characters of all given
455							C<@char_classes>.
456
457							=cut
458
459							sub cc_union {
460	2098			2098	1	3225	return interval_list_to_cc( [ map { map { [@$_] } @$_ } @_ ] );
	4964					4103
	5492					9557
461							}
462
463							sub cc_is_subset {
464	56			56	0	64	my ($char_class_0, $char_class_1) = @_;
465	56					66	for my $c ( map { @$_ } @$char_class_0 ) {
	59					102
466	89	100				113	if (!cc_match($char_class_1, $c)) { return 0; }
	32					122
467							}
468	24					66	return 1;
469							}
470
471							# $to_perlre (boolean)
472							# true : perl syntax
473							# false: ere syntax
474							sub cc_to_regex {
475	564			564	0	615	my ($char_class, $to_perlre) = (@_, 0);
476
477	564					424	my @items;
478	564	50	66			1872	if (@$char_class && $$char_class[0][0] < 0) {
479	0	0				0	if ($$char_class[0][0] == -2) {
480	0	0				0	if ($$char_class[0][1] == -1) {
481	0					0	push(@items, '^');
482							}
483							else {
484	0					0	push(@items, '^$');
485							}
486							}
487							else {
488	0	0				0	if ($$char_class[0][1] == -2) {
489	0					0	push(@items, '$');
490							}
491							else {
492	0					0	push(@items, '^', '$');
493							}
494							}
495	0					0	$char_class = [@$char_class[1..$#$char_class]];
496							}
497	564	100				759	if (@$char_class) {
498	524	100	100			1625	if (
		100	100
		100	66
499							@$char_class == 1
500							&& $$char_class[0][0] == $$char_class[0][1]
501							) {
502	491					537	my $c = chr($$char_class[0][0]);
503	491	100				498	if ($to_perlre) {
504	115					154	push(@items, quotemeta($c))
505							}
506							else {
507	376	100				1117	push(@items,
508							$c =~ /$ERE_literal/o
509							? $c
510							: "\\$c"
511							);
512							}
513							}
514							elsif (
515							@$char_class == 1
516							&& $$char_class[0][0] == 0
517							&& $$char_class[0][1] == MAX_CHAR
518							) {
519	4					8	push(@items, '.');
520							}
521							elsif ($$char_class[$#$char_class][1] == MAX_CHAR) {
522	8	100				15	if ($to_perlre) {
523	2					4	push(@items,
524							'[^' . _cc_to_perlre(cc_neg($char_class)) . ']'
525							);
526							}
527							else {
528	6					17	push(@items,
529							'[^' . _cc_to_ere(cc_neg($char_class)) . ']'
530							);
531							}
532							}
533							else {
534	21	100				35	if ($to_perlre) {
535	18					35	push(@items, '[' . _cc_to_perlre($char_class) . ']');
536							}
537							else {
538	3					11	push(@items, '[' . _cc_to_ere($char_class) . ']');
539							}
540							}
541							}
542
543	564					411	my $regex;
544	564	100				951	if (@items == 0) {
		50
545	40					139	return '';
546							}
547							elsif (@items == 1) {
548	524					1935	return $items[0];
549							}
550							else {
551	0	0				0	if ($to_perlre) {
552	0					0	return '(?:' . join('\|', @items) . ')';
553							}
554							else {
555	0					0	return '(' . join('\|', @items) . ')';
556							}
557							}
558							}
559
560							sub _cc_to_ere {
561	9			9		11	my ($char_class) = @_;
562	9					11	my $has_minus;
563							my $has_r_bracket;
564							my $ere = join('',
565							map {
566	9	100				14	if ($$_[0] == $$_[1]) {
	11					30
567	6	50				37	if ($$_[0] == ord('-')) {
		50
568	0					0	$has_minus = 1;
569	0					0	'';
570							}
571							elsif ($$_[0] == ord(']')) {
572	0					0	$has_r_bracket = 1;
573	0					0	'';
574							}
575							else {
576	6					17	chr($$_[0]);
577							}
578							}
579							else {
580	5	50	33			28	if (
581							$$_[0] == ord('-')
582							\|\| $$_[0] == ord(']')
583							) {
584	0	0				0	if ($$_[0] == ord('-')) {
585	0					0	$has_minus = 1;
586							}
587							else {
588	0					0	$has_r_bracket = 1;
589							}
590	0	0				0	if ($$_[1] == $$_[0] + 1) {
		0
591	0					0	chr($$_[1]);
592							}
593							elsif ($$_[1] == $$_[0] + 2) {
594	0					0	chr($$_[0] + 1) . chr($$_[1]);
595							}
596							else {
597	0					0	chr($$_[0] + 1) . '-' . chr($$_[1]);
598							}
599							}
600							else {
601	5	100				17	if ($$_[1] == $$_[0] + 1) {
602	4					19	chr($$_[0]) . chr($$_[1]);
603							}
604							else {
605	1					4	chr($$_[0]) . '-' . chr($$_[1]);
606							}
607							}
608							}
609							}
610							@$char_class
611							);
612	9	50				24	if ($has_minus) { $ere .= '-'; }
	0					0
613	9	50				18	if ($has_r_bracket) { $ere = "]$ere"; }
	0					0
614	9					23	return $ere;
615							}
616
617							sub _cc_to_perlre {
618	20			20		23	my ($char_class) = @_;
619							return join('',
620							map {
621	20	100				30	if ($$_[0] == $$_[1]) {
	49					104
622	44					55	my $c = chr($$_[0]);
623	44	50				141	$c =~ /$PERLRE_char_class_special/o ? "\\$c" : $c;
624							}
625							else {
626	5					14	my ($c1, $c2) = (chr($$_[0]), chr($$_[1]));
627	5	50				47	($c1 =~ /$PERLRE_char_class_special/o ? "\\$c1" : $c1)
		100
		50
628							. ($$_[0] + 1 < $$_[1] ? '-' : '')
629							. ($c2 =~ /$PERLRE_char_class_special/o ? "\\$c2" : $c2)
630							}
631							} @$char_class
632							);
633							}
634
635
636							##############################################################################
637							# $nfa
638							##############################################################################
639
640							=back
641
642							=head2 Nfa
643
644
645							WARNING: C routines are destructive,
646							the C<$nfa> references given as arguments will not be valid C<$nfa> any more.
647							Furthermore, the same C<$nfa> reference must be used only once as argument.
648							For instance, for concatenating a C<$nfa> with itself, C
649							does not work; instead, C must be used;
650							or even C if the original
651							C<$nfa> is to be used further.
652
653							$nfa = [ $state_0, $state_1, ... ]
654
655							$state = [
656							$accepting
657							, $transitions
658							]
659
660							$transitions = [
661							[ $char_class_0 => $state_ind_0 ]
662							, [ $char_class_1 => $state_ind_1 ]
663							, ...
664							]
665
666							In the same C<$transition>, C<$state_ind_i> are pairwise different and are
667							valid indexes of C<@$nfa>. There is exactly one initial state at index 0.
668
669							=over 4
670
671							=item C
672
673							Maps each of the given C<@nfas> to a clone.
674
675							=cut
676
677							sub nfa_clone {
678							return
679	298			298	1	335	map { [
	600					1170
680	344					374	map { [
681							$$_[0] # accepting
682	826					685	, [ map { [ @$_ ] } @{$$_[1]} ] # transitions
	826					1411
683							] }
684							@$_ # states of the $nfa
685							] } @_ # list of $nfas
686							;
687							}
688
689							sub _transitions_is_subset {
690	461			461		498	my ($transitions_0, $transitions_1, $state_ind_map) = @_;
691	521	100	66			2630	my %state_ind_to_t_1
692	461					481	= map {(
693							$state_ind_map && exists($$state_ind_map{$$_[1]})
694							? $$state_ind_map{$$_[1]}
695							: $$_[1]
696							=> $_
697							)}
698							@$transitions_1
699							;
700	461					728	for my $t_0 (@$transitions_0) {
701	212	100	66			829	my $state_ind_0
702							= $state_ind_map && exists($$state_ind_map{$$t_0[1]})
703							? $$state_ind_map{$$t_0[1]}
704							: $$t_0[1]
705							;
706	212	100				372	if (!exists($state_ind_to_t_1{$state_ind_0})) { return 0; }
	185					931
707	27					31	my $t_1 = $state_ind_to_t_1{$state_ind_0};
708	27	100				59	if (!cc_is_subset($$t_0[0], $$t_1[0])) { return 0; }
	17					72
709							}
710	259					553	return 1;
711							}
712
713							# The keys of %$state_ind_to_equiv are state_inds of @$nfa to be removed.
714							# State indexes in transitions are remapped following %$state_ind_to_equiv.
715							# A state index mapped to itself denotes an unreachable state index.
716							sub _nfa_shrink_equiv {
717	835			835		966	my ($nfa, $state_ind_to_equiv) = @_;
718	835					795	my $i = 0;
719	3927					6625	my %compact_map
720	5190					7293	= map { ($_ => $i++) }
721							my @active_state_inds
722	835					1295	= grep { !exists($$state_ind_to_equiv{$_}) }
723							(0..$#$nfa)
724							;
725
726	835					1342	my %equiv_index_to_char_classes;
727							my %plain_index_to_char_class;
728	835					2682	for (@$nfa = @$nfa[@active_state_inds]) {
729
730							# update $state_ind
731							# -> $compact_map{$state_ind}
732							# or $compact_map{$$state_ind_to_equiv{$state_ind}}
733	3927					3529	%equiv_index_to_char_classes = ();
734	3927					3362	%plain_index_to_char_class = ();
735	3927					2803	for (@{$$_[1]}) { # transition list
	3927					4768
736	7095	100				8948	if (exists($$state_ind_to_equiv{$$_[1]})) {
737							push(
738	1018					759	@{$equiv_index_to_char_classes{
739	1018					2586	$$_[1]
740							= $compact_map{$$state_ind_to_equiv{$$_[1]}}
741							}}
742							, $$_[0]
743							);
744							}
745							else {
746							$plain_index_to_char_class{
747	6077					9605	$$_[1]
748							= $compact_map{$$_[1]}
749							} = $$_[0];
750							}
751							}
752							# merge char_classes to the same state index
753	3927	100				6774	if (keys(%equiv_index_to_char_classes)) {
754	804					1958	@{$$_[1]} = ((
	115					194
755	811					2042	map {[
756							exists($equiv_index_to_char_classes{$_})
757							? cc_union(
758							$plain_index_to_char_class{$_}
759	641	100				1353	, @{$equiv_index_to_char_classes{$_}}
760							)
761							: $plain_index_to_char_class{$_}
762							, $_
763							]}
764							keys(%plain_index_to_char_class)
765							) , (
766	926					1387	map {[
767	16					32	@{$equiv_index_to_char_classes{$_}} == 1
768							? $equiv_index_to_char_classes{$_}[0]
769	811	100				603	: cc_union(@{$equiv_index_to_char_classes{$_}})
770							, $_
771							]}
772	804					1218	grep { !exists($plain_index_to_char_class{$_}) }
773							keys(%equiv_index_to_char_classes)
774							))
775							}
776							}
777	835					5399	return $nfa;
778							}
779
780							=item C
781
782							Precondition: C<0 <= $min && ( $max eq '' \|\| $min <= $max)>
783
784							Returns C<$out_nfa>, a C<$nfa> computed from C<$in_nfa>.
785
786							Let L be the language accepted by C<$in_nfa> and M the language accepted
787							by C<$out_nfa>. Then a word m belongs to M if and only if and ordered list
788							(l_1, ..., l_r) of words belonging to L exists such that:
789
790							$min <= r
791							and ($max eq '' or r <= $max)
792							and m is the concatenation of (l_1, ..., l_r)
793
794							Examples with C<$in_nfa> being a C<$nfa> accepting C<'^a$'>:
795
796							nfa_quant($in_nfa, 2, 4 ) accepts '^a{2,4}$'
797							nfa_quant($in_nfa, 0, '') accepts '^a{0,}$' (i.e. '^a*$')
798
799							C<$pref_has_prefix> and C<$next_has_prefix> are hints for dispatching C<$min>,
800							for example:
801
802							'a+' => 'a*a' (!$prev_has_suffix && $next_has_prefix)
803							'a+' => 'aa*' ( $prev_has_suffix && !$next_has_prefix)
804							'a{2,}' => 'aa*a' ( $prev_has_suffix && $next_has_prefix)
805
806							=cut
807
808							sub nfa_quant {
809	271			271	1	349	my ($nfa, $min, $max, $prev_has_suffix, $next_has_prefix) = @_;
810	271					227	my @quant_parts;
811							my $optional_part;
812
813							# dispatch min left and right: a+b => aab, ba+ => baa
814	8			8		91	use integer;
	8					13
	8					28
815	271	100	66			516	my ($min_left, $min_right)
		100	66
		100
816							=
817							# no suffix, no prefix
818							$min == 0 ? (0 , 0 )
819
820							# no suffix, maybe prefix
821							: !($next_has_prefix && _nfa_has_suffix($nfa)) ? ($min , 0 )
822
823							# suffix, no prefix
824							: !($prev_has_suffix && _nfa_has_prefix($nfa)) ? (0 , $min )
825
826							# suffix and prefix
827							: (($min+1)/2, $min/2)
828							;
829
830	271	100				444	if ($min_left > 0) {
831	13					27	push(@quant_parts, nfa_concat(nfa_clone(($nfa) x $min_left)));
832							}
833	271	100	100			672	if (length($max) == 0 \|\| $max > $min) {
834	270	100				579	if ($$nfa[0][0]) {
	495	100				855
835							# initial state already accepting
836	7					15	($optional_part) = nfa_clone($nfa);
837							}
838							elsif (
839	639					909	!grep { $$_[1] == 0 }
840	639					428	map { @{$$_[1]} }
841							@$nfa
842							) {
843							# initial state not accepting and unreachable
844	251					441	($optional_part) = nfa_clone($nfa);
845	251					438	$$optional_part[0][0] = 1;
846							}
847							else {
848							# initial state not accepting and reachable
849	24					70	$optional_part = [
850							# additional root initial state accepting state
851							[
852							1 # accepting
853	12					27	, [ map {[$$_[0] , $$_[1]+1]} @{$$nfa[0][1]} ] # transitions
	32					70
854							]
855							# original states with offset 1
856	12					25	, map { [
857							$$_[0] # accepting
858	32					86	, [ map {[ $$_[0], $$_[1]+1 ]} @{$$_[1]} ] # transitions
	32					66
859							] }
860							@$nfa
861							];
862							}
863							}
864	271	100				452	if (length($max) == 0) {
		100
865
866							# starify optional part
867
868	302					763	my %root_index_to_char_class
869	256					396	= map { ($$_[1] => $$_[0]) }
870	256					219	@{$$optional_part[0][1]}
871							;
872
873	256					364	my $state_ind_to_equiv = {};
874							# loop over accepting state indexes
875	256					409	for (grep { $$optional_part[$_][0] } (1..$#$optional_part)) {
	366					586
876	256	100				768	if (
877							_transitions_is_subset(
878							$$optional_part[$_][1]
879							, $$optional_part[0][1]
880							, { $_ => 0 }
881							)
882							) {
883							# Accepting states whose transitions are
884							# a subset of the transitions of the initial state
885							# are equivalent to the initial state.
886	251					668	$$state_ind_to_equiv{$_} = 0;
887							}
888							else {
889	5	100				7	if (
890	5					15	grep { exists($root_index_to_char_class{$_}) }
	5					31
891	5					13	map { $$_[1] }
892							@{$$optional_part[$_][1]}
893							) {
894							# merge char classes to the same state index
895	2					6	my %new_index_to_char_classes
896	2					4	= map { ($$_[1] => [$$_[0]]) }
897	2					2	@{$$optional_part[$_][1]}
898							;
899	2					4	for (keys(%root_index_to_char_class)) {
900	4					8	push (
901	4					5	@{$new_index_to_char_classes{$_}}
902							, $root_index_to_char_class{$_}
903							);
904							}
905	2					9	@{$$optional_part[$_][1]}
	4					10
906	2					4	= map {[
907	2					3	@{$new_index_to_char_classes{$_}} == 1
908							? $new_index_to_char_classes{$_}[0]
909	4	100				5	: cc_union(@{$new_index_to_char_classes{$_}})
910							, $_
911							]}
912							keys(%new_index_to_char_classes)
913							;
914							}
915							else {
916	3					7	push(
917	5					15	@{$$optional_part[$_][1]}
918	3					7	, map { [@$_] } @{$$optional_part[0][1]}
	3					11
919							);
920							}
921							}
922							}
923	256	100				828	push(@quant_parts,
924							keys(%$state_ind_to_equiv)
925							? _nfa_shrink_equiv($optional_part, $state_ind_to_equiv)
926							: $optional_part
927							);
928							}
929							elsif ($max > $min) {
930
931							# concatenate optional_part $max - $min times
932
933	14					37	push(@quant_parts, _nfa_concat(1, nfa_clone(
934							($optional_part) x ($max - $min)
935							)));
936							}
937	271	100				529	if ($min_right > 0) {
938	13					37	push(@quant_parts, nfa_concat(nfa_clone(($nfa) x $min_right)));
939							}
940	271	100				1034	return @quant_parts == 1 ? $quant_parts[0] : nfa_concat(@quant_parts);
941							}
942
943							=item C
944
945							Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>.
946
947							Let r be the number of given C<@in_nfas>,
948							L_i the language accepted by C<$in_nfas[$i]> and M the language accepted
949							by C<$out_nfa>. Then a word m belongs to M if and only if an ordered list
950							(l_1, ..., l_r) of words exists, l_i belonging to L_i, such that
951							m is the concatenation of (l_1, ..., l_r).
952
953							=cut
954
955							sub nfa_concat {
956	302			302	1	534	_nfa_concat(0, @_);
957							}
958
959							sub _nfa_concat {
960	316			316		386	my $starifying = shift(@_);
961	316	50				552	if (!@_) {
962	0					0	return [[1, []]]; # neutral element: accepting empty string
963							}
964	316					323	my $concat = shift(@_); # result, to be extended
965	316					452	my @accepting_state_inds = grep { $$concat[$_][0] } (0..$#$concat);
	785					1051
966	316					358	my $state_ind_to_equiv = {};
967							my (
968	316					307	$state
969							, $init_state_ind
970							, $init_reachable
971							, $init_equiv_reachable
972							, $init_skipped
973							, @new_accepting_state_inds
974							);
975							# extend @$concat
976	316					404	for my $nfa (@_) {
977	401					347	$init_state_ind = @$concat;
978	401					333	$init_reachable = 0;
979	401					296	$init_equiv_reachable = 0;
980	401					311	$init_skipped = 0;
981							@new_accepting_state_inds
982	466					733	= map { $_ + $init_state_ind }
	1115					1213
983	401					534	grep { $$nfa[$_][0] }
984							(0..$#$nfa)
985							;
986
987							# renumber states, count states with transition to the initial state
988	401					450	for (map { @{$$_[1]} } @$nfa) {
	1115					799
	1115					1462
989	1063	100				2100	($$_[1] += $init_state_ind) == $init_state_ind
990							&& $init_reachable++;
991							}
992							# join old accepting states with new initial state
993	401					540	for my $acc_ind (@accepting_state_inds) {
994	510					511	$state = $$concat[$acc_ind]; # old accepting state
995	510					534	$$state[0] = $$nfa[0][0]; # overwrite accepting
996	510	100	100			452	if (
		100	66
			100
			100
997	510					1134	@{$$state[1]} == 0 # no transition
	274					1028
998							\|\| @{$$state[1]} == 1 # one transition
999							&& _transitions_is_subset(
1000							$$state[1] # transition of the old accepting state
1001							, $$nfa[0][1] # transitions of the new initial state
1002							, { $acc_ind => $init_state_ind }
1003							)
1004	42					165	) {
1005
1006							# Old accepting states whose transitions are
1007							# a subset of the transitions of the new initial state
1008							# are equivalent to the initial state.
1009							#
1010							# Note that such an old accepting states can have either
1011							# no transition or one self-transition;
1012							# the case that the old accepting state has no transition
1013							# occurs very often.
1014							#
1015							# %$state_ind_to_equiv gets extended by
1016							#
1017							# $acc_ind (old accepting state) => $init_state_ind
1018							#
1019							# But the keys and the values of %$state_ind_to_equiv
1020							# MUST remain disjoint (except for pairs key = val).
1021							#
1022							# Since $init_state_index are growing
1023							# and $acc_ind < $init_state_index:
1024							# - the new value does not belong the the keys
1025							# - the new key may belong to the vals,
1026							# such values must be updated.
1027							#
1028							# Example:
1029							# 0 => 1 ( %$state_ind_to_equiv )
1030							# 1 => 2 ( $acc_ind => $init_state_index )
1031							# %$state_ind_to_equiv must be updated to
1032							# 0 => 2
1033							# before being extended by
1034							# 1 => 2
1035	244					473	for (grep { $_ == $acc_ind } values(%$state_ind_to_equiv)) {
	151					222
1036	3					7	$_ = $init_state_ind;
1037							}
1038	244					414	$$state_ind_to_equiv{$acc_ind} = $init_state_ind;
1039	244					436	$init_equiv_reachable++;
1040							}
1041							elsif (
1042							$init_reachable == 1
1043	34					50	&& (grep { $$_[1] == $init_state_ind } @{$$nfa[0][1]})
	29					46
1044							&& cc_is_subset(
1045
1046							# char_class of the self-transition
1047							# of the new initial state
1048							(
1049	33					48	map { $$_[0] }
1050	29					47	grep { $$_[1] == $init_state_ind }
1051	22					36	@{$$nfa[0][1]}
1052							)
1053
1054							# char_class of the self-transition
1055							# of the old accepting state
1056							, (
1057	42					63	map { $$_[0] }
1058	29					35	grep { $$_[1] == $acc_ind }
1059							@{$$state[1]}
1060							)
1061							)
1062							) {
1063							# If the self-transitions of the new init state are
1064							# a subset of the transitions of the old accepting state,
1065							# the new state is not needed for looping;
1066							# the transition to the new init state can be skipped.
1067							#
1068							# Example 1:
1069							# [ab]a
1070							# the state for a* is superfluous.
1071							# Example 2:
1072							# ( x[ab]* \| y[ac]* \| z[bc]* ) a* c
1073							# the state for a* is only needed after [bc]*
1074							# the regular expression is equivalent to:
1075							# x[ab]c \| y[ac]c \| z[bc]ac
1076							#
1077							# Note that this one-letter-star optimization is
1078							# probably not very useful for practical purposes;
1079							# more general equivalences like (abc)(abc) ~ (abc)*
1080							# are not caught up, while the focused use cases
1081							# of prefix and suffix recognition require no star at all.
1082							#
1083							# It is merely a toy optimization for solving some exercises
1084							# of an introductory course on regexs.
1085							#
1086	14					19	push(@{$$state[1]},
	2					4
1087	16					26	map { [ @$_ ] }
1088	14					26	grep { $$_[1] != $init_state_ind}
1089	14					15	@{$$nfa[0][1]})
1090							;
1091	14					27	$init_skipped++;
1092							}
1093							else {
1094	252					314	push(@{$$state[1]},
	279					765
1095	252					345	map { [ @$_ ] }
1096	252					193	@{$$nfa[0][1]})
1097							;
1098							}
1099							}
1100	401	100	100			1877	if (
			100
1101							!$init_reachable && !$init_equiv_reachable
1102							\|\| $init_skipped == @accepting_state_inds
1103							) {
1104							# for being removed by _nfa_shrink_equiv()
1105	146					256	$$state_ind_to_equiv{$init_state_ind} = $init_state_ind;
1106							}
1107
1108	401	100				748	if (!$$nfa[0][0]) {
		100
1109	205					254	@accepting_state_inds = ();
1110							}
1111							elsif ($starifying) {
1112							# $starifying set for optimizing x{n,m}.
1113							# The old accepting states are redundant,
1114							# since reachable iff the newer ones are.
1115	13					22	for (@accepting_state_inds[1..$#accepting_state_inds]) {
1116	15					21	$$concat[$_][0] = 0;
1117							}
1118	13	50				26	if (!$init_reachable) {
1119	13					12	$$nfa[0][0] = 0;
1120	13					13	shift(@new_accepting_state_inds);
1121							}
1122	13					15	@accepting_state_inds = (0);
1123							}
1124							else {
1125							@accepting_state_inds
1126	183					198	= grep { !exists($$state_ind_to_equiv{$_}) }
	200					428
1127							@accepting_state_inds
1128							;
1129							}
1130
1131	401					612	push(@$concat, @$nfa);
1132	401					527	push(@accepting_state_inds, @new_accepting_state_inds);
1133							}
1134	316	100				537	if (keys(%$state_ind_to_equiv)) {
1135	257					358	return _nfa_shrink_equiv($concat, $state_ind_to_equiv);
1136							}
1137							else {
1138	59					153	return $concat;
1139							}
1140							}
1141
1142							=item C
1143
1144							Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>.
1145
1146							C<$out_nfa> accepts a word w if and only if at least one of C<@in_nfas>
1147							accepts w.
1148
1149							=cut
1150
1151							# Adds the total number of states
1152							sub nfa_union {
1153	130			130	1	259	my $union = [[0, []]]; # root, neutral element: accepting nothing
1154	130					166	my $state_ind_to_equiv = {};
1155	130					119	my $first_trivial_accepting_state_ind;
1156							my (
1157	130					143	$nfa
1158							, $init_state_ind
1159							, $init_reachable
1160							, $orig_state
1161							);
1162
1163	130					188	for $nfa (@_) {
1164
1165							# merge initial $accepting
1166	300		100			868	$$union[0][0] \|\|= $$nfa[0][0];
1167	300	100	100			623	if (@$nfa == 1 && @{$$nfa[0][1]} == 0) {
	68					194
1168	57					86	next;
1169							# Must be skipped because such a trivial state
1170							# would be removed below (!$init_reachable)
1171							# although it may be the $first_trivial_accepting state.
1172							#
1173							# On the other side, a well defined $nfa
1174							# with a single state and with a non-empty transition list
1175							# must loop to itself, thus $init_reachable.
1176							}
1177
1178	243					216	$init_state_ind = @$union;
1179	243					210	$init_reachable = 0;
1180	243					397	for (0..$#$nfa) {
1181	1261					1174	$orig_state = $$nfa[$_];
1182	1261	100	100			2181	if (
1183	266					623	$$orig_state[0] # accepting
1184							&& !@{$$orig_state[1]} # trivial
1185							) {
1186	164	100				225	if (defined($first_trivial_accepting_state_ind)) {
1187	87					205	$$state_ind_to_equiv{$_ + $init_state_ind}
1188							= $first_trivial_accepting_state_ind;
1189							}
1190							else {
1191	77					134	$first_trivial_accepting_state_ind
1192							= $_ + $init_state_ind;
1193							}
1194							}
1195							else {
1196	1097					701	for ( @{$$orig_state[1]} ) { # transition list
	1097					1245
1197	1311	100	100			2806	($$_[1] += $init_state_ind) == $init_state_ind
1198							&& ($init_reachable \|\|= 1);
1199							}
1200							}
1201							};
1202	243					529	push(@$union, @$nfa);
1203
1204							# merge initial $transitions
1205	243					187	push(@{$$union[0][1]}, map { [ @$_ ] } @{$$nfa[0][1]});
	243					318
	274					496
	243					324
1206	243	100				426	if (!$init_reachable) {
1207							# for being removed by _nfa_shrink_equiv()
1208	214					449	$$state_ind_to_equiv{$init_state_ind} = $init_state_ind;
1209							}
1210							};
1211	130	100				298	if (keys(%$state_ind_to_equiv)) {
1212	120					185	return _nfa_shrink_equiv($union, $state_ind_to_equiv);
1213							}
1214							else {
1215	10					30	return $union;
1216							}
1217							}
1218
1219							{
1220
1221							my %cached_cc_inter2;
1222
1223							=item C
1224
1225							Returns C<$out_nfa>, a $C<$nfa> computed from C<@in_nfas>.
1226
1227							C<$out_nfa> accepts a word w if and only if each of C<@in_nfas> accepts w.
1228
1229							=cut
1230
1231							sub nfa_inter {
1232	30			30	1	635	my ($inter, @nfas) = sort { @$a <=> @$b } @_;
	28					93
1233	30					53	for (@nfas) { $inter = nfa_inter2($inter, $_); }
	25					65
1234							return
1235	30		50			159	$inter
1236							\|\| [[1, [[$cc_any, 0]]]] # neutral element: accepting anything
1237							;
1238							}
1239
1240							# Multiplies the total number of states
1241							sub nfa_inter2 {
1242	25			25	0	32	my ($nfa_0, $nfa_1) = @_;
1243
1244							# computed states
1245	25					46	my @todo = (0);
1246	25					31	my %todo_seen; # set of state_inds
1247							my %done; # key-subset of %todo_seen (values are states)
1248							# After the following while, %done are %todo_seen the same set.
1249
1250							# dead end detection
1251	0					0	my %path_tr;
1252	0					0	my @cur_livings;
1253	0					0	my %livings;
1254
1255							# tmp variables
1256							my (
1257	0					0	$from_state_ind, $to_state_ind
1258							, $nfa_0_accepting, $nfa_0_transitions
1259							, $nfa_1_accepting, $nfa_1_transitions
1260							, $t_0, $t_1
1261							, $char_class
1262							, $accepting
1263							, @keys_path_to_state_ind
1264							);
1265
1266	25					40	my $nfa_1_len = @$nfa_1;
1267
1268	25					65	while (@todo) {
1269	1428					1538	$todo_seen{$from_state_ind} = $from_state_ind = pop(@todo);
1270
1271	1428					1783	($nfa_0_accepting, $nfa_0_transitions)
1272	1428					1062	= @{$$nfa_0[$from_state_ind / $nfa_1_len]}; # i-th state
1273	1428					1529	($nfa_1_accepting, $nfa_1_transitions)
1274	1428					991	= @{$$nfa_1[$from_state_ind % $nfa_1_len]}; # j-th state
1275
1276	1428					1312	my $new_transitions = [];
1277	1428					1342	for $t_0 (@$nfa_0_transitions) {
1278	2953					2418	for $t_1 (@$nfa_1_transitions) {
1279
1280	5804	100	66			16775	if (
1281							(
1282							$char_class
1283							= $cached_cc_inter2{$$t_0[0]}{$$t_1[0]}
1284							\|\|= &cc_inter2($$t_0[0], $$t_1[0])
1285							) != $cc_none
1286							) {
1287	2936					4322	push (@$new_transitions, [
1288							$char_class
1289							, $to_state_ind = $$t_0[1] * $nfa_1_len + $$t_1[1]
1290							]);
1291	2936	100				4283	if (!exists($todo_seen{$to_state_ind})) {
1292	1403					2248	push(@todo,
1293							$todo_seen{$to_state_ind} = $to_state_ind);
1294							}
1295	2936					5250	$path_tr{$to_state_ind}{$from_state_ind} = undef;
1296							}
1297							}
1298							}
1299	1428	100	100			3021	if ($accepting = $nfa_0_accepting && $nfa_1_accepting) {
1300	26					41	push(@cur_livings, $from_state_ind);
1301							}
1302	1428					3216	$done{$from_state_ind} = [
1303							$accepting
1304							, $new_transitions
1305							];
1306							}
1307
1308							# remove dead ends
1309	25					54	%livings = map { ($_ => $_) } @cur_livings;
	26					90
1310	25					65	while (@cur_livings) {
1311	945					2032	push(@cur_livings,
1312	2072					2360	map { $livings{$_} = $_ }
1313	971					1540	grep { !exists($livings{$_}) }
1314	971					677	keys(%{$path_tr{pop(@cur_livings)}})
1315							);
1316							}
1317
1318	25	50				84	if (keys(%livings) == 0) {
1319	0					0	return [[0, []]];
1320							}
1321
1322							# compact renumbering
1323	25					36	my @sorted_keys;
1324							my $inter = [@done{
1325	25					219	@sorted_keys = sort { $a <=> $b } keys(%livings)
	4434					3443
1326							}];
1327	25					66	my $i = 0;
1328	25					48	my %compact_map = map { ($_ => $i++) } @sorted_keys;
	971					1056
1329
1330	25					99	for (
1331	971					1597	map {
1332	2380					3193	@{$$_[1]}
1333	971					957	= grep { exists($compact_map{$$_[1]}) }
1334	971					602	@{$$_[1]}
1335							}
1336							@$inter
1337							) {
1338	2072					1913	$$_[1] = $compact_map{$$_[1]};
1339							}
1340	25					1175	return $inter;
1341							}
1342							}
1343
1344							sub nfa_resolve_anchors {
1345	10			10	0	14	my ($nfa) = @_;
1346
1347							# find state_inds reachable from the root by begin-anchor transitions
1348	10					21	my %begs = (0 => undef);
1349	10					18	my @todo = (0);
1350	10					33	while (defined(my $beg = pop(@todo))) {
1351	13					15	for (
1352	3					6	map { $$_[1] } # state_ind
	23					66
1353	13					83	grep { $$_[0][0][1] == -1 } # begin-anchor
1354							@{$$nfa[$beg][1]}
1355							) {
1356	3	50				7	if (!exists($begs{$_})) {
1357	3					3	$begs{$_} = undef;
1358	3					9	push(@todo, $_);
1359							}
1360							}
1361							}
1362
1363							# find state_inds leading to an accepting state by end-anchor transitions
1364	10					12	my @cur_livings;
1365							my %path_tr;
1366	10					28	for my $from_state_ind (0..$#$nfa) {
1367	38					24	for (@{$$nfa[$from_state_ind][1]}) {
	38					61
1368	46					84	$path_tr{$$_[1]}{$from_state_ind} = $$_[0];
1369							}
1370	38	100				82	if ($$nfa[$from_state_ind][0]) {
1371	10					20	push(@cur_livings, $from_state_ind);
1372							}
1373							}
1374	10					14	my %livings = map {($_ => undef)} @cur_livings;
	10					23
1375	10					28	while (defined(my $end = pop(@cur_livings))) {
1376	10					11	for (
1377	16					55	grep {
1378	10					24	$path_tr{$end}{$_}[0][0] == -3; # end-anchor
1379							}
1380							keys(%{$path_tr{$end}})
1381							) {
1382	0	0				0	if (!exists($livings{$_})) {
1383	0					0	push(@cur_livings, $livings{$_} = undef);
1384	0					0	$$nfa[$_][0] = 1;
1385							}
1386							}
1387							}
1388
1389	10					11	my $accept_empty;
1390	10	100				16	if (!($accept_empty = scalar(grep {$$nfa[$_][0]} keys(%begs)) ? 1 : 0)) {
	13	100				47
1391							# special case for $^ for and the like: empty string matches
1392	9					10	my %begends;
1393	9					18	my @todo = keys(%begs);
1394	9					22	while (defined(my $begend = pop(@todo))) {
1395	23					21	for (
1396	17					28	map { $$_[1] } # state_ind
	33					64
1397	23					34	grep { $$_[0][0][1] < 0 } # anchor
1398							@{$$nfa[$begend][1]}
1399							) {
1400	17	50	66			57	if (!exists($begs{$_}) && !exists($begends{$_})) {
1401	14	100				31	if ($$nfa[$_][0]) {
1402	3					8	$accept_empty = 1;
1403	3					6	@todo = ();
1404	3					10	last;
1405							}
1406	11					12	$begends{$_} = undef;
1407	11					33	push(@todo, $_);
1408							}
1409							}
1410							}
1411							}
1412
1413							# remove anchors
1414	10					24	for my $from_state_ind (
	46					71
1415							grep {
1416	38					50	grep { $$_[0][0][0] < 0 } # anchor
1417	38					26	@{$$nfa[$_][1]} # transitions
1418							}
1419							(0..$#$nfa)
1420							) {
1421	20					24	my $state = $$nfa[$from_state_ind];
1422							$$state[1] = [
1423							map {
1424	31	100				52	if ($$_[0][0][0] >= 0) {
	22	50				35
	20					23
1425	9					30	$_;
1426							}
1427							elsif ( @{$$_[0]} == 1 ) {
1428	22					32	delete($path_tr{$$_[1]}{$from_state_ind});
1429	22					45	();
1430							}
1431							else {
1432	0					0	$path_tr{$$_[1]}{$from_state_ind}
1433							= $$_[0]
1434	0					0	= interval_list_to_cc(@{$$_[0]}[1..$#{$$_[0]}]);
	0					0
1435	0					0	$_;
1436							}
1437							}
1438	20					17	@{$$state[1]} # transitions
1439							];
1440							}
1441
1442							# ensure that the initial state cannot be reached
1443	10	100				12	if (@{$$nfa[0][1]}) {
	10					26
1444							# proper init transitions (clone of the initial state needed)
1445
1446							# replace transitions to the initial state
1447							# with transitions to the cloned initial state
1448	8					10	my $new_state_ind = @$nfa;
1449	8					8	my $clone_reachable;
1450	8					22	for my $transition (
	22					39
1451	33					40	grep { $$_[1] == 0 } # to initial state
1452	33					26	map { @{$$_[1]} } # transitions
1453							@$nfa
1454							) {
1455	8					13	$$transition[1] = $new_state_ind;
1456	8					11	$clone_reachable = 1;
1457							}
1458
1459	8	50				19	if ($clone_reachable) {
1460	8					19	my $new_state = [
1461							$$nfa[0][0]
1462	8					13	, [@{$$nfa[0][1]}]
1463							];
1464	8					11	push(@$nfa, $new_state);
1465	8					13	$path_tr{$new_state_ind} = $path_tr{0};
1466	8					10	for (@{$$nfa[0][1]}) {
	8					16
1467	10					19	$path_tr{$$_[1]}{$new_state_ind} = $$_[0];
1468							}
1469	8	50				23	if ($$nfa[0][0]) {
1470	0					0	$livings{$new_state_ind} = undef;
1471							}
1472							}
1473							}
1474							else {
1475							# no proper init transitions
1476
1477							# drop transitions to the initial state
1478	2					4	for my $state (@$nfa) {
1479	5					5	@{$$state[1]} = grep { $$_[1] != 0 } @{$$state[1]};
	5					8
	2					5
	5					6
1480							}
1481							}
1482	10					15	delete($path_tr{0});
1483
1484							# extend initial state (merge all initial states of %begs)
1485	10	100				25	if (keys(%begs) > 1) {
1486	2					3	my %state_ind_to_char_classes;
1487	2					2	for ( map { @{$$nfa[$_][1]} } keys(%begs) ) {
	5					3
	5					12
1488	6					7	push(@{$state_ind_to_char_classes{$$_[1]}}, $$_[0]);
	6					12
1489							}
1490	2					7	@{$$nfa[0][1]}
	6					13
1491	2					5	= map { [
1492	6					7	$path_tr{$_}{0} = cc_union(@{$state_ind_to_char_classes{$_}})
1493							, int($_)
1494							] }
1495							keys(%state_ind_to_char_classes)
1496							;
1497							}
1498	10	100				25	if ($$nfa[0][0] = $accept_empty) {
1499	4					7	$livings{0} = undef;
1500							}
1501
1502							# remove unreachable states
1503	10					15	my @cur_reachables = (0);
1504	10					17	my %reachables = (0 => 0);
1505	10					31	while (@cur_reachables) {
1506	24					27	my $from_state_ind = shift(@cur_reachables);
1507	24					20	for (
1508	25					38	map { $$_[1] }
	24					36
1509							@{$$nfa[$from_state_ind][1]}
1510							) {
1511	25	100				58	if (!exists($reachables{$_})) {
1512	14					35	push(@cur_reachables, $reachables{$_} = $_);
1513							}
1514							}
1515							}
1516
1517							# remove dead ends
1518	10					17	delete(@livings{grep { !exists($reachables{$_}) } keys(%livings)});
	13					22
1519	10					17	@cur_livings = keys(%livings);
1520	10					23	while (@cur_livings) {
1521	11					11	for (
1522	10					16	grep { exists($reachables{$_}) }
	11					140
1523							keys(%{$path_tr{pop(@cur_livings)}})
1524							) {
1525	8	100				15	if (!exists($livings{$_})) {
1526	5					6	push(@cur_livings, $_);
1527	5					9	$livings{$_} = undef;
1528							}
1529							}
1530							}
1531
1532	10	100				34	if (keys(%livings) == 0) {
		50
1533	4					30	return [[0, []]];
1534							}
1535							elsif (keys(%livings) == @$nfa) {
1536	0					0	return $nfa;
1537							}
1538
1539							# compact renumbering
1540	6					15	my @sorted_keys = sort { $a <=> $b } keys(%livings);
	6					9
1541	6					8	my $i = 0;
1542	6					11	my %compact_map = map { ($_ => $i++) } @sorted_keys;
	11					20
1543
1544							return [
1545	11					15	map {
1546	6					16	@{$$_[1]}
	8					10
1547							= map {
1548	13					23	$$_[1] = $compact_map{$$_[1]};
1549	8					8	$_;
1550							}
1551	11					14	grep { exists($compact_map{$$_[1]}) }
1552	11					12	@{$$_[1]}
1553							;
1554	11					53	$_;
1555							}
1556							@$nfa[@sorted_keys]
1557							];
1558							}
1559
1560							=item C
1561
1562							Returns true if and only if C<$in_nfa> accepts C<$str>.
1563
1564							=cut
1565
1566							sub nfa_match {
1567	19			19	1	3360	my ($nfa, $str) = @_;
1568
1569	19					36	my %state_inds = (0 => 0);
1570	19					52	for my $c ( map { ord($_) } split('', $str) ) {
	119					114
1571	66					177	%state_inds
1572	105					128	= map { $$_[1] => $$_[1] }
1573	74					115	grep { cc_match($$_[0], $c) } # matching transition list
1574	119					143	map { @{$$_[1]} } # all transition list
	74					51
1575							@$nfa[values(%state_inds)] # current states
1576							;
1577							}
1578
1579	19					62	return grep { $$_[0] } @$nfa[values(%state_inds)];
	11					60
1580							}
1581
1582							sub nfa_dump {
1583	0			0	0	0	my ($nfa) = @_;
1584	0					0	my $dump = '';
1585	0					0	for my $i (0..$#$nfa) {
1586	0	0				0	$dump
1587							.= "$i:"
1588							. ($$nfa[$i][0] ? " (accepting)" : "")
1589							. "\n"
1590							;
1591	0					0	for my $transition (@{$$nfa[$i][1]}) {
	0					0
1592	0					0	$dump
1593							.= " "
1594							. cc_to_regex($$transition[0]) . " => $$transition[1]\n";
1595							}
1596							}
1597	0					0	return $dump;
1598							}
1599
1600							=item C
1601
1602							Returns true if and only if the labeled graphs represented by C<$nfa1>
1603							and C<$nfa2> are isomorph. While isomorph C<$nfa>s accept the same language,
1604							the converse is not true.
1605
1606							=cut
1607
1608							sub nfa_isomorph {
1609	99			99	1	1606	my ($nfa1, $nfa2) = @_;
1610
1611	99					220	my %nfa1_nfa2_indexes = (0 => 0);
1612	99					167	my %nfa2_nfa1_indexes = (0 => 0);
1613	99					162	my @nfa1_index_todo = (0);
1614
1615	99					287	while (defined(my $nfa1_index = pop(@nfa1_index_todo))) {
1616
1617	583					621	my $state1 = $$nfa1[$nfa1_index];
1618	583					577	my $state2 = $$nfa2[$nfa1_nfa2_indexes{$nfa1_index}];
1619
1620							# accepting
1621	583	50				969	if ($$state1[0] != $$state2[0]) {
1622	0					0	return 0;
1623							}
1624
1625							# transitions
1626	583					435	my $transitions1 = [sort { $$a[0] <=> $$b[0] } @{$$state1[1]}];
	2079					2283
	583					1026
1627	583					502	my $transitions2 = [sort { $$a[0] <=> $$b[0] } @{$$state2[1]}];
	2085					1961
	583					737
1628	583	50				912	if (@$transitions1 != @$transitions2) {
1629	0					0	return 0;
1630							}
1631	583					765	for my $i (0..$#$transitions1) {
1632	1662					1117	my ($cc1, $next_index1) = @{$$transitions1[$i]};
	1662					1900
1633	1662					1229	my ($cc2, $next_index2) = @{$$transitions2[$i]};
	1662					1557
1634	1662	50				2655	if ($cc1 ne $cc2) {
1635	0					0	return 0;
1636							}
1637	1662	100				2224	if (exists($nfa1_nfa2_indexes{$next_index1})) {
		50
1638	1178	50				2983	if ($nfa1_nfa2_indexes{$next_index1} != $next_index2) {
1639	0					0	return 0;
1640							}
1641							}
1642							elsif (exists($nfa2_nfa1_indexes{$next_index2})) {
1643							# $nfa2_nfa1_indexes{$next_index2} != $next_index1
1644							# because
1645							# - !exists($nfa1_nfa2_indexes{$next_index1})
1646							# - $nfa1_nfa2_indexes and $nfa2_nfa1_indexes
1647							# are reverse to each other by construction
1648	0					0	return 0;
1649							}
1650							else {
1651	484					565	$nfa1_nfa2_indexes{$next_index1} = $next_index2;
1652	484					455	$nfa2_nfa1_indexes{$next_index2} = $next_index1;
1653	484					862	push(@nfa1_index_todo, $next_index1);
1654							}
1655							}
1656							}
1657	99					573	return 1;
1658							}
1659
1660
1661							##############################################################################
1662							# $dfa
1663							##############################################################################
1664
1665							# input X:
1666							# Arbitrary list of intervals.
1667							# output Y:
1668							# List of pairwise disjoint intervals spanning the same subset such that
1669							# for any intersections/unions of intervals of X
1670							# an equal union of intervals of Y exists.
1671							# In short, all boundaries of X are preserved.
1672							#
1673							# Motivation:
1674							# nfas use character classes as alphabet (instead of single code points).
1675							# dfa operations needs a common refinement of sets of character classes.
1676							#
1677							# Example:
1678							# interval_cases( [ [0, 5], [2, 8] ] )
1679							# = [ [0, 1], [2, 5], [6, 8] ]
1680							#
1681							# X: \|0 1 2 3 4 5\|
1682							# \|2 3 4 5 6 7 8\|
1683							# Y: \|0 1\|2 3 4 5\|6 7 8\|
1684							#
1685							sub interval_cases {
1686	1525			1525	0	1342	my ($interval_list) = @_;
1687							my @sorted
1688	45323	50				61168	= sort {
1689	1525					2612	$$a[0] <=> $$b[0]
1690							\|\| $$b[1] <=> $$a[1]
1691							}
1692							@$interval_list
1693							;
1694	1525					1267	my %los;
1695							my %his;
1696	1525					1331	my $i = 0;
1697	1525					2497	while ($i < @sorted) {
1698	4757					5938	$los{$sorted[$i][0]} = undef;
1699	4757					4409	$his{$sorted[$i][1]} = undef;
1700	4757					4062	my $j = $i + 1;
1701	4757		100			17235	while (
			100
1702							$j < @sorted
1703							&& $sorted[$j][0] == $sorted[$i][0]
1704							&& $sorted[$j][1] == $sorted[$i][1]
1705							) {
1706							# $sorted[$i] ---------
1707							# $sorted[$j] ---------
1708	2418					8733	$j++;
1709							}
1710	4757		100			14267	while (
			66
1711							$j < @sorted
1712							&& $sorted[$j][0] == $sorted[$i][0]
1713							&& $sorted[$j][1] < $sorted[$i][1]
1714							) {
1715							# $sorted[$i] ---------
1716							# $sorted[$j] -----
1717	1712					1695	$his{$sorted[$j][1]} = undef;
1718	1712					1749	$los{$sorted[$j][1]+1} = undef;
1719	1712					6933	$j++;
1720							}
1721							# $sorted[$j][0] > $sorted[$i][0]
1722	4757		100			12853	while (
1723							$j < @sorted
1724							&& $sorted[$j][1] < $sorted[$i][1]
1725							) {
1726							# $sorted[$i] ---------
1727							# $sorted[$j] -----
1728	3625					3549	$his{$sorted[$j][0]-1} = undef;
1729	3625					3229	$los{$sorted[$j][0]} = undef;
1730	3625					3067	$his{$sorted[$j][1]} = undef;
1731	3625					3279	$los{$sorted[$j][1]+1} = undef;
1732	3625					10220	$j++;
1733							}
1734	4757	100	100			12280	if (
1735							$j < @sorted
1736							&& $sorted[$j][0] <= $sorted[$i][1]
1737							) {
1738							# $sorted[$j][0] > $sorted[$i][0]
1739							# && $sorted[$j][0] <= $sorted[$i][1]
1740							# && $sorted[$j][1] >= $sorted[$i][1]
1741							#
1742							# $sorted[$i] ---------
1743							# $sorted[$j] -----
1744	446					519	$his{$sorted[$j][0]-1} = undef;
1745	446	50				815	if ($sorted[$i][1] != $sorted[$j][1]) {
1746	0					0	$los{$sorted[$i][1]+1} = undef;
1747							}
1748							}
1749	4757					7248	$i = $j;
1750							}
1751	1525					3055	my @sorted_los = sort( { $a <=> $b } keys(%los));
	8876					8007
1752	1525					2597	my @sorted_his = sort( { $a <=> $b } keys(%his));
	8796					7602
1753	1525					2446	return [ map { [$sorted_los[$_], $sorted_his[$_]] } (0..$#sorted_los) ];
	5452					10348
1754							}
1755
1756							=item C
1757
1758							Compute a deterministic finite automaton from C<$in_nfa>
1759							(powerset construction).
1760
1761							The data structure of a deterministic finite automaton (dfa) is
1762							the same as that of a non-deterministic one, but it is further constrained:
1763							For each state and each unicode character there exist exactly one transition
1764							(i.e. a pair C<($char_class, $state_index)>) matching this character.
1765
1766							Note that the following constraint hold for both a C<$dfa> and a C<$nfa>:
1767							For each pair of state p1 and p2, there exists at most one transition
1768							from p1 to p2 (artefact of this implementation).
1769
1770							=cut
1771
1772							sub nfa_to_dfa {
1773	207			207	1	212	my ($nfa) = @_;
1774	207					283	my $dfa = [];
1775	207	50				419	if (!@$nfa) {
1776	0					0	return [[0, [$cc_any, 0]]];
1777							}
1778	207					217	my $trap_needed = 0;
1779	207					176	my $dfa_size = 0;
1780	207					473	my %dfa_indexes = ("0" => $dfa_size++);
1781	207					365	my @todo = ([0]);
1782	207					392	while (@todo) {
1783	1326					1406	my $nfa_indexes = pop(@todo);
1784	1326					2110	my $dfa_index = $dfa_indexes{join('.', @$nfa_indexes)};
1785	1326					2499	my @nfa_states = @$nfa[@$nfa_indexes];
1786
1787							# accepting
1788	1326	100				1329	$$dfa[$dfa_index][0] = scalar(grep { $$_[0] } @nfa_states) ? 1 : 0;
	2189					4226
1789
1790							# transitions
1791	4986					7097	my $cases = interval_cases([
1792	4986					3273	map { @{$$_[0]} }
	2189					3351
1793	1326					1302	map { @{$$_[1]} }
	2189					1468
1794							@nfa_states
1795							]);
1796	1326					1944	my %dfa_index_to_intervals;
1797	1326					1689	for my $interval (@$cases) {
1798							my @next_nfa_indexes
1799	4335					3146	= sort(keys(%{ {
	6234					15516
1800	28291					32787	map { ($$_[1] => undef) }
1801	7122					10371	grep { cc_match($$_[0], $$interval[0]) }
1802	4335					3968	map { @{$$_[1]} }
	7122					4474
1803							@nfa_states
1804							} }))
1805							;
1806	4335					8436	my $next_index_key = join('.', @next_nfa_indexes);
1807	4335	100				6497	if (!exists($dfa_indexes{$next_index_key})) {
1808	1119					1444	$dfa_indexes{$next_index_key} = $dfa_size++;
1809	1119					1218	push(@todo, \@next_nfa_indexes);
1810							}
1811	4335					3095	push(@{$dfa_index_to_intervals{$dfa_indexes{$next_index_key}}},
	4335					8549
1812							$interval
1813							);
1814							}
1815
1816	1326					1230	my @any_ccs;
1817	3831					4772	$$dfa[$dfa_index][1] = [
1818							map {
1819	1326					2375	my $cc = interval_list_to_cc($dfa_index_to_intervals{$_});
1820	3831					3544	push(@any_ccs, $cc);
1821	3831					6003	[$cc, $_ ];
1822							}
1823							sort(keys(%dfa_index_to_intervals))
1824							];
1825	1326	100				2130	if ((my $all_cc = cc_union(@any_ccs)) != $cc_any) {
1826	1177					981	$trap_needed = 1;
1827	1177					879	push(@{$$dfa[$dfa_index][1]},
	1177					2056
1828							[ cc_neg($all_cc), -1 ]
1829							);
1830							}
1831							}
1832
1833	207	100				389	if ($trap_needed) {
1834	195					280	for (
1835	4889					5534	grep { $$_[1] == -1 }
	1284					1910
1836	1284					800	map { @{$$_[1]} }
1837							@$dfa
1838							) {
1839	1177					1075	$$_[1] = $dfa_size;
1840							}
1841	195					540	$$dfa[$dfa_size] = [0, [[$cc_any, $dfa_size]]];
1842							}
1843
1844	207					775	return $dfa;
1845							}
1846
1847
1848							=item C
1849
1850
1851							Computes a minimal deterministic C<$dfa> from the given C<$in_dfa>
1852							(Hopcroft's algorithm).
1853
1854							Note that the given C<$in_dfa> must be a C<$dfa>, as
1855							returned from C, and not a mere C<$nfa>.
1856
1857							Myhill-Nerode theorem: two minimal dfa accepting
1858							the same language are isomorph (i.e. C returns true).
1859
1860							=cut
1861
1862							sub dfa_to_min_dfa {
1863	207			207	1	337	my ($dfa) = @_;
1864	207					225	my @acceptings;
1865							my @non_acceptings;
1866	0					0	my @intervals;
1867	207					423	for my $index (0..$#$dfa) {
1868	1521	100				1905	if ($$dfa[$index][0]) {
1869	275					311	push(@acceptings, $index);
1870							}
1871							else {
1872	1246					1023	push(@non_acceptings, $index);
1873							}
1874	1521					1007	push(@intervals, map { @{$$_[0]} } @{$$dfa[$index][1]})
	5203					3406
	5203					6818
	1521					1657
1875							}
1876	207					238	my $partition;
1877	207	100				326	if (@non_acceptings) {
1878	199					416	$partition = [\@non_acceptings, \@acceptings];
1879	199					764	my %todo = (join('.', @non_acceptings) => \@non_acceptings);
1880	199					337	my $cases = interval_cases(\@intervals);
1881	199					611	while (my ($todo_key) = keys(%todo)) {
1882	994					926	my %indexes = map { ($_ => undef) } @{delete($todo{$todo_key})};
	2325					3626
	994					1736
1883	994					1489	for my $interval (@$cases) {
1884	21234					27236	my %prev_inds = (
1885	269076					175312	map { ($_ => undef) }
1886							grep {
1887	9909					18146	my $i = $_;
1888	914283	100				1504678	grep {
1889	269076					274494	exists($indexes{$$_[1]})
1890							&& cc_match($$_[0], $$interval[0])
1891							}
1892	269076					170903	@{$$dfa[$i][1]}
1893							}
1894							(0..$#$dfa)
1895							);
1896	9909					13289	my $refined_partition;
1897	9909					9813	for my $partition_indexes (@$partition) {
1898	127676					77475	my (@inter, @diff);
1899	127676					105996	for (@$partition_indexes) {
1900	269076	100				269220	if (exists($prev_inds{$_})) {
1901	21234					23006	push(@inter, $_);
1902							}
1903							else {
1904	247842					258570	push(@diff, $_);
1905							}
1906							}
1907	127676	100	100			217504	if (!@inter \|\| !@diff) {
1908	126881					158392	push(@$refined_partition, $partition_indexes);
1909							}
1910							else {
1911	795					1306	push(@$refined_partition, \@inter, \@diff);
1912	795					3328	my $prev_inds_key = join('.', sort(keys(%prev_inds)));
1913	795	50				2078	if ($todo{$prev_inds_key}) {
		100
1914	0					0	delete($todo{$prev_inds_key});
1915	0					0	$todo{join('.', @diff)} = \@diff;
1916	0					0	$todo{join('.', @inter)} = \@inter;
1917							}
1918							elsif (@diff < @inter) {
1919	174					479	$todo{join('.', @diff)} = \@diff;
1920							}
1921							else {
1922	621					1541	$todo{join('.', @inter)} = \@inter;
1923							}
1924							}
1925							}
1926	9909					21149	$partition = $refined_partition;
1927							}
1928							}
1929							}
1930							else {
1931	8					15	$partition = [\@acceptings];
1932							}
1933	207					237	my $state_ind_to_equiv;
1934	207					302	for (grep { @$_ != 1 } @$partition) {
	1201					1449
1935	150					615	@$state_ind_to_equiv{@$_[1..$#$_]} = ($$_[0]) x $#$_;
1936							}
1937	207					487	return _nfa_shrink_equiv($dfa, $state_ind_to_equiv);
1938							}
1939
1940
1941							##############################################################################
1942							# $tree
1943							##############################################################################
1944
1945							=back
1946
1947							=head2 Tree
1948
1949							$tree = [ $star, [ $alt_0, $alt_1, ... ] ]
1950							or $char_class # ref($char_class) eq CHAR_CLASS
1951							or undef # accepting nothing
1952							$alt = [ $tree_0, $tree_1, ... ]
1953
1954							A C<$tree> is a hierarchical data structure used as intermediate form for
1955							regular expression generation routines.
1956
1957							Similar to a parse tree, except that the C<$tree>s described here are not the
1958							direct result of the parsing routines C; indeed, the parsing
1959							routines generate a C<$nfa>, which then can be converted to a C<$tree>.
1960
1961							A string is spanned by C<$tree = [$star, [ $alt_0, $alt_1, ... ] ]> if it is
1962							spanned by one of the C<$alt_i> (if C<$star> is false) of a repetition thereof
1963							(if C<$star> is true).
1964
1965							A string is spanned by C<$alt = [ $tree_0, $tree_1, ...]> if it is the
1966							concatenation of C<@substrings>, each C<$substrings[$i]> being spanned by
1967							C<$$alt[$i]>.
1968
1969							=over 4
1970
1971							=item C
1972
1973							Converts a C<$nfa> to a C<$tree>.
1974							Returns C if the C<$nfa> accepts nothing (not even the empty string).
1975
1976							=cut
1977
1978							sub nfa_to_tree {
1979	131			131	1	183	my ($nfa) = @_;
1980
1981							# Warshall algorithm (Kleene's theorem)
1982							# with preliminary computations:
1983							# - words-paths (unbranched paths) are shrunken
1984							# - unique accepting state is ensured
1985							# - branches (with single parent) are skipped
1986
1987	131					184	my $path = {};
1988	131					159	my $path_tr = {};
1989	131					133	my %accepting_state_inds;
1990
1991							# Initialization of the paths
1992
1993	131					272	for my $i (0..$#$nfa) {
1994	923	100				1440	if ($$nfa[$i][0]) {
1995	153					273	$accepting_state_inds{$i} = $i;
1996							}
1997	923					633	for (@{$$nfa[$i][1]}) {
	923					1175
1998	1362					3841	$$path{$i}{$$_[1]}
1999							= $$path_tr{$$_[1]}{$i}
2000							= $$_[0];
2001							}
2002							}
2003
2004	131					144	if (TRACE_NFA_TO_TREE) {
2005							print STDERR "before word shrink\n";
2006							for my $i (sort {$a <=> $b} (keys(%$path))) {
2007							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2008							print STDERR "$i $j: " . cc_to_regex($$path{$i}{$j}) . "\n";
2009							}}
2010							}
2011
2012	131					186	my @tree_list;
2013							my @state_ind_path;
2014
2015							# word-paths (unbranched paths) are shrunken
2016	131					220	for my $first (0..$#$nfa) {
2017	923	100				1427	if (!exists($$path{$first})) { next; }
	411					343
2018							my @todo
2019	452					508	= sort {
2020	452	50				328	keys(%{$$path_tr{$b}}) <=> keys(%{$$path_tr{$a}})
	452					1223
	956					1614
2021							\|\| $b <=> $a
2022							}
2023	512					920	grep { $_ != $first }
2024	512					402	keys(%{$$path{$first}})
2025							;
2026	512					522	my %todo_ctrl;
2027	512					389	my $todo_sorted = 1;
2028	512		100			1289	while (
			66
2029							@todo
2030							&& (
2031							!$todo_sorted
2032							\|\| keys(%{$$path_tr{$todo[-1]}}) == 1
2033							)
2034							) {
2035	461					596	$todo_ctrl{my $i = pop(@todo)} = undef;
2036	461	100				367	if (keys(%{$$path_tr{$i}}) != 1) {
	461					832
2037	165	100	33			839	if ($i != $first && !$todo_sorted && @todo) {
			66
2038							@todo
2039	87					104	= sort {
2040	87	50				71	keys(%{$$path_tr{$b}}) <=> keys(%{$$path_tr{$a}})
	87					219
	162					343
2041							\|\| $b <=> $a
2042							}
2043	69					77	keys(%{ { map { ($_ => undef) } (@todo, $i) } })
	69					83
2044							;
2045	69					138	$todo_sorted = 1;
2046							}
2047	165					449	next;
2048							}
2049	296					262	$todo_sorted = 0;
2050
2051	296					420	my @tree_list = ($$path{$first}{$i});
2052	296					309	my @state_ind_path = ($i);
2053
2054	296		66			214	while (
2055	595					1229	keys(%{$$path{$i}}) == 1
	406					1104
2056							&& (my $j = (keys(%{$$path{$i}}))[0]) != $first
2057							) {
2058	406					429	push(@tree_list, $$path{$i}{$j});
2059	406					380	push(@state_ind_path, $i = $j);
2060	406	100				243	if (keys(%{$$path_tr{$j}}) != 1) {
	406					823
2061	107					116	last;
2062							}
2063							}
2064
2065	296					293	if (TRACE_NFA_TO_TREE) {
2066							print STDERR "first, state_ind_path: $first, @state_ind_path\n";
2067							}
2068
2069	296	100				795	if (@state_ind_path > 1) {
2070
2071	137					116	if (TRACE_NFA_TO_TREE) {
2072							print STDERR "delete head $first -> $state_ind_path[0]\n";
2073							}
2074	137					229	delete($$path{$first}{$state_ind_path[0]});
2075	137					313	for (@state_ind_path[0..$#state_ind_path-1]) {
2076	406					432	delete($$path{$_});
2077	406					370	delete($$path_tr{$_});
2078	406					380	if (TRACE_NFA_TO_TREE) {
2079							print STDERR "delete path $_ -> *\n";
2080							print STDERR "delete path * <- $_\n";
2081							}
2082							}
2083	137					189	delete($$path_tr{$state_ind_path[-1]}{$state_ind_path[-2]});
2084	137	100				236	if (!exists($todo_ctrl{$state_ind_path[-1]})) {
2085	122					140	$todo_ctrl{$state_ind_path[-1]} = undef;
2086	122					131	push(@todo, $state_ind_path[-1]);
2087							}
2088	137					96	if (TRACE_NFA_TO_TREE) {
2089							print STDERR "delete tail $state_ind_path[-1] <- $state_ind_path[-2]\n";
2090							}
2091
2092
2093							# $first -> $last
2094	137					139	my $last = $state_ind_path[-1];
2095	137	100				399	$$path{$first}{$last}
2096							= $$path_tr{$last}{$first}
2097							= exists($$path{$first}{$last})
2098							? tree_alt(
2099							$$path{$first}{$last}
2100							, tree_concat(@tree_list)
2101							)
2102							: tree_concat(@tree_list)
2103							;
2104
2105	137					193	if (TRACE_NFA_TO_TREE) {
2106							print STDERR
2107							"$first -> $last created (first ->last): "
2108							. join('', map {_tree_to_regex($_)} @tree_list) . "\n";
2109							}
2110
2111	137					227	for (0..$#state_ind_path-1) {
2112
2113							# $first -> accepting
2114	406	100				1159	if ($accepting_state_inds{
2115							my $state_ind = $state_ind_path[$_]
2116							}) {
2117	30	50				85	$$path{$first}{$state_ind}
2118							= $$path_tr{$state_ind}{$first}
2119							= exists($$path{$first}{$state_ind})
2120							? tree_alt(
2121							$$path{$first}{$state_ind}
2122							, tree_concat(@tree_list[0..$_])
2123							)
2124							: tree_concat(@tree_list[0..$_])
2125							;
2126	30					133	if (TRACE_NFA_TO_TREE) {
2127							print STDERR
2128							"$first -> $state_ind created (first -> accepting): "
2129							. join('', map {_tree_to_regex($_)} @tree_list[0..$_]) . "\n";
2130							}
2131							}
2132							}
2133							}
2134							}
2135							}
2136
2137	131					140	if (TRACE_NFA_TO_TREE) {
2138							print STDERR "after word shrink\n";
2139							for my $i (sort {$a <=> $b} (keys(%$path))) {
2140							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2141							print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n";
2142							}}
2143							for my $j (sort {$a <=> $b} (keys(%$path_tr))) {
2144							for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) {
2145							print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n";
2146							}}
2147							}
2148
2149							# unique accepting state is ensured
2150							# (pseudo-unique: the initial state may additionally be accepting)
2151	131					194	my $unique_accepting_state_ind = @$nfa;
2152	131	100	100			319	if (
		100
2153							keys(%accepting_state_inds) == 1
2154							) {
2155	114					252	$unique_accepting_state_ind = (keys(%accepting_state_inds))[0];
2156							}
2157							elsif (
2158							keys(%accepting_state_inds) == 2
2159							&& exists($accepting_state_inds{0})
2160							) {
2161	6					8	$unique_accepting_state_ind
2162	3					7	= (grep {$_} keys(%accepting_state_inds))[0];
2163							}
2164							else {
2165	14					24	$unique_accepting_state_ind = @$nfa;
2166	14					32	for my $to_state_ind (keys(%accepting_state_inds)) {
2167	33					30	for my $from_state_ind (keys(%{$$path_tr{$to_state_ind}})) {
	33					51
2168	58					119	push(
2169	58					41	@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}}
2170							, $$path_tr{$to_state_ind}{$from_state_ind}
2171							);
2172							}
2173							}
2174	14					18	for my $from_state_ind (
	14					39
2175							keys(%{$$path_tr{$unique_accepting_state_ind}})
2176							) {
2177	50					92	$$path_tr{$unique_accepting_state_ind}{$from_state_ind}
2178							= $$path{$from_state_ind}{$unique_accepting_state_ind}
2179							= tree_alt(
2180	50					44	@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}}
2181							);
2182							}
2183							}
2184
2185	131					121	if (TRACE_NFA_TO_TREE) {
2186							print STDERR "after unique state addition\n";
2187							for my $i (sort {$a <=> $b} (keys(%$path))) {
2188							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2189							print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n";
2190							}}
2191							for my $j (sort {$a <=> $b} (keys(%$path_tr))) {
2192							for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) {
2193							print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n";
2194							}}
2195							}
2196
2197	131					223	for my $reversed (0, 1) {
2198	262	100				524	my ($tmp_path, $tmp_path_tr)
2199							= $reversed
2200							? ($path_tr, $path)
2201							: ($path, $path_tr)
2202							;
2203
2204							# branches (with single parent) are skipped
2205							my @branch_inds
2206	480					510	= $reversed
2207	781					811	? sort {$a <=> $b} (keys(%$tmp_path))
2208	262	100				837	: sort {$b <=> $a} (keys(%$tmp_path))
2209							;
2210	262					483	while (@branch_inds) {
2211	1150					996	my $branch = pop(@branch_inds);
2212	1150	100	100			4847	if (
			100
			100
2213	642					1616	!exists($$tmp_path{$branch})
2214							# root cannot be un-branched
2215							\|\| $branch == 0
2216							# accepting states cannot be un-branched
2217							\|\| $branch == $unique_accepting_state_ind
2218							# single parent (non-root have one or more parents)
2219							\|\| keys(%{$$tmp_path_tr{$branch}}) != 1
2220							) {
2221	881					1400	next;
2222							}
2223
2224	269					209	if (TRACE_NFA_TO_TREE) {
2225							print STDERR "branch at $branch\n";
2226							}
2227	269					220	my ($parent) = keys(%{$$tmp_path_tr{$branch}}); # single parent
	269					387
2228	269	100	66			793	if (
			66
2229							ref($$tmp_path{$parent}{$branch}) ne CHAR_CLASS
2230							&& (
2231							# starified parent
2232							$$tmp_path{$parent}{$branch}[0]
2233							# parent containing several paths
2234							\|\| @{$$tmp_path{$parent}{$branch}[1]} > 1
2235							)
2236							) {
2237	35					67	next;
2238							}
2239
2240	234					204	my (@children) = keys(%{$$tmp_path{$branch}});
	234					453
2241
2242	234					290	for my $child (@children) {
2243	533	100				1804	$$tmp_path{$parent}{$child}
		100
		100
2244							= $$tmp_path_tr{$child}{$parent}
2245							= exists($$tmp_path{$parent}{$child})
2246							? tree_alt(
2247							$$tmp_path{$parent}{$child}
2248							, tree_concat2(
2249							$reversed
2250							? (
2251							$$tmp_path{$branch}{$child}
2252							, $$tmp_path{$parent}{$branch}
2253							)
2254							: (
2255							$$tmp_path{$parent}{$branch}
2256							, $$tmp_path{$branch}{$child}
2257							)
2258							)
2259							)
2260							: tree_concat2(
2261							$reversed
2262							? (
2263							$$tmp_path{$branch}{$child}
2264							, $$tmp_path{$parent}{$branch}
2265							)
2266							: (
2267							$$tmp_path{$parent}{$branch}
2268							, $$tmp_path{$branch}{$child}
2269							)
2270							)
2271							;
2272	533					849	delete($$tmp_path_tr{$child}{$branch});
2273
2274	533					568	if (TRACE_NFA_TO_TREE) {
2275							print STDERR
2276							"parent -> branch: "
2277							. tree_dump($$tmp_path{$parent}{$branch}) . "\n";
2278							print STDERR
2279							"branch -> child : "
2280							. tree_dump($$tmp_path{$branch}{$child}) . "\n";
2281							print STDERR
2282							"$parent -> $child created (un-branch): "
2283							. tree_dump($$tmp_path{$parent}{$child})
2284							. ($reversed ? " (reversed)" : "" ) . "\n";
2285							print STDERR
2286							"delete $child <- $branch\n";
2287							}
2288
2289							}
2290	234					305	delete($$tmp_path{$parent}{$branch});
2291	234					454	delete($$tmp_path{$branch});
2292	234					327	delete($$tmp_path_tr{$branch});
2293
2294	234					193	if (TRACE_NFA_TO_TREE) {
2295							print STDERR "delete $parent -> $branch\n";
2296							print STDERR "delete $branch -> *\n";
2297							print STDERR "delete $branch <- *\n";
2298							}
2299
2300	234					578	push(@branch_inds, $parent);
2301							}
2302
2303	262					400	if (TRACE_NFA_TO_TREE) {
2304							print STDERR "after branch skip\n";
2305							for my $i (sort {$a <=> $b} (keys(%$tmp_path))) {
2306							for my $j (sort {$a <=> $b} (keys(%{$$tmp_path{$i}}))) {
2307							if ($reversed) {
2308							print STDERR "$j $i: " . tree_dump($$tmp_path{$i}{$j}) . "\n";
2309							}
2310							else {
2311							print STDERR "$i $j: " . tree_dump($$tmp_path{$i}{$j}) . "\n";
2312							}
2313							}}
2314							for my $j (sort {$a <=> $b} (keys(%$tmp_path_tr))) {
2315							for my $i (sort {$a <=> $b} (keys(%{$$tmp_path_tr{$j}}))) {
2316							print STDERR
2317							($reversed ? "$i <- $j: " : "$j <- $i:")
2318							. tree_dump($$tmp_path_tr{$j}{$i}) . "\n";
2319							}}
2320							}
2321
2322							}
2323
2324
2325							# starify diagonal
2326	131					335	for (grep { exists($$path{$_}{$_}) } keys(%$path)) {
	300					511
2327	128					288	$$path{$_}{$_}
2328							= $$path_tr{$_}{$_}
2329							= tree_starify($$path{$_}{$_});
2330							}
2331
2332	131					137	if (TRACE_NFA_TO_TREE) {
2333							print STDERR "after diagonal starification\n";
2334							for my $i (sort {$a <=> $b} (keys(%$path))) {
2335							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2336							print STDERR "$i $j: ";
2337							print STDERR tree_dump($$path{$i}{$j}) . "\n";
2338							}}
2339							}
2340
2341							# Warshall algorithm (Kleene's theorem)
2342	131					131	my %updates;
2343	300					269	my %weight = map {
2344	131					203	my $w = 0;
2345	300					212	for (values(%{$$path{$_}})) { $w += _tree_weight($_) }
	300					488
	474					621
2346	300					534	($_ => $w);
2347							} keys(%$path);
2348	131	50				364	my @ks = sort { $weight{$a} <=> $weight{$b} \|\| $a <=> $b } keys(%$path);
	302					607
2349							# note that keys(%$path_tr) are not additionally needed
2350							# case i == k && k == j: nothing to do
2351							# case i != k && k != j: $$path{$k}{$j} must exist
2352							# case i == k && k != j: $$path{$k}{$k} must exist
2353							# case i != k && k == j: $$path{$k}{$k} must exist
2354	131					202	for my $k (@ks) {
2355	300					265	for my $i (keys(%{$$path_tr{$k}})) { # i -> k
	300					545
2356	481					403	for my $j (keys(%{$$path{$k}})) { # k -> j
	481					793
2357	1336	100	100			3097	if ($i == $k && $k == $j) { next; }
	139					243
2358	1197					841	my @trees;
2359	1197	100	100			4402	if (
			66
2360							exists($$path{$i}{$j})
2361							&& ($i != $k && $k != $j)
2362							) {
2363	533					616	push(@trees, $$path{$i}{$j});
2364							}
2365	1197	100				3462	my $new_tree
		100
		100
2366							= exists($$path{$k}{$k})
2367							? tree_concat(
2368							(
2369							$i != $k
2370							? $$path{$i}{$k}
2371							: ()
2372							)
2373							, $$path{$k}{$k}
2374							, (
2375							$k != $j
2376							? $$path{$k}{$j}
2377							: ()
2378							)
2379							)
2380							: tree_concat2($$path{$i}{$k}, $$path{$k}{$j})
2381							;
2382	1197	100				1808	push(@trees, $i == $j ? tree_starify($new_tree) : $new_tree);
2383
2384	1197	100				1454	if (@trees == 1) {
2385	664					1470	$updates{$i}{$j} = $trees[0];
2386							}
2387							else {
2388	533					667	$updates{$i}{$j} = tree_alt(@trees);
2389							}
2390							}
2391							}
2392	300					541	for my $i (keys(%updates)) {
2393	407					298	for my $j (keys(%{$updates{$i}})) {
	407					744
2394	1197					2046	$$path{$i}{$j} = $$path_tr{$j}{$i} = $updates{$i}{$j};
2395							}
2396							}
2397
2398	300					259	if (TRACE_NFA_TO_TREE) {
2399							my $num_of_updates = map {keys(%{$updates{$_}})} keys(%updates);
2400							print STDERR "k = $k ($num_of_updates updates)\n";
2401							if ($num_of_updates) {
2402							for my $i (sort {$a <=> $b} (keys(%$path))) {
2403							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2404							print STDERR "$i $j: ";
2405							print STDERR tree_dump($$path{$i}{$j}) . "\n";
2406							}}
2407							}
2408							}
2409
2410	300					603	%updates = ();
2411							}
2412
2413	131					154	my $tree;
2414
2415							# accepting empty init
2416	131	100				284	if ($$nfa[0][0]) {
2417
2418	53	100				142	my $path_0_0 = exists($$path{0}{0}) ? $$path{0}{0} : $cc_none;
2419
2420	53	100				93	if ($unique_accepting_state_ind == 0) {
2421	48					53	$tree = $path_0_0;
2422							}
2423							else {
2424	5					9	my $path_0_end = $$path{0}{$unique_accepting_state_ind};
2425
2426	5	50	100			32	if (
			66
2427							$path_0_0 == $cc_none
2428							&& ref($path_0_end) ne CHAR_CLASS
2429							&& $$path_0_end[0]
2430							) {
2431							# starified expression e* does not need (\|e*)
2432	0					0	$tree = $path_0_end;
2433							}
2434							else {
2435							# non-starified expression e needs (\|e)
2436	5					7	$tree = tree_alt($path_0_0, $path_0_end);
2437							}
2438							}
2439							}
2440							else {
2441	78					158	$tree = $$path{0}{$unique_accepting_state_ind};
2442							}
2443
2444	131					107	if (TRACE_NFA_TO_TREE) {
2445							print STDERR "tree: " . tree_dump($tree) . "\n";
2446							}
2447
2448	131					275	_tree_factorize_fixes($tree);
2449
2450	131					146	if (TRACE_NFA_TO_TREE) {
2451							print STDERR "tree (after factorization): " . tree_dump($tree) . "\n";
2452							}
2453	131					1124	return $tree;
2454							}
2455
2456
2457							# Recursively (bottom up) factorizes prefixes and suffixes out from
2458							# alternations if at least one of them contains a sub-tree.
2459							#
2460							# Example 1: (ab1cd\|ab2cd\|ab3cd) -> ab(1\|2\|3)cd
2461							# Example 2: (ab1cd\|ab2cd\|ab3cd) remains the same (no sub-tree)
2462							#
2463							# Example 2 does not need to be factorized
2464							# because it can be represented by a drop-down list,
2465							# which is the primary purpose of this module;
2466							# in this case, a factorization may lead to counter-intuitive results,
2467							# like words cut in the middle.
2468							#
2469							# But example 1 (less common) could only be represented as mere free-text
2470							# if the common pre- and suf-fixes were not factorized out,
2471							# thus loosing information for the input helper (xxx_to_input_constraints).
2472							#
2473							# This behavior can be changed by setting our $FULL_FACTORIZE_FIXES = 1;
2474							# in this case, Example 2 would produce ab(1\|2\|3)cd.
2475							#
2476							# Modifies $tree in place
2477							#
2478							sub _tree_factorize_fixes {
2479	1199			1199		969	my ($tree) = @_;
2480	1199	100	100			4262	if (
			66
			100
			66
			66
2481	628					2277	!defined($tree)
2482							\|\| ref($tree) eq CHAR_CLASS
2483							\|\| @{$$tree[1]} == 0
2484							\|\| !$FULL_FACTORIZE_FIXES
2485							&& (
2486							@{$$tree[1]} == 1
2487							\|\| !grep { ref($_) ne CHAR_CLASS } map { @$_ } @{$$tree[1]}
2488							)
2489							) {
2490	954					1318	return $tree;
2491							}
2492							else {
2493	245					206	for (grep { grep { ref($_) ne CHAR_CLASS } @$_ } @{$$tree[1]} ) {
	564					507
	1503					1762
	245					310
2494	1068					1304	my $tmp_tree =
2495	320					332	tree_concat(map { _tree_factorize_fixes($_) } @$_)
2496							;
2497	320	100	66			1177	if (
			66
2498	319					689	ref($tmp_tree) eq CHAR_CLASS
2499							\|\| $$tmp_tree[0]
2500							\|\| @{$$tmp_tree[1]} > 1
2501							) {
2502	1					3	$_ = [$tmp_tree];
2503							}
2504							else {
2505	319					560	$_ = $$tmp_tree[1][0];
2506							}
2507							}
2508
2509							# flatten
2510	245					460	@{$$tree[1]} = map {
2511	245					289	[map {
2512	1598					1662	ref($_) ne CHAR_CLASS
2513	1					3	&& !$$_[0] && @{$$_[1]} == 1
2514							# non-starified with single alternation
2515	1598	100	100			4477	? @{$$_[1][0]}
2516							: $_
2517	564					543	} grep { defined($_) } @$_]
2518	245					236	} @{$$tree[1]};
2519
2520	245	100				212	if (@{$$tree[1]} == 1) {
	245					462
2521	4					9	return $tree;
2522							}
2523
2524	241					189	my $fst_len = @{$$tree[1][0]};
	241					279
2525	241					230	my ($pre_len, $suf_len) = (0, 0);
2526	241					255	for (1, 0) {
2527	392					482	my ($len_ref, @range)
2528							= $_
2529							? (\$pre_len, (0..$fst_len-1))
2530	482	100				893	: (\$suf_len, map {-$_} (1..$fst_len-$pre_len))
2531							;
2532	482					504	for my $i (@range) {
2533	532	100				451	if (
2534	713	100	100			3705	grep {
2535	532					545	$i >= @$_
2536							\|\| ref($$_[$i]) ne CHAR_CLASS
2537							\|\| $$tree[1][0][$i] != $$_[$i]
2538							}
2539	532					581	@{$$tree[1]}[1..$#{$$tree[1]}]
2540							) {
2541	415					629	last;
2542							}
2543	117					179	$$len_ref++;
2544							}
2545							}
2546	241	100	100			810	if ($pre_len == 0 && $suf_len == 0) {
2547	172					330	return $tree;
2548							}
2549
2550	69					78	my $empty_seen = 0;
2551							my $mid_tree = [
2552							0
2553							, [
2554							map {
2555	157	100				279	if ($pre_len <= $#$_ - $suf_len) {
	69	50				115
2556	104					257	[ @$_[$pre_len..$#$_-$suf_len] ];
2557							}
2558							elsif (!$empty_seen++) {
2559	53					77	[];
2560							}
2561							else {
2562	0					0	();
2563							}
2564							}
2565	69					95	@{$$tree[1]}
2566							]
2567							];
2568	69					98	$$tree[1] = [[
2569	69					165	@{$$tree[1][0]}[0..$pre_len-1]
2570	69					160	, $empty_seen == @{$$tree[1]} ? () : $mid_tree
2571	69	50				124	, @{$$tree[1][0]}[$fst_len-$suf_len..$fst_len-1]
2572							]];
2573	69					172	return $tree;
2574							}
2575							}
2576
2577							=item C
2578
2579							Converts a C<$tree> to an C<$ere> (if C<$to_perlre> is false)
2580							or to a C<$perlre> (if C<$to_perlre> is true).
2581
2582							=cut
2583
2584							sub tree_to_regex {
2585	119	100		119	1	316	my $re = defined($_[0]) ? &_tree_to_regex : '$.';
2586	119	100				1662	return $_[1] ? qr/\A$re\z/ms : "^$re\$";
2587							}
2588
2589							{
2590							my %cc_to_regex_cache;
2591
2592							sub _tree_to_regex {
2593	930			930		996	my ($tree, $to_perlre) = (@_, 0);
2594	930	100	100			1259	if (ref($tree) eq CHAR_CLASS) {
	906	50				1265
		100
2595							return
2596	24		100			108	$cc_to_regex_cache{$tree.$to_perlre}
2597							\|\|= cc_to_regex($tree, $to_perlre)
2598							;
2599							}
2600	906					1739	elsif (@{$$tree[1]} == 0) {
2601	0					0	return '';
2602							}
2603							elsif (
2604	487					1155	@{$$tree[1]} == 1 # single alteration
2605							&& @{$$tree[1][0]} == 1 # single atom
2606							) {
2607	312					333	my $atom = $$tree[1][0][0];
2608	312	50				416	if (ref($atom) eq CHAR_CLASS) {
2609	312	100	100			1374	return join('',
2610							$cc_to_regex_cache{$atom.$to_perlre}
2611							\|\|= cc_to_regex($atom, $to_perlre)
2612							, $$tree[0] ? '*' : ()
2613							);
2614							}
2615							else {
2616	0		0			0	return _tree_to_regex(
2617							[$$tree[0] \|\| $$atom[0], $$atom[1]]
2618							, $to_perlre
2619							);
2620							}
2621							}
2622							else {
2623							my $needs_parenthesis
2624							= @{$$tree[1]} > 1 # (a\|...)
2625	594		66			421	\|\| $$tree[0] && @{$$tree[1][0]} > 1 # (ab...)*
2626							;
2627
2628	2878	100	100			8682	return join(''
2629							, ($needs_parenthesis ? ($to_perlre ? '(?:' : '(') : ())
2630							, (
2631							join('\|',
2632							map {
2633	594					635	join('',
2634							map {
2635	1127					1161	ref($_) eq CHAR_CLASS
2636							? $cc_to_regex_cache{$_.$to_perlre}
2637							\|\|= cc_to_regex($_, $to_perlre)
2638							: _tree_to_regex($_, $to_perlre)
2639							}
2640							@$_ # alternation
2641							)
2642							}
2643	594	100				984	@{$$tree[1]}
		100
		100
		100
2644							)
2645							)
2646							, ($needs_parenthesis ? ')' : ())
2647							, ($$tree[0] ? '*' : ())
2648							);
2649							}
2650							}
2651							}
2652
2653							# starification (regex)*
2654							sub tree_starify {
2655	285			285	0	286	my ($tree) = @_;
2656	285	100				456	if (ref($tree) eq CHAR_CLASS) {
2657	96					314	return [1, [[$tree]]];
2658							}
2659							else {
2660	189					354	return [1, $$tree[1]];
2661							}
2662							}
2663
2664							# The behavior of tree_concat2 can be altered
2665							# by setting $TREE_CONCAT_FULL_EXPAND = 1;
2666							sub tree_concat2 {
2667	2995			2995	0	8407	my ($tree_0, $tree_1) = @_;
2668	2995					2094	my $concat;
2669
2670							# main criteria:
2671							# CHAR_CLASS
2672							# @{$$tree_n[1]} == 0
2673							# $$tree_n[0]
2674							# @{$$tree_n[1]} == 1
2675
2676	2995	100				3732	if (ref($tree_0) eq CHAR_CLASS) {
	2155	100				4170
		100
		100
2677	840	100				1592	if (@$tree_0 == 0) {
		100
		100
		100
2678	5	100	100			14	if (
2679	3					12	ref($tree_1) ne CHAR_CLASS
2680							&& @{$$tree_1[1]} == 0
2681							) {
2682							# () -> empty
2683	1					1	$concat = $cc_none;
2684							}
2685							else {
2686							# ->
2687	4					5	$concat = $tree_1;
2688							}
2689							}
2690	509					1075	elsif (ref($tree_1) eq CHAR_CLASS) {
2691	326	100				425	if (@$tree_1 == 0) {
2692							# a -> a
2693	1					2	$concat = $tree_0;
2694							}
2695							else {
2696							# a b -> (ab)
2697	325					608	$concat = [0, [[ $tree_0, $tree_1 ]]];
2698							}
2699							}
2700							elsif (@{$$tree_1[1]} == 0) {
2701							# a () -> a
2702	1					2	$concat = $tree_0;
2703							}
2704							elsif ($$tree_1[0]) {
2705							# a (b)* -> (a(b)*)
2706	373					754	$concat = [0, [[ $tree_0, $tree_1 ]]];
2707							}
2708							else {
2709	135	100	100			266	if (
2710	519	100				1128	$FULL_FACTORIZE_FIXES
2711	236					319	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2712	133					171	map {@$_} @{$$tree_1[1]}
2713							) {
2714							# a (bc\|de) -> (a(bc\|de))
2715							# one of bcde is starified
2716	42					80	$concat = [0, [[ $tree_0, $tree_1 ]]];
2717							}
2718							else {
2719							# a (bc\|de) -> (abc\|ade)
2720							# none of bcde is starified
2721	161					373	$concat = [
2722							0
2723	93					99	, [ map { [ $tree_0, @$_ ] } @{$$tree_1[1]} ]
	93					136
2724							];
2725							}
2726							}
2727							}
2728							elsif (@{$$tree_0[1]} == 0) {
2729	5	100	100			14	if (
2730	3					9	ref($tree_1) ne CHAR_CLASS
2731							&& @{$$tree_1[1]} == 0
2732							) {
2733							# () () -> empty
2734	1					2	$concat = $cc_none;
2735							}
2736							else {
2737							# () ->
2738	4					2	$concat = $tree_1;
2739							}
2740							}
2741	1884					2394	elsif ($$tree_0[0]) {
2742	266	100				392	if (ref($tree_1) eq CHAR_CLASS) {
	176	100				379
		100
		100
2743	90	100				144	if (@$tree_1 == 0) {
2744							# (a)* -> (a)*
2745	1					2	$concat = $tree_0;
2746							}
2747							else {
2748							# (a)* b -> ((a)*b)
2749	89					222	$concat = [0, [[ $tree_0, $tree_1 ]]];
2750							}
2751							}
2752							elsif (@{$$tree_1[1]} == 0) {
2753							# (a)* () -> (a)*
2754	1					2	$concat = $tree_0;
2755							}
2756	174					268	elsif ($$tree_1[0]) {
2757							# (a)* (b)* -> ((a)(b))
2758	1					2	$concat = [0, [[ $tree_0, $tree_1 ]]];
2759							}
2760							elsif (@{$$tree_1[1]} == 1) {
2761							# (a)* (bcd) -> ((a)*bcd)
2762	100					228	$concat = [
2763							0
2764	100					158	, [[ $tree_0, @{$$tree_1[1][0]} ]]
2765							];
2766							}
2767							else {
2768							# (a)* (b\|c) -> ((a)*(b\|c))
2769	74					149	$concat = [0, [[ $tree_0, $tree_1 ]]];
2770							}
2771							}
2772							elsif (@{$$tree_0[1]} == 1) {
2773	1473	100				1716	if (ref($tree_1) eq CHAR_CLASS) {
	1075	100				1809
		100
		100
		100
2774	398	100				495	if (@$tree_1 == 0) {
2775							# (ab) -> (ab)
2776	1					2	$concat = $tree_0;
2777							}
2778							else {
2779							# (ab) c -> (abc)
2780	397					872	$concat = [
2781							0
2782	397					312	, [[ @{$$tree_0[1][0]}, $tree_1 ]]
2783							];
2784							}
2785							}
2786							elsif (@{$$tree_1[1]} == 0) {
2787							# (ab) () -> (ab)
2788	1					2	$concat = $tree_0;
2789							}
2790	658					968	elsif ($$tree_1[0]) {
2791							# (ab) (c)* -> (ab(c)*)
2792	416					360	$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]];
	416					871
2793							}
2794	1092					1458	elsif (@{$$tree_1[1]} == 1) {
2795							# (ab) (cd) -> (abcd)
2796	298					344	$concat = [
2797							0
2798	298					276	, [[ @{$$tree_0[1][0]}, @{$$tree_1[1][0]} ]]
	298					716
2799							];
2800							}
2801							elsif (
2802	360					431	!grep { ref($_) ne CHAR_CLASS } @{$$tree_0[1][0]}
2803							) {
2804	16	100	66			41	if (
2805	217	100				351	$FULL_FACTORIZE_FIXES
2806	48					62	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2807	16					22	map {@$_} @{$$tree_1[1]}
2808							) {
2809							# (ab) (cd\|ef) -> (ab(cd\|ef))
2810							# neither a nor b is a tree
2811							# one of cdef is starified
2812	5					7	$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]];
	5					11
2813							}
2814							else {
2815							# (ab) (cd\|ef) -> (abcd\|abef)
2816							# neither a nor b is a tree
2817							# none of cdef is starified
2818	34					79	$concat = [
2819							0
2820	11					15	, [ map { [ @{$$tree_0[1][0]}, @$_ ] } @{$$tree_1[1]} ]
	34					25
	11					18
2821							];
2822							}
2823							}
2824							else {
2825							# (ab) (cd\|ef) -> (ab(cd\|ef))
2826							# a or b is a tree
2827	344					291	$concat = [0, [[@{$$tree_0[1][0]} , $tree_1 ]]];
	344					927
2828							}
2829							}
2830							else {
2831	411	100				557	if (ref($tree_1) eq CHAR_CLASS) {
	352	50				650
		100
		100
		50
2832	59	50				97	if (@$tree_1 == 0) {
2833							# (ab\|cd) -> (ab\|cd)
2834	0					0	$concat = $tree_0;
2835							}
2836							else {
2837	59	100	100			135	if (
2838	371	100				678	$FULL_FACTORIZE_FIXES
2839	148					189	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2840	52					69	map {@$_} @{$$tree_0[1]}
2841							) {
2842							# (ab\|cd) e -> ((ab\|cd)e)
2843							# one of abcd is starified
2844	12					31	$concat = [0, [[ $tree_0, $tree_1 ]]];
2845							}
2846							else {
2847							# (ab\|cd) e -> (abe\|cde)
2848							# none of abcd is starified
2849	138					233	$concat = [
2850							0
2851	47					57	, [ map { [@$_, $tree_1] } @{$$tree_0[1]} ]
	47					62
2852							];
2853							}
2854							}
2855							}
2856							elsif (@{$$tree_1[1]} == 0) {
2857							# (ab\|cd) () -> (ab\|cd)
2858	0					0	$concat = $tree_0;
2859							}
2860	85					164	elsif ($$tree_1[0]) {
2861							# (ab\|cd) (e)* -> ((ab\|cd)(e)*)
2862	267					463	$concat = [0, [[ $tree_0, $tree_1 ]]];
2863							}
2864							elsif (
2865							@{$$tree_1[1]} == 1
2866							) {
2867	13	100				18	if (!grep { ref($_) ne CHAR_CLASS } @{$$tree_1[1][0]}) {
	50					79
	13					18
2868	11	100	66			39	if (
2869	145	100				236	$FULL_FACTORIZE_FIXES
2870	30					45	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2871	9					24	map {@$_} @{$$tree_0[1]}
2872							) {
2873							# (ab\|cd) (ef) -> ((ab\|cd)ef)
2874							# e and f both CHAR_CLASS
2875							# one of abcd is starified
2876	2					3	$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]];
	2					7
2877							}
2878							else {
2879							# (ab\|cd) (ef) -> (acef\|cdef)
2880							# e and f both CHAR_CLASS
2881							# none of abcd is starified
2882	30					71	$concat = [
2883							0
2884	9					11	, [ map { [@$_, @{$$tree_1[1][0]}] } @{$$tree_0[1]} ]
	30					55
	9					17
2885							];
2886							}
2887							}
2888							else {
2889							# (ab\|cd) (ef) -> ((ab\|cd)ef)
2890							# e or f is a tree
2891	2					4	$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]];
	2					7
2892							}
2893							}
2894							elsif ($TREE_CONCAT_FULL_EXPAND) {
2895							# (ab\|cd) (ef\|gh) -> (abef\|abgh\|cdef\|cdgh)
2896	0					0	$concat = [
2897							0
2898							, [
2899							map {
2900	0					0	my $alt_0 = $_;
2901	0					0	map { [@$alt_0, @$_] }
	0					0
2902	0					0	@{$$tree_1[1]}
2903							}
2904	0					0	@{$$tree_0[1]}
2905							]
2906							];
2907							}
2908							else {
2909							# (ab\|cd) (ef\|gh) -> ((ab\|cd)(ef\|gh))
2910	72					134	$concat = [0, [[ $tree_0, $tree_1 ]]];
2911							}
2912							}
2913	2995					5373	return $concat;
2914							}
2915
2916							# concatenation regex0regex1...
2917							sub tree_concat {
2918	1476	50		1476	0	2869	if (@_ == 0) {
		100
		50
2919	0					0	return $cc_none; # neutral element: accepting empty string
2920							}
2921	4156					5359	elsif (@_ == 1) {
2922	31					85	return $_[0];
2923							}
2924							elsif (grep {!defined($_)} @_) {
2925	0					0	return undef; # one accepting nothing -> concat accepting nothing
2926							}
2927
2928							# resolve words first
2929	1445					1043	my @word;
2930							my @trees;
2931	1445					1520	for (@_) {
2932	4156	100				4822	if (ref($_) eq CHAR_CLASS) {
2933	1452					1514	push(@word, $_);
2934							}
2935							else {
2936	2704	100				4521	if (@word > 1) {
		100
2937	30					66	push(@trees, [0, [[ @word ]] ] );
2938	30					41	@word = ();
2939							}
2940							elsif (@word) {
2941	438					392	push(@trees, $word[0]);
2942	438					492	@word = ();
2943							}
2944	2704					3004	push(@trees, $_);
2945							}
2946							}
2947	1445	100				2645	if (@word > 1) {
		100
2948	151					299	push(@trees, [0, [[ @word ]] ] );
2949							}
2950							elsif (@word) {
2951	351					371	push(@trees, $word[0]);
2952							}
2953
2954	1445					1267	my $concat = $trees[0];
2955	1445					1829	for my $tree (@trees[1..$#trees]) {
2956	2229					2471	$concat = tree_concat2($concat, $tree);
2957							}
2958
2959	1445					2118	return $concat;
2960							}
2961
2962							# alternation regex0\|regex1\|...
2963							sub tree_alt {
2964	1007			1007	0	843	my @starified_alts;
2965							my @non_starified_alts;
2966	0					0	my $has_empty;
2967
2968	1007					1021	for (grep { defined($_) } @_) {
	1972					2451
2969	1972	100				2400	if (ref($_) eq CHAR_CLASS) {
	1547	50				2822
		100
2970	425					598	push(@non_starified_alts, [$_]);
2971							}
2972							elsif (!@{$$_[1]}) {
2973	0					0	$has_empty = 1;
2974							}
2975							elsif ($$_[0]) {
2976	265					189	push(@starified_alts, @{$$_[1]});
	265					398
2977							}
2978							else {
2979	1282					832	push(@non_starified_alts, @{$$_[1]});
	1282					1745
2980							}
2981							}
2982
2983	1007	100				1455	if (!@starified_alts) {
		100
2984	874	100	66			2101	if (
		50	66
			66
2985	43					140	@non_starified_alts > 1
2986							\|\| $has_empty
2987							\|\| @non_starified_alts && @{$non_starified_alts[0]} > 1
2988							) {
2989							return [
2990	835	50				2899	0
2991							, [
2992							@non_starified_alts
2993							, ($has_empty ? [[0, []]] : ())
2994							]
2995							];
2996							}
2997							elsif (!@non_starified_alts) {
2998	0					0	return undef; # neutral element: accepting nothing
2999							}
3000							else {
3001	39					123	return $non_starified_alts[0][0];
3002							}
3003
3004							}
3005							elsif (!@non_starified_alts) {
3006	132					506	return [1, \@starified_alts];
3007							}
3008							else {
3009							return [
3010	1					4	0
3011							, [
3012							@non_starified_alts
3013							, [[1, \@starified_alts]]
3014							]
3015							];
3016							}
3017							}
3018
3019
3020							# returns an unanchored $ere having exactly the same structure
3021							# as the given $tree. Intended for tracing/debugging.
3022							sub tree_dump {
3023	131			131	0	209	my ($tree) = @_;
3024	131	50				205	if (!defined($_[0])) {
3025							# nothing accepted (not even the empty string)
3026	0					0	return '$.';
3027							}
3028	131	100				192	if (ref($tree) eq CHAR_CLASS) {
	83	100				131
3029	48					59	return cc_to_regex($tree);
3030							}
3031							elsif (@{$$tree[1]} == 0) {
3032	20					54	return '()';
3033							}
3034							else {
3035	63					58	return join(''
3036							, '('
3037							, (
3038							join('\|',
3039							map {
3040	63					76	my $alt = $_;
3041	132					90	join('',
3042							map {
3043	63					55	my $atom = $_;
3044	132	100				178	if (ref($atom) eq CHAR_CLASS) {
3045	126					136	cc_to_regex($atom);
3046							}
3047							else {
3048	6					10	tree_dump($atom);
3049							}
3050							}
3051							@$alt
3052							)
3053							}
3054	63	100				66	@{$$tree[1]}
3055							)
3056							)
3057							, ')'
3058							, ($$tree[0] ? '*' : ())
3059							);
3060							}
3061							}
3062
3063							# Heuristic weight function for the processing order of the warshall algorithm
3064							sub _tree_weight {
3065	1741			1741		1261	my ($tree) = @_;
3066	1741					1240	my $weight = 0;
3067	1741	100				2245	if (ref($tree) eq CHAR_CLASS) {
		50
3068	1476					1333	for (@$tree) {
3069	1692	100				2695	$weight += ($$_[0] == $$_[1] ? 1 : 2);
3070							}
3071							}
3072							elsif (defined($tree)) {
3073	265					215	for (map { @$_ } @{$$tree[1]}) {
	347					501
	265					359
3074	1267					1473	$weight += _tree_weight($_);
3075							}
3076							}
3077	1741					2068	return $weight;
3078							}
3079
3080
3081							##############################################################################
3082							# $input_constraints
3083							##############################################################################
3084
3085							use constant {
3086	8					26526	FREE_TEXT => 'free text'
3087	8			8		75560	};
	8					14
3088
3089							=back
3090
3091							=head2 Input constraints
3092
3093							$input_constraints = [ $input_constraint_0, $input_constraint_1, ... ]
3094							$input_constraint = [ 'word_0', 'word_1', ... ] (drop down)
3095							or 'free_text' (free text)
3096
3097
3098							=over 4
3099
3100							=item C
3101
3102							Converts a C<$tree> to a pair C<($input_constraints, $split_str)>.
3103
3104							C<$split_perlre> is a compiled perl regular expression splitting a string
3105							accordingly to C<$input_constraints>. This C<$perlre> matches if and only if
3106							each drop down can be assigned a value; then C<$str =~ $perlre> in list
3107							context returns as many values as C<@$input_constraints>.
3108
3109							=cut
3110
3111							sub tree_to_input_constraints {
3112	12			12	1	20	my ($input_constraints, $perlres) = &_tree_to_input_constraints;
3113
3114							# concatenate free texts and stronger underlying regexs
3115	12					15	my @previous_undefs;
3116							my @kept;
3117	12					28	for my $i (0..$#$input_constraints) {
3118	36	100				59	if ($$input_constraints[$i] eq FREE_TEXT) {
3119	15					21	push(@previous_undefs, $i);
3120							}
3121							else {
3122	21	100				42	if (@previous_undefs) {
3123	9					12	push(@kept, $i-1);
3124	9	100				22	if (@previous_undefs > 1) {
3125	6					18	$$perlres[$i-1] = join('',
3126	2					3	map { '(?:' . $$perlres[$_] . ')' }
3127							@previous_undefs
3128							);
3129							}
3130	9					16	@previous_undefs = ();
3131							}
3132	21					40	push(@kept, $i);
3133							}
3134							}
3135	12	100				24	if (@previous_undefs) {
3136	1					2	push(@kept, $#$input_constraints);
3137	1	50				3	if (@previous_undefs > 1) {
3138	2					8	$$perlres[$#$input_constraints] = join('',
3139	1					2	map { '(?:' . $$perlres[$_] . ')' }
3140							@previous_undefs
3141							);
3142							}
3143							}
3144	12					41	@$input_constraints = @$input_constraints[@kept];
3145	12					34	@$perlres = @$perlres[@kept];
3146
3147							# sort words, remove duplicates
3148	12					20	for (grep { $_ ne FREE_TEXT } @$input_constraints) {
	31					55
3149	21					21	$_ = [ sort(keys(%{ { map { ($_ => $_) } @$_ } })) ];
	21					24
	37					147
3150							}
3151
3152							# remove empty words
3153							# concatenate single words
3154	12					14	my @previous_singles;
3155	12					15	@kept = ();
3156	12					23	for my $i (0..$#$input_constraints) {
3157	31	100	100			66	if (
		50	33
3158	21					57	$$input_constraints[$i] eq FREE_TEXT
3159							\|\| @{$$input_constraints[$i]} > 1
3160	11					50	) {
3161	20	100				31	if (@previous_singles) {
3162	6					8	push(@kept, $i-1);
3163	6	50				70	if (@previous_singles > 1) {
3164	0					0	$$perlres[$i-1] = join('',
3165	0					0	map { $$perlres[$_] }
3166							@previous_singles
3167							);
3168	0					0	$$input_constraints[$i-1] = join('',
3169	0					0	map { $$input_constraints[$_][0] }
3170							@previous_singles
3171							);
3172							}
3173	6					11	@previous_singles = ();
3174							}
3175	20					29	push(@kept, $i);
3176							}
3177							elsif (
3178							@{$$input_constraints[$i]} == 1
3179							&& length($$input_constraints[$i][0])
3180							) {
3181	11					23	push(@previous_singles, $i);
3182							}
3183							}
3184	12	100				24	if (@previous_singles) {
3185	5					11	push(@kept, $#$input_constraints);
3186	5	50				18	if (@previous_singles > 1) {
3187	0					0	$$perlres[$#$input_constraints] = join('',
3188	0					0	map { $$perlres[$_] }
3189							@previous_singles
3190							);
3191	0					0	$$input_constraints[$#$input_constraints] = join('',
3192	0					0	map { $$input_constraints[$_][0] }
3193							@previous_singles
3194							);
3195							}
3196							}
3197	12					26	@$input_constraints = @$input_constraints[@kept];
3198	12					28	@$perlres = @$perlres[@kept];
3199
3200	12	50				27	if (!@$input_constraints) {
3201	0					0	@$input_constraints = (['']);
3202	0					0	@$perlres = ('');
3203							}
3204
3205	31	100				97	my $split_perlre
3206							= join('',
3207							map {
3208	12					26	$$input_constraints[$_] eq FREE_TEXT
3209							? "($$perlres[$_]\|.*?)"
3210							: "($$perlres[$_])"
3211							}
3212							(0..$#$perlres)
3213							)
3214							;
3215	12					743	return ($input_constraints, qr/\A$split_perlre\z/ms);
3216							}
3217
3218							{
3219
3220							my %cc_to_input_constraint_cache;
3221
3222							# returns ($input_constraints, $perlres)
3223							# two references to arrays of the same size.
3224							sub _tree_to_input_constraints {
3225	38			38		41	my ($tree) = @_;
3226	38					36	my $input_constraints;
3227							my $perlres;
3228	38	50				85	if (!defined($tree)) {
		100
		50
		100
		100
3229							# regex accepting nothing -> free text (always rejected)
3230
3231	0					0	$input_constraints = [FREE_TEXT];
3232	0					0	$perlres = ['$.'];
3233							}
3234	35					83	elsif (ref($tree) eq CHAR_CLASS) {
3235							# single character class -> drop down
3236
3237	3		66			13	$input_constraints = [
3238							$cc_to_input_constraint_cache{$tree}
3239							\|\|= cc_to_input_constraint($tree)
3240							];
3241	3					7	$perlres = [_tree_to_regex($tree, 1)];
3242							}
3243							elsif (@{$$tree[1]} == 0) {
3244							# no top-level alternation
3245
3246	0					0	$input_constraints = [['']];
3247	0					0	$perlres = [_tree_to_regex($tree, 1)];
3248							}
3249	29					50	elsif ($$tree[0]) {
3250							# starified regex -> free text
3251
3252	6					11	$input_constraints = [FREE_TEXT];
3253	6					13	$perlres = [_tree_to_regex($tree, 1)];
3254							}
3255							elsif (@{$$tree[1]} == 1) {
3256							# single top-level alternation -> mixed results
3257							# example: ab*c(d\|e)f
3258
3259	14					13	$input_constraints = [];
3260	14					16	$perlres = [];
3261
3262	14					14	my $i = 0;
3263	14					16	while ($i != @{$$tree[1][0]}) {
	43					90
3264	29					27	my $beg = $i;
3265	29					43	my @expanded_words = ('');
3266	29					25	my $cc;
3267	29		100			27	while (
			66
			66
3268	85					487	$i != @{$$tree[1][0]}
3269							&& ref($cc = $$tree[1][0][$i]) eq CHAR_CLASS
3270							&& (!@$cc \|\| $$cc[-1][1] != MAX_CHAR)
3271							) {
3272	56		66			137	my $input_constraint
3273							= $cc_to_input_constraint_cache{$cc}
3274							\|\|= cc_to_input_constraint($cc)
3275							;
3276
3277							@expanded_words
3278	57					50	= map {
3279	56					60	my $letter = $_;
3280	57					48	map { $_ . $letter }
	61					129
3281							@expanded_words
3282							}
3283							@$input_constraint
3284							;
3285	56					60	$i++;
3286							}
3287	29	100	66			94	if ($beg < $i && length($expanded_words[0])) {
3288	12					38	my $wrd_perlre = _tree_to_regex(
3289							[
3290							0
3291	12					23	, [[ @{$$tree[1][0]}[$beg..$i-1] ]]
3292							]
3293							, 1
3294							);
3295	12					34	push(@$input_constraints, \@expanded_words);
3296	12					18	push(@$perlres, $wrd_perlre);
3297							}
3298	29	100				70	if ($i < @{$$tree[1][0]}) {
	29					67
3299	23					69	my ($sub_input_constraints, $sub_perlres)
3300							= _tree_to_input_constraints($$tree[1][0][$i]);
3301	23	50	66			102	if (
			33
3302							@$sub_input_constraints
3303							&& (
3304							$$sub_input_constraints[0] eq FREE_TEXT
3305							\|\| grep { length($_) } @{$$sub_input_constraints[0]}
3306							)
3307							) {
3308	23					33	push(@$input_constraints, @$sub_input_constraints);
3309	23					28	push(@$perlres, @$sub_perlres);
3310							}
3311	23					45	$i++;
3312							}
3313							}
3314							}
3315							else {
3316							# multiple top-level alternations
3317
3318	15	100				13	if (
3319	279	100	66			948	grep { grep {
	15					24
3320	38					41	ref($_) ne CHAR_CLASS
3321							\|\| (@$_ && $$_[$#$_][1] == MAX_CHAR)
3322							} @$_ }
3323							@{$$tree[1]}
3324							) {
3325							# some alternation contains a sub-tree -> mixed results
3326							# example: abd\|ab*d
3327							# common pre/suf-fixes are factorized out
3328							# example: a(bd\|b*)d
3329
3330	6					6	my $fst_len = @{$$tree[1][0]};
	6					11
3331	6					9	my ($pre_len, $suf_len) = (0, 0);
3332	6					10	for (1, 0) {
3333	16					14	my ($len_ref, @range)
3334							= $_
3335							? (\$pre_len, (0..$fst_len-1))
3336	12	100				29	: (\$suf_len, map {-$_} (1..$fst_len-$pre_len))
3337							;
3338	12					22	for my $i (@range) {
3339	13	100				12	if (
3340	28	100	66			134	grep {
3341	13					16	$i >= @$_
3342							\|\| ref($$_[$i]) ne CHAR_CLASS
3343							\|\| $$tree[1][0][$i] != $$_[$i]
3344							}
3345	13					15	@{$$tree[1]}[0..$#{$$tree[1]}]
3346							) {
3347	3					7	last;
3348							}
3349	10					16	$$len_ref++;
3350							}
3351							}
3352	6	100				12	if ($pre_len) {
3353	2					14	my ($pre_input_constraints, $pre_perlres)
3354							= _tree_to_input_constraints(
3355							[
3356							0
3357	2					5	, [[ @{$$tree[1][0]}[0..$pre_len-1] ]]
3358							]
3359							);
3360	2					5	push(@$input_constraints, @$pre_input_constraints);
3361	2					3	push(@$perlres, @$pre_perlres);
3362							}
3363
3364	6	50				9	if (
3365							my @mid_alts
3366	14					37	= map { [ @$_[$pre_len..$#$_-$suf_len] ] }
	6					11
3367							@{$$tree[1]}
3368							) {
3369	6					12	push(@$input_constraints, FREE_TEXT);
3370	6					14	push(@$perlres, _tree_to_regex([ 0, \@mid_alts ] , 1));
3371							}
3372
3373	6	100				19	if ($suf_len) {
3374	1					4	my ($suf_input_constraints, $suf_perlres)
3375							= _tree_to_input_constraints(
3376							[
3377							0
3378							, [[
3379	1					3	@{$$tree[1][0]}
3380							[$fst_len-$suf_len..$fst_len-1]
3381							]]
3382							]
3383							);
3384	1					3	push(@$input_constraints, @$suf_input_constraints);
3385	1					2	push(@$perlres, @$suf_perlres);
3386							}
3387							}
3388							else {
3389							# each alternation contains only non negated char classes
3390							# -> drop down
3391
3392	9					16	$perlres = [_tree_to_regex($tree, 1)];
3393	9					15	for my $word (@{$$tree[1]}) {
	9					81
3394	24					32	my @expanded_words = ('');
3395	24		66			25	for my $input_constraint (
	224					427
3396							map {
3397							$cc_to_input_constraint_cache{$_}
3398							\|\|= cc_to_input_constraint($_);
3399							}
3400							@$word
3401							) {
3402	224	50				249	if (@$input_constraint == 1) {
3403	224					185	for (@expanded_words) {
3404	224					331	$_ .= $$input_constraint[0];
3405							}
3406							}
3407							else {
3408							@expanded_words
3409	0					0	= map {
3410	0					0	my $letter = $_;
3411	0					0	map { $_ . $letter }
	0					0
3412							@expanded_words
3413							}
3414							@$input_constraint
3415							;
3416							}
3417							}
3418	24					23	push(@{$$input_constraints[0]}, @expanded_words);
	24					55
3419							}
3420							}
3421							}
3422	38					65	return ($input_constraints, $perlres);
3423							}
3424							}
3425
3426							sub cc_to_input_constraint {
3427	28			28	0	29	my ($cc) = @_;
3428	28	50				66	if (@$cc == 0) {
		100
3429	0					0	return [''];
3430							}
3431							elsif ($$cc[$#$cc][1] == MAX_CHAR) {
3432	1					4	return FREE_TEXT;
3433							}
3434							else {
3435							return [
3436	27					40	map { map { chr($_) } ($$_[0]..$$_[1]) }
	27					39
	28					293
3437							@$cc
3438							];
3439							}
3440							}
3441
3442
3443							##############################################################################
3444							# $ere
3445							##############################################################################
3446
3447							=back
3448
3449							=head2 Ere
3450
3451							An C<$ere> is a perl string.
3452
3453							The syntax an C<$ere> is assumed to follow is based on POSIX ERE
3454							(else the C routines will C).
3455
3456							Unsupported POSIX features:
3457							back-references,
3458							equivalence classes C<[[=a=]]>,
3459							character class C<[[:digit:]]>,
3460							collating symbols C<[[.ch.]]>.
3461
3462							C<)> is always a special character. POSIX says that C<)> is a normal
3463							character if there is no matching C<(>.
3464
3465							There is no escape sequences such as C<\t> for tab or C<\n> for line feed.
3466							POSIX does not specify such escape sequences neither.
3467
3468							C<\> before a non-special character is ignored
3469							(except in bracket expressions). POSIX does not allow it.
3470
3471							The empty string is legal in alternations (C<(\|a)> is equivalent to C<(a?)>).
3472							POSIX does not allow it.
3473							The C<(\|a)> form is generated by the C routines
3474							(avoiding quantifiers other than C<*>).
3475
3476							C<[a-l-z]> is interpreted as C<([a-l] \| - \| z)> (but it is discouraged to
3477							rely upon this implementation artefact). POSIX says that the interpretation
3478							of this construct is undefined.
3479
3480							In bracket expressions, C<\> is a normal character,
3481							thus C<]> as character must occur first, or second after a C<^>
3482							(POSIX compliant, but possibly surprising for perl programmers).
3483
3484							All unicode characters supported by perl are allowed as literal characters.
3485
3486							=over 4
3487
3488							=item C
3489
3490							Parses an C<$ere> to a C<$nfa>.
3491
3492							WARNING: the parsing routines, in particular C,
3493							C on syntax errors; thus the caller may want to eval-trap such errors.
3494
3495							=cut
3496
3497							sub ere_to_nfa {
3498	215			215	1	29574	my ($ere, $has_anchor_ref) = @_;
3499
3500							# optimize very first and very last anchors
3501	215					999	my $has_beg_anchor = $ere =~ s/^\^+//;
3502	215					655	my $has_end_anchor = $ere =~ s/\$+$//;
3503
3504	215					279	$$has_anchor_ref = 0;
3505	215					196	my @alternation_nfas;
3506	215					251	do {
3507	215					432	push(@alternation_nfas, parse_alternation(\$ere, $has_anchor_ref));
3508							} while($ere =~ /\G \\| /xmsgc);
3509
3510	215	50	100			683	if ((pos($ere) \|\| 0) != length($ere)) {
3511	0					0	parse_die("unexpected character", \$ere);
3512							}
3513
3514	215					190	my $nfa;
3515	215	100	100			516	if (!$has_beg_anchor && !$has_end_anchor) {
3516							# a\|b\|c => ^.(a\|b\|c).$
3517
3518	10	50				66	$nfa = nfa_concat(
3519							[[1, [[$cc_any, 0]]]]
3520							, @alternation_nfas == 1
3521							? $alternation_nfas[0]
3522							: nfa_union(@alternation_nfas)
3523							, [[1, [[$cc_any, 0]]]]
3524							);
3525							}
3526							else {
3527	205					505	for my $alternation_nfa (@alternation_nfas[1..$#alternation_nfas-1]) {
3528	0					0	$alternation_nfa = nfa_concat(
3529							[[1, [[$cc_any, 0]]]]
3530							, $alternation_nfa
3531							, [[1, [[$cc_any, 0]]]]
3532							);
3533							}
3534	205	100	66			839	if (!$has_beg_anchor \|\| @alternation_nfas > 1) {
3535	7	50				41	$alternation_nfas[0] = nfa_concat(
		50
3536							!$has_beg_anchor ? [[1, [[$cc_any, 0]]]] : ()
3537							, $alternation_nfas[0]
3538							, @alternation_nfas > 1 ? [[1, [[$cc_any, 0]]]] : ()
3539							);
3540							}
3541	205	100	66			726	if (!$has_end_anchor \|\| @alternation_nfas > 1) {
3542	2	50				14	$alternation_nfas[-1] = nfa_concat(
		50
3543							@alternation_nfas > 1 ? [[1, [[$cc_any, 0]]]] : ()
3544							, $alternation_nfas[-1]
3545							, !$has_end_anchor ? [[1, [[$cc_any, 0]]]] : ()
3546							);
3547							}
3548							$nfa
3549	205	50				384	= @alternation_nfas == 1
3550							? $alternation_nfas[0]
3551							: nfa_union(@alternation_nfas)
3552							;
3553							}
3554
3555	215	100				774	return $$has_anchor_ref ? nfa_resolve_anchors($nfa) : $nfa;
3556							}
3557
3558							sub _ere_to_nfa {
3559	238			238		240	my ($str_ref, $has_anchor_ref) = @_;
3560
3561	238					219	my @alternation_nfas;
3562	238					202	do {
3563	385					591	push(@alternation_nfas, parse_alternation($str_ref, $has_anchor_ref));
3564							} while($$str_ref =~ /\G \\| /xmsgc);
3565
3566							return
3567	238	100				602	@alternation_nfas == 1
3568							? $alternation_nfas[0]
3569							: nfa_union(@alternation_nfas)
3570							;
3571							}
3572
3573							sub bracket_expression_to_cc {
3574	102			102	0	95	my ($str_ref) = @_;
3575	102					162	my $neg = $$str_ref =~ /\G \^/xmsgc;
3576	102					99	my $interval_list = [];
3577
3578							# anything is allowed a first char, in particular ']' and '-'
3579	102	100				347	if ($$str_ref =~ /\G (.) - ([^]]) /xmsgc) {
		50
3580	16					39	push(@$interval_list, [ord($1), ord($2)]);
3581							}
3582							elsif ($$str_ref =~ /\G (.) /xmsgc) {
3583	86					209	push(@$interval_list, [ord($1), ord($1)]);
3584							}
3585
3586	102					112	my $loop = 1;
3587	102					392	while ($loop) {
3588	177	50				420	if ($$str_ref =~ /\G ([^]]) - ([^]]) /xmsgc) {
		100
3589	0					0	push(@$interval_list, [ord($1), ord($2)]);
3590							}
3591							elsif ($$str_ref =~ /\G ([^]]) /xmsgc) {
3592	75					177	push(@$interval_list, [ord($1), ord($1)]);
3593							}
3594							else {
3595	102					188	$loop = 0;
3596							}
3597							}
3598
3599							return
3600	102	100				249	$neg
3601							? cc_neg(interval_list_to_cc($interval_list))
3602							: interval_list_to_cc($interval_list)
3603							;
3604							}
3605
3606							# Returns:
3607							# - the empty list iff no quantification has been parsed
3608							# - a 2-tuple ($min, $max)
3609							# either $max is the empty string
3610							# or $min <= $max
3611							sub parse_quant {
3612	271			271	0	279	my ($str_ref) = @_;
3613	271	100				723	if ($$str_ref =~ /\G \* /xmsgc) {
		100
		100
		50
3614	238					623	return (0, '');
3615							}
3616							elsif ($$str_ref =~ /\G \+ /xmsgc) {
3617	16					38	return (1, '');
3618							}
3619							elsif ($$str_ref =~ /\G \? /xmsgc) {
3620	8					20	return (0, 1);
3621							}
3622							elsif ($$str_ref =~ /\G \{ /xmsgc) {
3623	9					12	my ($min, $max);
3624	9	50				26	if ($$str_ref =~ /\G ( [0-9]+ ) /xmsgc) {
3625	9					18	$min = $1;
3626	9	100				26	if ($$str_ref =~ /\G , ([0-9]*) /xmsgc) {
3627	8					12	$max = $1; # may be ''
3628	8	50	66			44	if (length($max) && $min > $max) {
3629	0					0	parse_die("$min > $max", $str_ref);
3630							}
3631							}
3632							else {
3633	1					2	$max = $min;
3634							}
3635							}
3636							else {
3637	0					0	parse_die('number expected', $str_ref);
3638							}
3639
3640	9	50				33	if ($$str_ref !~ /\G \} /xmsgc) {
3641	0					0	parse_die('} expected', $str_ref);
3642							}
3643	9					25	return ($min, $max);
3644							}
3645							else {
3646	0					0	return;
3647							}
3648							}
3649
3650							=item quote($string)
3651
3652							Returns $string with escaped special characters.
3653
3654							=cut
3655
3656							sub quote {
3657	0			0	1	0	my ($str) = @_;
3658	0					0	$str =~ s/([.\[\\(*+?{\|^\$])/\\$1/xsmg;
3659	0					0	return $str;
3660							}
3661
3662							{
3663							my %char_to_cc_cache;
3664							sub parse_alternation {
3665	600			600	0	557	my ($str_ref, $has_anchor_ref) = @_;
3666	600					478	my @all_nfas;
3667							my $loop;
3668	0					0	my @quants;
3669	600					482	do {
3670	1005					861	$loop = 0;
3671	1005					1031	my $nfa = [];
3672	1005					856	my $next_state_index = 1;
3673	1005					735	while (1) {
3674	1631	100				6900	if ($$str_ref =~ /\G ( $ERE_literal + ) /xmsogc) {
		100
		100
		100
		100
		100
3675	1146		66			3906	push(@$nfa,
3676							map {
3677	437					1237	[ 0, [[
3678							$char_to_cc_cache{$_} \|\|= char_to_cc($_)
3679							, $next_state_index++
3680							]]]
3681							}
3682							split('', $1)
3683							);
3684							}
3685							elsif ($$str_ref =~ /\G ( \. + ) /xmsgc) {
3686	53					159	push(@$nfa,
3687							map {
3688	52					144	[ 0, [[
3689							$cc_any
3690							, $next_state_index++
3691							]]]
3692							}
3693							(1..length($1))
3694							);
3695							}
3696							elsif ($$str_ref =~ /\G ( \[ ) /xmsgc) {
3697	102					185	push(@$nfa,
3698							[ 0, [[
3699							bracket_expression_to_cc($str_ref)
3700							, $next_state_index++
3701							]]]
3702							);
3703	102	50				355	if ($$str_ref !~ /\G ] /xmsgc) {
3704	0					0	parse_die('] expected', $str_ref);
3705							}
3706							}
3707							elsif ($$str_ref =~ /\G \\ (.) /xmsgc) {
3708	17		66			83	push(@$nfa,
3709							[ 0, [[
3710							$char_to_cc_cache{$1} \|\|= char_to_cc($1)
3711							, $next_state_index++
3712							]]]
3713							);
3714							}
3715							elsif ($$str_ref =~ /\G \^ /xmsgc) {
3716	9					25	push(@$nfa,
3717							[ 0, [[
3718							$cc_beg
3719							, $next_state_index++
3720							]]]
3721							);
3722	9		100			26	$$has_anchor_ref \|\|= 1;
3723							}
3724							elsif ($$str_ref =~ /\G \$ /xmsgc) {
3725	9					28	push(@$nfa,
3726							[ 0, [[
3727							$cc_end
3728							, $next_state_index++
3729							]]]
3730							);
3731	9		100			35	$$has_anchor_ref \|\|= 1;
3732							}
3733							else {
3734	1005					967	last;
3735							}
3736							}
3737
3738	1005	100				1577	if (@$nfa) {
3739	500	100				1025	if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) {
3740	175					339	my $last_char_class = $$nfa[$#$nfa][1][0][0];
3741	175	100				331	if (@$nfa > 1) {
3742	82					92	@{$$nfa[$#$nfa]} = (1, []);
	82					188
3743	82					108	push(@all_nfas, $nfa);
3744							}
3745	175					339	push(@quants, [scalar(@all_nfas), parse_quant($str_ref)]);
3746	175					484	push(@all_nfas, [[0, [[$last_char_class, 1 ]]], [1, []]]);
3747	175					218	$loop = 1;
3748							}
3749							else {
3750	325					545	push(@$nfa, [1, []]);
3751	325					364	push(@all_nfas, $nfa);
3752							}
3753							}
3754
3755	1005	100				2633	if ($$str_ref =~ /\G \( /xmsgc) {
3756	238					422	$nfa = _ere_to_nfa($str_ref, $has_anchor_ref);
3757	238	50				836	if ($$str_ref !~ /\G \) /xmsgc) {
3758	0					0	parse_die(') expected', $str_ref);
3759							}
3760	238	100				600	if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) {
3761	96					191	push(@quants, [scalar(@all_nfas), parse_quant($str_ref)]);
3762							}
3763	238					277	push(@all_nfas, $nfa);
3764	238					456	$loop = 1;
3765							}
3766							} while ($loop);
3767
3768	600					791	for (@quants) {
3769	271					406	my ($i, $min, $max) = @$_;
3770	271		66			1355	$all_nfas[$i] = nfa_quant(
			66
3771							$all_nfas[$i]
3772							, $min, $max
3773							, $min && $i != 0 && _nfa_has_suffix($all_nfas[$i-1])
3774							, $min && $i != $#all_nfas && _nfa_has_prefix($all_nfas[$i+1])
3775							);
3776							}
3777
3778	600	100				1060	if (@all_nfas > 1) {
		100
3779	209					352	return nfa_concat(@all_nfas);
3780							}
3781							elsif (@all_nfas) {
3782	310					979	return $all_nfas[0];
3783							}
3784							else {
3785	81					290	return [[1, []]];
3786							}
3787							}
3788							}
3789
3790							sub _nfa_has_prefix {
3791	19			19		56	my ($nfa) = @_;
3792							# initial state non-accepting or no loop back to it
3793	19		33			99	!$$nfa[0][0] \|\| !grep { $$_[1] == 0 } map { @{$$_[1]} } @$nfa;
3794							}
3795
3796							sub _nfa_has_suffix {
3797	23			23		20	my ($nfa) = @_;
3798							# all accepting states are final
3799	23	100				28	!grep { $$_[0] && @{$$_[1]} } @$nfa
	55					108
	23					167
3800							}
3801
3802							sub parse_die {
3803	0			0	0	0	my ($msg, $str_ref) = @_;
3804	0		0			0	die("malformed regex: $msg at "
3805							. (pos($$str_ref) \|\| 0) . " in $$str_ref");
3806							}
3807
3808
3809							##############################################################################
3810							# Shorthands
3811							##############################################################################
3812
3813							=back
3814
3815							=head2 Shorthands
3816
3817							=over 4
3818
3819							=item C
3820							:= C
3821
3822							=cut
3823
3824							sub ere_to_tree {
3825	0			0	1	0	my ($ere) = @_;
3826	0					0	return nfa_to_tree(ere_to_nfa($ere));
3827							}
3828
3829							=item C
3830							:= C
3831
3832							=cut
3833
3834							sub ere_to_regex {
3835	0			0	1	0	my ($ere, $to_perlre) = (@_, 0);
3836	0					0	return tree_to_regex(ere_to_tree($ere), $to_perlre);
3837							}
3838
3839							=item C
3840							:= C
3841
3842							=cut
3843
3844							sub nfa_to_regex {
3845	119			119	1	298	my ($nfa, $to_perlre) = (@_, 0);
3846	119					265	return tree_to_regex(nfa_to_tree($nfa), $to_perlre);
3847							}
3848
3849							=item C
3850							:= C
3851
3852							=cut
3853
3854							sub ere_to_input_constraints {
3855	0			0	1	0	my ($ere) = @_;
3856	0					0	return tree_to_input_constraints(ere_to_tree($ere));
3857							}
3858
3859							=item C
3860							:= C
3861
3862							=cut
3863
3864							sub nfa_to_input_constraints {
3865	12			12	1	81	my ($nfa) = @_;
3866	12					27	return tree_to_input_constraints(nfa_to_tree($nfa));
3867							}
3868
3869							=item C
3870							:= C
3871
3872							=cut
3873
3874							sub nfa_to_min_dfa {
3875	207			207	1	28396	my ($nfa) = @_;
3876	207					426	return dfa_to_min_dfa(nfa_to_dfa($nfa));
3877							}
3878
3879							1;
3880
3881							=back
3882
3883							=head1 AUTHOR
3884
3885							Loïc Jonas Etienne
3886
3887							=head1 COPYRIGHT and LICENSE
3888
3889							Artistic License 2.0
3890							http://www.perlfoundation.org/artistic_license_2_0