File Coverage

blib/lib/Regexp/ERE.pm

Criterion	Covered	Total	%
statement	1296	1412	91.7
branch	541	632	85.6
condition	255	319	79.9
subroutine	59	65	90.7
pod	23	42	54.7
total	2174	2470	88.0

line	stmt	bran	cond	sub	pod	time	code
1	8			8		9634	use 5.008008;
	8					25
	8					318
2	8			8		38	use strict;
	8					13
	8					226
3	8			8		38	use warnings;
	8					14
	8					213
4	8			8		7101	use integer;
	8					114
	8					36
5
6							package Regexp::ERE;
7							our $VERSION = '0.03';
8
9							BEGIN {
10	8			8		439	use Exporter ();
	8					12
	8					550
11	8			8		15	our (@ISA, @EXPORT_OK);
12	8					119	@ISA = qw(Exporter);
13	8					682	@EXPORT_OK = qw(
14							&ere_to_nfa
15							&ere_to_tree
16							&ere_to_regex
17							&ere_to_input_constraints
18							&nfa_to_tree
19							&nfa_to_regex
20							&nfa_to_input_constraints
21							&nfa_clone
22							&nfa_concat
23							&nfa_union
24							&nfa_inter
25							&nfa_match
26							&nfa_quant
27							&nfa_isomorph
28							&nfa_to_dfa
29							&dfa_to_min_dfa
30							&nfa_to_min_dfa
31							&tree_to_regex
32							&tree_to_input_constraints
33							&char_to_cc
34							&interval_list_to_cc
35							&cc_union
36							"e
37							);
38							}
39
40							=encoding utf8
41
42							=head1 NAME
43
44							Regexp::ERE - extended regular expressions and finite automata
45
46							=head1 SYNOPSIS
47
48							use Regexp::ERE qw(
49							&ere_to_nfa
50							&nfa_inter
51							&nfa_to_regex
52							&nfa_to_input_constraints
53							&nfa_to_dfa
54							&dfa_to_min_dfa
55							);
56
57							# condition 1: begins with abc or def
58							my $nfa1 = ere_to_nfa('^(abc\|def)');
59
60							# condition 2: ends with 123 or 456
61							my $nfa2 = ere_to_nfa('(123\|456)$');
62
63							# condition 1 and condition 2
64							my $inter_nfa = nfa_inter($nfa1, $nfa2);
65
66							# compute extended regular expression (string)
67							my $ere = nfa_to_regex($inter_nfa);
68
69							# compute perl regular expression
70							my $perlre = nfa_to_regex($inter_nfa, 1);
71
72							# compute weaker input constraints suitable for widgets
73							my ($input_constraints, $split_perlre)
74							= nfa_to_input_constraints($inter_nfa);
75
76							# minimal dfa (simpler regular expression happens to result)
77							my $nfa3 = ere_to_nfa('^(a\|ab\|b)*$');
78							my $dfa3 = nfa_to_dfa($nfa3);
79							my $min_dfa3 = dfa_to_min_dfa($dfa3);
80							my $ere3 = nfa_to_regex($min_dfa3);
81
82							=head1 DESCRIPTION
83
84							Pure-perl module for:
85
86							=over 4
87
88							=item *
89
90							Parsing POSIX Extended Regular Expressions (C<$ere>) into
91							Non-Deterministic Finite Automata (C<$nfa>)
92
93							=item *
94
95							Manipulating C<$nfa>s (concatenating, or-ing, and-ing)
96
97							=item *
98
99							Computing Deterministic Finite Automata (C<$dfa>s) from C<$nfa>s
100							(powerset construction)
101
102							=item *
103
104							Computing minimal C<$dfa>s from C<$dfa>s (Hopcroft's algorithm)
105
106							=item *
107
108							Computing C<$ere>s or Perl Regular Expressions from C<$nfa> or C<$dfa>
109							(Warshall algorithm)
110
111							=item *
112
113							Heuristically deriving (possibly weaker) constraints from a C<$nfa> or C<$dfa>
114							suitable for display in a graphical user interface,
115							i.e. a sequence of widgets of type 'free text' and 'drop down';
116
117							Example: C<'^(abc\|def)'> => C<$nfa> => C<[['abc', 'def'], 'free text']>
118
119							=back
120
121							=head1 GLOSSARY AND CONVERSIONS OVERVIEW
122
123							=head2 Conversions overview
124
125							$ere -> $nfa -> $tree -> $regex ($ere or $perlre)
126							-> $input_constraints
127
128							The second argument of -> $regex conversions is an optional boolean,
129							true : conversion to a compiled perl regular expression
130							false: conversion to an ere string
131
132							The -> $input_constraints conversions return a pair (
133							$input_constraints: aref as described at tree_to_input_constraints()
134							$split_perlre : a compiled perl regular expression
135							)
136
137
138							=head2 Glossary
139
140							=over 4
141
142							=item $char_class
143
144							A set of unicode characters.
145
146							=item $ere
147
148							Extended regular expression (string).
149							See C for the exact syntax.
150
151							=item $perlre
152
153							Perl regular expression
154
155							=item $nfa
156
157							Non-deterministic finite automaton
158
159							=item $dfa
160
161							Deterministic finite automaton (special case of C<$nfa>)
162
163							=item $tree
164
165							Intermediate hierarchical representation of a regular expression
166							(which still can be manipulated before stringification),
167							similar to a parse tree (but used for generating, not for parsing).
168
169							=item $input_constraints
170
171							Ad-hoc data structure representing a list of gui-widgets
172							(free text fields and drop-down lists),
173							a helper for entering inputs
174							conforming to a given C<$nfa>.
175
176							=back
177
178							=cut
179
180
181							##############################################################################
182							# Config
183							##############################################################################
184
185							# If true, nfa_to_tree() always expands concatenated alternations.
186							# Example: (ab\|cd) (ef\|gh) -> (abef\|abgh\|cdef\|cdgh)
187							our $TREE_CONCAT_FULL_EXPAND = 0;
188
189							# If true, prefixes and suffixes are factorized out even for
190							# trees with a single alternation.
191							# Example: (a1b\|a2b) -> a(1\|2)b
192							our $FULL_FACTORIZE_FIXES = 0;
193
194							# Should be 0. Else, traces nfa_to_tree() on STDERR.
195							use constant {
196	8					682	TRACE_NFA_TO_TREE => 0
197	8			8		40	};
	8					41
198
199							use constant {
200	8					1931	MAX_CHAR => 0x10FFFF
201							, CHAR_CLASS => 'cc' # for blessing $char_classes (label only, no methods)
202	8			8		36	};
	8					19
203
204
205							=head1 DATA STRUCTURES AND SUBROUTINES
206
207							Each of the documented subroutines can be imported,
208							for instance C.
209
210							=cut
211
212
213							##############################################################################
214							# $char_class
215							##############################################################################
216
217							=head2 Character class
218
219
220							WARNING: C<$char_class>es must be created exclusively by
221							C
222							or C
223							for equivalent character classes to be always the same array reference.
224							For the same reason, C<$char_class>es must never be mutated.
225
226							In this implementation, the state transitions of a C<$nfa> are based upon
227							character classes (not single characters). A character class is an ordered
228							list of disjoint, non-mergeable intervals (over unicode code points,
229							i.e. positive integers).
230
231							$char_class = [
232							[ $low_0, $high_0 ] # $interval_0
233							, [ $low_1, $high_1 ] # $interval_1
234							, ...
235							]
236
237
238							Constraints:
239
240							1: 0 <= $$char_class[$i][0] (0 <= low)
241							2: $$char_class[$i][1] <= MAX_CHAR (high <= MAX_CHAR)
242							3: $$char_class[$i][0] <= $$char_class[$i][1] (low <= high)
243							4: $$char_class[$i][1] + 1 < $$char_class[$i+1][0] (non mergeable)
244
245
246							Exceptions (anchors used only in the parsing phase only):
247
248							begin : [ -2, -1 ]
249							end : [ -3, -2 ]
250							begin or end : [ -3, -1 ]
251
252							Immediately after parsing, such pseudo-character classes
253							are removed by C (internal subroutine).
254
255							=over 4
256
257							=cut
258
259							our $ERE_literal = qr/ [^.[\\()*+?{\|^\$] /xms;
260							our $PERLRE_char_class_special = qr/ [\[\]\\\^\-] /xms;
261
262							our $cc_any = bless([[ 0, MAX_CHAR ]], CHAR_CLASS);
263							our $cc_none = bless([], CHAR_CLASS);
264							our $cc_beg = bless([[ -2, -1]], CHAR_CLASS);
265							our $cc_end = bless([[ -3, -2]], CHAR_CLASS);
266							{
267
268	8			8		41	no warnings qw(utf8); # in particular for 0x10FFFF
	8					11
	8					21105
269
270							my %cc_cache;
271							# keys: join(',',1,map{@$_}@{$char_class})
272
273							for ($cc_any, $cc_none, $cc_beg, $cc_end) {
274							$cc_cache{ join(',', 1, map {@$_} @$_) } = $_;
275							}
276
277							=item char_to_cc($c)
278
279							Returns the unique C<$char_class> equivalent to C<[[ord($c), ord($c)]]>.
280
281							=cut
282
283							sub char_to_cc {
284	857		100	857	1	592237	return $cc_cache{ join(',', 1, (ord($_[0])) x 2) }
285							\|\|= bless([[ord($_[0]), ord($_[0])]], CHAR_CLASS);
286							}
287
288							# $interval_list is the same data structure as $char_class.
289							# Constraints 1, 2 are assumed.
290							# Constraints 3, 4 are enforced.
291
292							=item interval_list_to_cc($interval_list)
293
294							C<$interval_list> is an arbitrary list of intervals.
295							Returns the unique C<$char_class> whose reunion of intervals
296							is the same set as the reunion of the intervals of C<$interval_list>.
297
298							Example:
299
300							interval_list_to_cc([[102, 112], [65, 90], [97, 102], [113, 122]])
301							returns [[65, 90], [97, 122]]
302							(i.e [f-p]\|[A-Z]\|[a-f]\|[q-z] => [A-Z]\|[a-z])
303
304							Note that both C<$interval_list> and C<$char_class> are lists of intervals,
305							but only C<$char_class> obeys the constraints above,
306							while C<$interval_list> does not.
307
308							Remark also that C is the identity
309							(returns the same reference as given) on C<$char_class>es returned
310							by either C or C.
311
312							=cut
313
314							sub interval_list_to_cc {
315	6016			6016	1	6975	my ($interval_list) = @_;
316							my @sorted
317	7330					9926	= sort { $$a[0] <=> $$b[0] }
	9974					23702
318	6016					7254	grep { $$_[0] <= $$_[1] }
319							@$interval_list
320							;
321	6016					10822	my $char_class = bless([], CHAR_CLASS);
322	6016					6555	my $i = 0;
323	6016					11867	while ($i != @sorted) {
324	6921					8264	my $interval = $sorted[$i];
325	6921					6486	$i++;
326	6921		100			20895	while ($i != @sorted && $$interval[1] + 1 >= $sorted[$i][0]) {
327	3053	100				6173	if ($$interval[1] < $sorted[$i][1]) {
328	3052					4505	$$interval[1] = $sorted[$i][1];
329							}
330	3053					10680	$i++;
331							}
332	6921					17845	push(@$char_class, $interval);
333							}
334	6016		66			8821	return $cc_cache{ join(',', 1, map {@$_} @$char_class) }
	6921					37245
335							\|\|= $char_class;
336							}
337
338							sub cc_neg {
339	1260			1260	0	1558	my ($char_class) = @_;
340
341	1260	100				2519	if (!@$char_class) { return $cc_any; }
	85					531
342
343	1175					2501	my $neg = bless([], CHAR_CLASS);
344	1175	100				2661	if ($$char_class[0][0] != 0) {
345	1107					2647	push(@$neg, [0, $$char_class[0][0] - 1]);
346							}
347	1175					1305	my $i = 0;
348	1175					2548	while ($i != $#$char_class) {
349	194					597	push(@$neg, [$$char_class[$i][1] + 1, $$char_class[$i+1][0] - 1]);
350	194					455	$i++;
351							}
352	1175	100				2453	if ($$char_class[$i][1] != MAX_CHAR) {
353	1107					2481	push(@$neg, [$$char_class[$i][1] + 1, MAX_CHAR]);
354							}
355	1175		66			1645	return $cc_cache{ join(',', 1, map{@$_} @$neg) } \|\|= $neg;
	2408					17548
356							}
357
358							sub cc_inter2 {
359	440			440	0	550	my ($char_class_0, $char_class_1) = @_;
360
361	440					845	my $inter = bless([], CHAR_CLASS);
362	440					518	my $i_0 = 0;
363	440					508	my $i_1 = 0;
364	440		100			1793	while ($i_0 < @$char_class_0 && $i_1 < @$char_class_1) {
365
366							# skip interval_0 if interval_0 < interval_1
367	543		66			3388	while (
			100
368							$i_0 < @$char_class_0
369							&& $i_1 < @$char_class_1
370							&& $$char_class_0[$i_0][1] < $$char_class_1[$i_1][0]
371							) {
372	367					1875	$i_0++;
373							}
374
375							# skip interval_1 if interval_1 < interval_0
376	543		100			2964	while (
			100
377							$i_0 < @$char_class_0
378							&& $i_1 < @$char_class_1
379							&& $$char_class_1[$i_1][1] < $$char_class_0[$i_0][0]
380							) {
381	192					865	$i_1++;
382							}
383
384							# Check that the exit condition of the first while still holds.
385	543	100	100			3126	if (
			100
386							$i_0 < @$char_class_0
387							&& $i_1 < @$char_class_1
388							&& $$char_class_1[$i_1][0] <= $$char_class_0[$i_0][1]
389							) {
390							# The exit conditions of both whiles hold:
391							#
392							# $$char_class_0[$i_0][1] >= $$char_class_1[$i_1][0]
393							# && $$char_class_1[$i_1][1] >= $$char_class_0[$i_0][0]
394							#
395							# short:
396							# high_0 >= low_1
397							# high_1 >= low_0
398							#
399							# furthermore:
400							# high_0 >= low_0
401							# high_1 >= low_1
402							#
403							# with:
404							# min_high := min(high_0, high_1)
405							# max_low := max(low_0, low_1)
406							#
407							# holds:
408							# min_high >= max_low_0
409
410	291					281	my ($interval_0_done, $interval_1_done);
411
412	291	100				709	my $max_low =
413							$$char_class_0[$i_0][0] > $$char_class_1[$i_1][0]
414							? $$char_class_0[$i_0][0]
415							: $$char_class_1[$i_1][0]
416							;
417
418	291					284	my $min_high;
419	291	100				637	if ($$char_class_0[$i_0][1] <= $$char_class_1[$i_1][1]) {
420	186					243	$min_high = $$char_class_0[$i_0][1];
421							# interval_0 < next interval_1
422	186					232	$interval_0_done = 1;
423							}
424	291	100				570	if ($$char_class_1[$i_1][1] <= $$char_class_0[$i_0][1]) {
425	208					254	$min_high = $$char_class_1[$i_1][1];
426							# interval_1 < next interval_0
427	208					223	$interval_1_done = 1;
428							}
429	291	100				909	if ($interval_0_done) { $i_0++; }
	186					188
430	291	100				458	if ($interval_1_done) { $i_1++; }
	208					212
431
432	291					1580	push(@$inter, [$max_low, $min_high]);
433							}
434							}
435	440		33			1949	return $cc_cache{ join(',', 1, map{@$_} @$inter) } \|\|=$inter;
	291					2041
436							}
437							}
438
439							sub cc_match {
440	117618			117618	0	141405	my ($char_class, $c) = @_;
441	117618					140752	for my $interval (@$char_class) {
442	138702	100				342193	if ($c < $$interval[0]) {
		100
443	46713					132983	return 0;
444							}
445							elsif ($c <= $$interval[1]) {
446	27658					85547	return 1;
447							}
448							}
449	43247					117985	return 0;
450							}
451
452							=item cc_union(@char_classes)
453
454							Returns the unique C<$char_class> containing all characters of all given
455							C<@char_classes>.
456
457							=cut
458
459							sub cc_union {
460	2093			2093	1	4569	return interval_list_to_cc( [ map { map { [@$_] } @$_ } @_ ] );
	4951					5578
	5475					15325
461							}
462
463							sub cc_is_subset {
464	55			55	0	64	my ($char_class_0, $char_class_1) = @_;
465	55					82	for my $c ( map { @$_ } @$char_class_0 ) {
	58					152
466	88	100				142	if (!cc_match($char_class_1, $c)) { return 0; }
	31					102
467							}
468	24					114	return 1;
469							}
470
471							# $to_perlre (boolean)
472							# true : perl syntax
473							# false: ere syntax
474							sub cc_to_regex {
475	564			564	0	952	my ($char_class, $to_perlre) = (@_, 0);
476
477	564					627	my @items;
478	564	50	66			2462	if (@$char_class && $$char_class[0][0] < 0) {
479	0	0				0	if ($$char_class[0][0] == -2) {
480	0	0				0	if ($$char_class[0][1] == -1) {
481	0					0	push(@items, '^');
482							}
483							else {
484	0					0	push(@items, '^$');
485							}
486							}
487							else {
488	0	0				0	if ($$char_class[0][1] == -2) {
489	0					0	push(@items, '$');
490							}
491							else {
492	0					0	push(@items, '^', '$');
493							}
494							}
495	0					0	$char_class = [@$char_class[1..$#$char_class]];
496							}
497	564	100				990	if (@$char_class) {
498	524	100	100			2140	if (
		100	100
		100	66
499							@$char_class == 1
500							&& $$char_class[0][0] == $$char_class[0][1]
501							) {
502	491					820	my $c = chr($$char_class[0][0]);
503	491	100				696	if ($to_perlre) {
504	115					206	push(@items, quotemeta($c))
505							}
506							else {
507	376	100				1396	push(@items,
508							$c =~ /$ERE_literal/o
509							? $c
510							: "\\$c"
511							);
512							}
513							}
514							elsif (
515							@$char_class == 1
516							&& $$char_class[0][0] == 0
517							&& $$char_class[0][1] == MAX_CHAR
518							) {
519	4					12	push(@items, '.');
520							}
521							elsif ($$char_class[$#$char_class][1] == MAX_CHAR) {
522	8	100				19	if ($to_perlre) {
523	2					6	push(@items,
524							'[^' . _cc_to_perlre(cc_neg($char_class)) . ']'
525							);
526							}
527							else {
528	6					19	push(@items,
529							'[^' . _cc_to_ere(cc_neg($char_class)) . ']'
530							);
531							}
532							}
533							else {
534	21	100				43	if ($to_perlre) {
535	18					37	push(@items, '[' . _cc_to_perlre($char_class) . ']');
536							}
537							else {
538	3					14	push(@items, '[' . _cc_to_ere($char_class) . ']');
539							}
540							}
541							}
542
543	564					568	my $regex;
544	564	100				1392	if (@items == 0) {
		50
545	40					228	return '';
546							}
547							elsif (@items == 1) {
548	524					3039	return $items[0];
549							}
550							else {
551	0	0				0	if ($to_perlre) {
552	0					0	return '(?:' . join('\|', @items) . ')';
553							}
554							else {
555	0					0	return '(' . join('\|', @items) . ')';
556							}
557							}
558							}
559
560							sub _cc_to_ere {
561	9			9		19	my ($char_class) = @_;
562	9					13	my $has_minus;
563							my $has_r_bracket;
564							my $ere = join('',
565							map {
566	9	100				19	if ($$_[0] == $$_[1]) {
	11					37
567	6	50				27	if ($$_[0] == ord('-')) {
		50
568	0					0	$has_minus = 1;
569	0					0	'';
570							}
571							elsif ($$_[0] == ord(']')) {
572	0					0	$has_r_bracket = 1;
573	0					0	'';
574							}
575							else {
576	6					25	chr($$_[0]);
577							}
578							}
579							else {
580	5	50	33			36	if (
581							$$_[0] == ord('-')
582							\|\| $$_[0] == ord(']')
583							) {
584	0	0				0	if ($$_[0] == ord('-')) {
585	0					0	$has_minus = 1;
586							}
587							else {
588	0					0	$has_r_bracket = 1;
589							}
590	0	0				0	if ($$_[1] == $$_[0] + 1) {
		0
591	0					0	chr($$_[1]);
592							}
593							elsif ($$_[1] == $$_[0] + 2) {
594	0					0	chr($$_[0] + 1) . chr($$_[1]);
595							}
596							else {
597	0					0	chr($$_[0] + 1) . '-' . chr($$_[1]);
598							}
599							}
600							else {
601	5	100				20	if ($$_[1] == $$_[0] + 1) {
602	4					22	chr($$_[0]) . chr($$_[1]);
603							}
604							else {
605	1					7	chr($$_[0]) . '-' . chr($$_[1]);
606							}
607							}
608							}
609							}
610							@$char_class
611							);
612	9	50				31	if ($has_minus) { $ere .= '-'; }
	0					0
613	9	50				27	if ($has_r_bracket) { $ere = "]$ere"; }
	0					0
614	9					30	return $ere;
615							}
616
617							sub _cc_to_perlre {
618	20			20		28	my ($char_class) = @_;
619							return join('',
620							map {
621	20	100				38	if ($$_[0] == $$_[1]) {
	49					84
622	44					64	my $c = chr($$_[0]);
623	44	50				320	$c =~ /$PERLRE_char_class_special/o ? "\\$c" : $c;
624							}
625							else {
626	5					18	my ($c1, $c2) = (chr($$_[0]), chr($$_[1]));
627	5	50				77	($c1 =~ /$PERLRE_char_class_special/o ? "\\$c1" : $c1)
		100
		50
628							. ($$_[0] + 1 < $$_[1] ? '-' : '')
629							. ($c2 =~ /$PERLRE_char_class_special/o ? "\\$c2" : $c2)
630							}
631							} @$char_class
632							);
633							}
634
635
636							##############################################################################
637							# $nfa
638							##############################################################################
639
640							=back
641
642							=head2 Nfa
643
644
645							WARNING: C routines are destructive,
646							the C<$nfa> references given as arguments will not be valid C<$nfa> any more.
647							Furthermore, the same C<$nfa> reference must be used only once as argument.
648							For instance, for concatenating a C<$nfa> with itself, C
649							does not work; instead, C must be used;
650							or even C if the original
651							C<$nfa> is to be used further.
652
653							$nfa = [ $state_0, $state_1, ... ]
654
655							$state = [
656							$accepting
657							, $transitions
658							]
659
660							$transitions = [
661							[ $char_class_0 => $state_ind_0 ]
662							, [ $char_class_1 => $state_ind_1 ]
663							, ...
664							]
665
666							In the same C<$transition>, C<$state_ind_i> are pairwise different and are
667							valid indexes of C<@$nfa>. There is exactly one initial state at index 0.
668
669							=over 4
670
671							=item C
672
673							Maps each of the given C<@nfas> to a clone.
674
675							=cut
676
677							sub nfa_clone {
678							return
679	296			296	1	379	map { [
	598					5523
680	342					403	map { [
681							$$_[0] # accepting
682	822					911	, [ map { [ @$_ ] } @{$$_[1]} ] # transitions
	822					1847
683							] }
684							@$_ # states of the $nfa
685							] } @_ # list of $nfas
686							;
687							}
688
689							sub _transitions_is_subset {
690	692			692		1221	my ($transitions_0, $transitions_1, $state_ind_map) = @_;
691	805	100	66			6185	my %state_ind_to_t_1
692	692					1021	= map {(
693							$state_ind_map && exists($$state_ind_map{$$_[1]})
694							? $$state_ind_map{$$_[1]}
695							: $$_[1]
696							=> $_
697							)}
698							@$transitions_1
699							;
700	692					1398	for my $t_0 (@$transitions_0) {
701	210	100	66			994	my $state_ind_0
702							= $state_ind_map && exists($$state_ind_map{$$t_0[1]})
703							? $$state_ind_map{$$t_0[1]}
704							: $$t_0[1]
705							;
706	210	100				422	if (!exists($state_ind_to_t_1{$state_ind_0})) { return 0; }
	184					792
707	26					29	my $t_1 = $state_ind_to_t_1{$state_ind_0};
708	26	100				58	if (!cc_is_subset($$t_0[0], $$t_1[0])) { return 0; }
	16					61
709							}
710	492					1526	return 1;
711							}
712
713							# The keys of %$state_ind_to_equiv are state_inds of @$nfa to be removed.
714							# State indexes in transitions are remapped following %$state_ind_to_equiv.
715							# A state index mapped to itself denotes an unreachable state index.
716							sub _nfa_shrink_equiv {
717	829			829		1069	my ($nfa, $state_ind_to_equiv) = @_;
718	829					926	my $i = 0;
719	3919					9692	my %compact_map
720	5176					11096	= map { ($_ => $i++) }
721							my @active_state_inds
722	829					1617	= grep { !exists($$state_ind_to_equiv{$_}) }
723							(0..$#$nfa)
724							;
725
726	829					1755	my %equiv_index_to_char_classes;
727							my %plain_index_to_char_class;
728	829					3764	for (@$nfa = @$nfa[@active_state_inds]) {
729
730							# update $state_ind
731							# -> $compact_map{$state_ind}
732							# or $compact_map{$$state_ind_to_equiv{$state_ind}}
733	3919					4733	%equiv_index_to_char_classes = ();
734	3919					4980	%plain_index_to_char_class = ();
735	3919					3800	for (@{$$_[1]}) { # transition list
	3919					7214
736	7083	100				13832	if (exists($$state_ind_to_equiv{$$_[1]})) {
737							push(
738	1012					1078	@{$equiv_index_to_char_classes{
739	1012					3712	$$_[1]
740							= $compact_map{$$state_ind_to_equiv{$$_[1]}}
741							}}
742							, $$_[0]
743							);
744							}
745							else {
746							$plain_index_to_char_class{
747	6071					15519	$$_[1]
748							= $compact_map{$$_[1]}
749							} = $$_[0];
750							}
751							}
752							# merge char_classes to the same state index
753	3919	100				479349	if (keys(%equiv_index_to_char_classes)) {
754	799					2667	@{$$_[1]} = ((
	114					240
755	807					2800	map {[
756							exists($equiv_index_to_char_classes{$_})
757							? cc_union(
758							$plain_index_to_char_class{$_}
759	638	100				1795	, @{$equiv_index_to_char_classes{$_}}
760							)
761							: $plain_index_to_char_class{$_}
762							, $_
763							]}
764							keys(%plain_index_to_char_class)
765							) , (
766	921					1991	map {[
767	16					36	@{$equiv_index_to_char_classes{$_}} == 1
768							? $equiv_index_to_char_classes{$_}[0]
769	807	100				779	: cc_union(@{$equiv_index_to_char_classes{$_}})
770							, $_
771							]}
772	799					1501	grep { !exists($plain_index_to_char_class{$_}) }
773							keys(%equiv_index_to_char_classes)
774							))
775							}
776							}
777	829					7330	return $nfa;
778							}
779
780							=item C
781
782							Precondition: C<0 <= $min && ( $max eq '' \|\| $min <= $max)>
783
784							Returns C<$out_nfa>, a C<$nfa> computed from C<$in_nfa>.
785
786							Let L be the language accepted by C<$in_nfa> and M the language accepted
787							by C<$out_nfa>. Then a word m belongs to M if and only if and ordered list
788							(l_1, ..., l_r) of words belonging to L exists such that:
789
790							$min <= r
791							and ($max eq '' or r <= $max)
792							and m is the concatenation of (l_1, ..., l_r)
793
794							Examples with C<$in_nfa> being a C<$nfa> accepting C<'^a$'>:
795
796							nfa_quant($in_nfa, 2, 4 ) accepts '^a{2,4}$'
797							nfa_quant($in_nfa, 0, '') accepts '^a{0,}$' (i.e. '^a*$')
798
799							C<$pref_has_prefix> and C<$next_has_prefix> are hints for dispatching C<$min>,
800							for example:
801
802							'a+' => 'a*a' (!$prev_has_suffix && $next_has_prefix)
803							'a+' => 'aa*' ( $prev_has_suffix && !$next_has_prefix)
804							'a{2,}' => 'aa*a' ( $prev_has_suffix && $next_has_prefix)
805
806							=cut
807
808							sub nfa_quant {
809	268			268	1	402	my ($nfa, $min, $max, $prev_has_suffix, $next_has_prefix) = @_;
810	268					274	my @quant_parts;
811							my $optional_part;
812
813							# dispatch min left and right: a+b => aab, ba+ => baa
814	8			8		225	use integer;
	8					17
	8					41
815	268	100	66			596	my ($min_left, $min_right)
		100	66
		100
816							=
817							# no suffix, no prefix
818							$min == 0 ? (0 , 0 )
819
820							# no suffix, maybe prefix
821							: !($next_has_prefix && _nfa_has_suffix($nfa)) ? ($min , 0 )
822
823							# suffix, no prefix
824							: !($prev_has_suffix && _nfa_has_prefix($nfa)) ? (0 , $min )
825
826							# suffix and prefix
827							: (($min+1)/2, $min/2)
828							;
829
830	268	100				494	if ($min_left > 0) {
831	13					44	push(@quant_parts, nfa_concat(nfa_clone(($nfa) x $min_left)));
832							}
833	268	100	100			818	if (length($max) == 0 \|\| $max > $min) {
834	267	100				619	if ($$nfa[0][0]) {
	491	100				1111
835							# initial state already accepting
836	7					21	($optional_part) = nfa_clone($nfa);
837							}
838							elsif (
839	633					1204	!grep { $$_[1] == 0 }
840	633					537	map { @{$$_[1]} }
841							@$nfa
842							) {
843							# initial state not accepting and unreachable
844	249					487	($optional_part) = nfa_clone($nfa);
845	249					410	$$optional_part[0][0] = 1;
846							}
847							else {
848							# initial state not accepting and reachable
849	22					59	$optional_part = [
850							# additional root initial state accepting state
851							[
852							1 # accepting
853	11					24	, [ map {[$$_[0] , $$_[1]+1]} @{$$nfa[0][1]} ] # transitions
	30					91
854							]
855							# original states with offset 1
856	11					29	, map { [
857							$$_[0] # accepting
858	30					41	, [ map {[ $$_[0], $$_[1]+1 ]} @{$$_[1]} ] # transitions
	30					66
859							] }
860							@$nfa
861							];
862							}
863							}
864	268	100				511	if (length($max) == 0) {
		100
865
866							# starify optional part
867
868	298					1081	my %root_index_to_char_class
869	253					393	= map { ($$_[1] => $$_[0]) }
870	253					273	@{$$optional_part[0][1]}
871							;
872
873	253					401	my $state_ind_to_equiv = {};
874							# loop over accepting state indexes
875	253					450	for (grep { $$optional_part[$_][0] } (1..$#$optional_part)) {
	362					882
876	253	100				911	if (
877							_transitions_is_subset(
878							$$optional_part[$_][1]
879							, $$optional_part[0][1]
880							, { $_ => 0 }
881							)
882							) {
883							# Accepting states whose transitions are
884							# a subset of the transitions of the initial state
885							# are equivalent to the initial state.
886	248					804	$$state_ind_to_equiv{$_} = 0;
887							}
888							else {
889	5	100				10	if (
890	5					18	grep { exists($root_index_to_char_class{$_}) }
	5					12
891	5					12	map { $$_[1] }
892							@{$$optional_part[$_][1]}
893							) {
894							# merge char classes to the same state index
895	2					8	my %new_index_to_char_classes
896	2					5	= map { ($$_[1] => [$$_[0]]) }
897	2					2	@{$$optional_part[$_][1]}
898							;
899	2					5	for (keys(%root_index_to_char_class)) {
900	4					9	push (
901	4					5	@{$new_index_to_char_classes{$_}}
902							, $root_index_to_char_class{$_}
903							);
904							}
905	2					11	@{$$optional_part[$_][1]}
	4					12
906	2					6	= map {[
907	2					4	@{$new_index_to_char_classes{$_}} == 1
908							? $new_index_to_char_classes{$_}[0]
909	4	100				5	: cc_union(@{$new_index_to_char_classes{$_}})
910							, $_
911							]}
912							keys(%new_index_to_char_classes)
913							;
914							}
915							else {
916	3					7	push(
917	5					18	@{$$optional_part[$_][1]}
918	3					7	, map { [@$_] } @{$$optional_part[0][1]}
	3					8
919							);
920							}
921							}
922							}
923	253	100				1068	push(@quant_parts,
924							keys(%$state_ind_to_equiv)
925							? _nfa_shrink_equiv($optional_part, $state_ind_to_equiv)
926							: $optional_part
927							);
928							}
929							elsif ($max > $min) {
930
931							# concatenate optional_part $max - $min times
932
933	14					41	push(@quant_parts, _nfa_concat(1, nfa_clone(
934							($optional_part) x ($max - $min)
935							)));
936							}
937	268	100				585	if ($min_right > 0) {
938	13					42	push(@quant_parts, nfa_concat(nfa_clone(($nfa) x $min_right)));
939							}
940	268	100				1423	return @quant_parts == 1 ? $quant_parts[0] : nfa_concat(@quant_parts);
941							}
942
943							=item C
944
945							Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>.
946
947							Let r be the number of given C<@in_nfas>,
948							L_i the language accepted by C<$in_nfas[$i]> and M the language accepted
949							by C<$out_nfa>. Then a word m belongs to M if and only if an ordered list
950							(l_1, ..., l_r) of words exists, l_i belonging to L_i, such that
951							m is the concatenation of (l_1, ..., l_r).
952
953							=cut
954
955							sub nfa_concat {
956	300			300	1	598	_nfa_concat(0, @_);
957							}
958
959							sub _nfa_concat {
960	314			314		387	my $starifying = shift(@_);
961	314	50				615	if (!@_) {
962	0					0	return [[1, []]]; # neutral element: accepting empty string
963							}
964	314					390	my $concat = shift(@_);
965	314					591	my @accepting_state_inds = grep { $$concat[$_][0] } (0..$#$concat);
	783					1683
966	314					13830	my $state_ind_to_equiv = {};
967							my (
968	314					344	$nfa
969							, $state
970							, $init_state_ind
971							, $init_reachable
972							, $init_equiv_reachable
973							, $init_skipped
974							, @new_accepting_state_inds
975							);
976	314					645	while (@_) {
977	399					462	$nfa = shift(@_);
978	399					444	$init_state_ind = @$concat;
979	399					398	$init_reachable = 0;
980	399					365	$init_equiv_reachable = 0;
981	399					424	$init_skipped = 0;
982							@new_accepting_state_inds
983	464					880	= map { $_ + $init_state_ind }
	1111					1686
984	399					653	grep { $$nfa[$_][0] }
985							(0..$#$nfa)
986							;
987	399					647	for (map { @{$$_[1]} } @$nfa) {
	1111					1062
	1111					1950
988	1058	100	100			2828	($$_[1] += $init_state_ind) == $init_state_ind
989							&& ($init_reachable \|\|= 1);
990							}
991	399					686	for my $acc_ind (@accepting_state_inds) {
992	508					647	$state = $$concat[$acc_ind];
993	508					652	$$state[0] = $$nfa[0][0];
994	508	100	100			560	if (
		100	100
995	508					2305	@{$$state[1]} <= 1
996							&& _transitions_is_subset(
997							$$state[1] # transitions of the old accepting state
998							, $$nfa[0][1] # transitions of the new initial state
999							, { $acc_ind => $init_state_ind }
1000							)
1001	292					1052	) {
1002
1003							# Old accepting states whose transitions are
1004							# a subset of the transitions of the new initial state
1005							# are equivalent to the initial state.
1006							#
1007							# Note that such an old accepting states can have either
1008							# no transition or one self-transition;
1009							# the case that the old accepting state has no transition
1010							# occurs very often.
1011							#
1012							# %$state_ind_to_equiv gets extended by
1013							#
1014							# $acc_ind_ (old accepting state) => $init_state_ind
1015							#
1016							# But the keys and the values of %$state_ind_to_equiv
1017							# MUST remain disjoint (except for pairs key = val).
1018							#
1019							# Since $init_state_index are growing
1020							# and $acc_ind < $init_state_index:
1021							# - the new value does not belong the the keys
1022							# - the new key may belong to the vals,
1023							# such values must be updated.
1024							#
1025							# Example:
1026							# 0 => 1 ( %$state_ind_to_equiv )
1027							# 1 => 2 ( $acc_ind => $init_state_index )
1028							# %$state_ind_to_equiv must be updated to
1029							# 0 => 2
1030							# before being extended by
1031							# 1 => 2
1032	244					577	for (grep { $_ == $acc_ind } values(%$state_ind_to_equiv)) {
	151					322
1033	3					7	$_ = $init_state_ind;
1034							}
1035	244					449	$$state_ind_to_equiv{$acc_ind} = $init_state_ind;
1036	244					586	$init_equiv_reachable = 1;
1037							}
1038							elsif (
1039	264					456	(grep { $$_[1] == $init_state_ind } @{$$nfa[0][1]})
	29					51
1040							&& cc_is_subset(
1041
1042							# char_class of the self-transition
1043							# of the new initial state
1044							(
1045	33					94	map { $$_[0] }
1046	29					46	grep { $$_[1] == $init_state_ind }
1047	22					45	@{$$nfa[0][1]}
1048							)
1049
1050							# char_class of the self-transition
1051							# of the old accepting state
1052							, (
1053	42					94	map { $$_[0] }
1054	29					40	grep { $$_[1] == $acc_ind }
1055							@{$$state[1]}
1056							)
1057							)
1058							) {
1059							# If the self-transitions of the new init state are
1060							# a subset of the transitions of the old accepting state,
1061							# the new state is not needed for looping;
1062							# the transition to the new init state can be skipped.
1063							#
1064							# Example 1:
1065							# [ab]a
1066							# the state for a* is superfluous.
1067							# Example 2:
1068							# ( x[ab]* \| y[ac]* \| z[bc]* ) a* c
1069							# the state for a* is only needed after [bc]*
1070							# the regular expression is equivalent to:
1071							# x[ab]c \| y[ac]c \| z[bc]ac
1072							#
1073							# Note that this one-letter-star optimization is
1074							# probably not very useful for practical purposes;
1075							# more general equivalences like (abc)(abc) ~ (abc)*
1076							# are not caught up, while the focused use cases
1077							# of prefix and suffix recognition need no star at all.
1078							#
1079							# It is merely a toy optimization for solving some exercises
1080							# of an introductory course on regexs.
1081							#
1082	14					27	push(@{$$state[1]},
	2					6
1083	16					88	map { [ @$_ ] }
1084	14					22	grep { $$_[1] != $init_state_ind}
1085	14					37	@{$$nfa[0][1]})
1086							;
1087	14					38	$init_skipped++;
1088							}
1089							else {
1090	250					370	push(@{$$state[1]},
	276					1027
1091	250					551	map { [ @$_ ] }
1092	250					265	@{$$nfa[0][1]})
1093							;
1094							}
1095							}
1096	399	100	100			2146	if (
			100
1097							!$init_reachable && !$init_equiv_reachable
1098							\|\| $init_skipped == @accepting_state_inds
1099							) {
1100							# for being removed by _nfa_shrink_equiv()
1101	145					319	$$state_ind_to_equiv{$init_state_ind} = $init_state_ind;
1102							}
1103
1104	399	100				874	if (!$$nfa[0][0]) {
		100
1105	204					341	@accepting_state_inds = ();
1106							}
1107							elsif ($starifying) {
1108							# $starifying set for optimizing x{n,m}.
1109							# The old accepting states are redundant,
1110							# since reachable iff the newer ones are.
1111	13					33	for (@accepting_state_inds[1..$#accepting_state_inds]) {
1112	15					35	$$concat[$_][0] = 0;
1113							}
1114	13	50				33	if (!$init_reachable) {
1115	13					18	$$nfa[0][0] = 0;
1116	13					18	shift(@new_accepting_state_inds);
1117							}
1118	13					26	@accepting_state_inds = (0);
1119							}
1120							else {
1121							@accepting_state_inds
1122	182					260	= grep { !exists($$state_ind_to_equiv{$_}) }
	199					521
1123							@accepting_state_inds
1124							;
1125							}
1126
1127	399					1308	push(@$concat, @$nfa);
1128	399					990	push(@accepting_state_inds, @new_accepting_state_inds);
1129							}
1130	314	100				615	if (keys(%$state_ind_to_equiv)) {
1131	256					460	return _nfa_shrink_equiv($concat, $state_ind_to_equiv);
1132							}
1133							else {
1134	58					349	return $concat;
1135							}
1136							}
1137
1138							=item C
1139
1140							Returns C<$out_nfa>, a C<$nfa> computed from C<@in_nfas>.
1141
1142							C<$out_nfa> accepts a word w if and only if at least one of C<@in_nfas>
1143							accepts w.
1144
1145							=cut
1146
1147							# Adds the total number of states
1148							sub nfa_union {
1149	130			130	1	296	my $union = [[0, []]]; # root, neutral element: accepting nothing
1150	130					184	my $state_ind_to_equiv = {};
1151	130					133	my $first_trivial_accepting_state_ind;
1152							my (
1153	130					143	$nfa
1154							, $init_state_ind
1155							, $init_reachable
1156							, $orig_state
1157							);
1158
1159	130					231	for $nfa (@_) {
1160
1161							# merge initial $accepting
1162	300		100			958	$$union[0][0] \|\|= $$nfa[0][0];
1163	300	100	100			642	if (@$nfa == 1 && @{$$nfa[0][1]} == 0) {
	68					207
1164	57					86	next;
1165							# Must be skipped because such a trivial state
1166							# would be removed below (!$init_reachable)
1167							# although it may be the $first_trivial_accepting state.
1168							#
1169							# On the other side, a well defined $nfa
1170							# with a single state and with a non-empty transition list
1171							# must loop to itself, thus $init_reachable.
1172							}
1173
1174	243					264	$init_state_ind = @$union;
1175	243					243	$init_reachable = 0;
1176	243					470	for (0..$#$nfa) {
1177	1261					1580	$orig_state = $$nfa[$_];
1178	1261	100	100			2756	if (
1179	266					894	$$orig_state[0] # accepting
1180							&& !@{$$orig_state[1]} # trivial
1181							) {
1182	164	100				249	if (defined($first_trivial_accepting_state_ind)) {
1183	87					288	$$state_ind_to_equiv{$_ + $init_state_ind}
1184							= $first_trivial_accepting_state_ind;
1185							}
1186							else {
1187	77					157	$first_trivial_accepting_state_ind
1188							= $_ + $init_state_ind;
1189							}
1190							}
1191							else {
1192	1097					1094	for ( @{$$orig_state[1]} ) { # transition list
	1097					1822
1193	1311	100	100			3937	($$_[1] += $init_state_ind) == $init_state_ind
1194							&& ($init_reachable \|\|= 1);
1195							}
1196							}
1197							};
1198	243					662	push(@$union, @$nfa);
1199
1200							# merge initial $transitions
1201	243					249	push(@{$$union[0][1]}, map { [ @$_ ] } @{$$nfa[0][1]});
	243					335
	274					693
	243					386
1202	243	100				495	if (!$init_reachable) {
1203							# for being removed by _nfa_shrink_equiv()
1204	214					564	$$state_ind_to_equiv{$init_state_ind} = $init_state_ind;
1205							}
1206							};
1207	130	100				346	if (keys(%$state_ind_to_equiv)) {
1208	120					225	return _nfa_shrink_equiv($union, $state_ind_to_equiv);
1209							}
1210							else {
1211	10					38	return $union;
1212							}
1213							}
1214
1215							{
1216
1217							my %cached_cc_inter2;
1218
1219							=item C
1220
1221							Returns C<$out_nfa>, a $C<$nfa> computed from C<@in_nfas>.
1222
1223							C<$out_nfa> accepts a word w if and only if each of C<@in_nfas> accepts w.
1224
1225							=cut
1226
1227							sub nfa_inter {
1228	30			30	1	775	my ($inter, @nfas) = sort { @$a <=> @$b } @_;
	28					96
1229	30					61	for (@nfas) { $inter = nfa_inter2($inter, $_); }
	25					70
1230							return
1231	30		50			190	$inter
1232							\|\| [[1, [[$cc_any, 0]]]] # neutral element: accepting anything
1233							;
1234							}
1235
1236							# Multiplies the total number of states
1237							sub nfa_inter2 {
1238	25			25	0	46	my ($nfa_0, $nfa_1) = @_;
1239
1240							# computed states
1241	25					47	my @todo = (0);
1242	25					46	my %todo_seen; # set of state_inds
1243							my %done; # key-subset of %todo_seen (values are states)
1244							# After the following while, %done are %todo_seen the same set.
1245
1246							# dead end detection
1247	0					0	my %path_tr;
1248	0					0	my @cur_livings;
1249	0					0	my %livings;
1250
1251							# tmp variables
1252							my (
1253	0					0	$from_state_ind, $to_state_ind
1254							, $nfa_0_accepting, $nfa_0_transitions
1255							, $nfa_1_accepting, $nfa_1_transitions
1256							, $t_0, $t_1
1257							, $char_class
1258							, $accepting
1259							, @keys_path_to_state_ind
1260							);
1261
1262	25					46	my $nfa_1_len = @$nfa_1;
1263
1264	25					66	while (@todo) {
1265	1428					2544	$todo_seen{$from_state_ind} = $from_state_ind = pop(@todo);
1266
1267	1428					3023	($nfa_0_accepting, $nfa_0_transitions)
1268	1428					1581	= @{$$nfa_0[$from_state_ind / $nfa_1_len]}; # i-th state
1269	1428					2297	($nfa_1_accepting, $nfa_1_transitions)
1270	1428					1591	= @{$$nfa_1[$from_state_ind % $nfa_1_len]}; # j-th state
1271
1272	1428					2066	my $new_transitions = [];
1273	1428					1890	for $t_0 (@$nfa_0_transitions) {
1274	2953					3654	for $t_1 (@$nfa_1_transitions) {
1275
1276	5804	100	66			25964	if (
1277							(
1278							$char_class
1279							= $cached_cc_inter2{$$t_0[0]}{$$t_1[0]}
1280							\|\|= &cc_inter2($$t_0[0], $$t_1[0])
1281							) != $cc_none
1282							) {
1283	2936					7542	push (@$new_transitions, [
1284							$char_class
1285							, $to_state_ind = $$t_0[1] * $nfa_1_len + $$t_1[1]
1286							]);
1287	2936	100				6159	if (!exists($todo_seen{$to_state_ind})) {
1288	1403					3669	push(@todo,
1289							$todo_seen{$to_state_ind} = $to_state_ind);
1290							}
1291	2936					8804	$path_tr{$to_state_ind}{$from_state_ind} = undef;
1292							}
1293							}
1294							}
1295	1428	100	100			4130	if ($accepting = $nfa_0_accepting && $nfa_1_accepting) {
1296	26					46	push(@cur_livings, $from_state_ind);
1297							}
1298	1428					4921	$done{$from_state_ind} = [
1299							$accepting
1300							, $new_transitions
1301							];
1302							}
1303
1304							# remove dead ends
1305	25					57	%livings = map { ($_ => $_) } @cur_livings;
	26					124
1306	25					71	while (@cur_livings) {
1307	945					2617	push(@cur_livings,
1308	2072					3625	map { $livings{$_} = $_ }
1309	971					2396	grep { !exists($livings{$_}) }
1310	971					1111	keys(%{$path_tr{pop(@cur_livings)}})
1311							);
1312							}
1313
1314	25	50				86	if (keys(%livings) == 0) {
1315	0					0	return [[0, []]];
1316							}
1317
1318							# compact renumbering
1319	25					35	my @sorted_keys;
1320							my $inter = [@done{
1321	25					251	@sorted_keys = sort { $a <=> $b } keys(%livings)
	4410					4535
1322							}];
1323	25					97	my $i = 0;
1324	25					53	my %compact_map = map { ($_ => $i++) } @sorted_keys;
	971					1467
1325
1326	25					116	for (
1327	971					2426	map {
1328	2380					4571	@{$$_[1]}
1329	971					1404	= grep { exists($compact_map{$$_[1]}) }
1330	971					923	@{$$_[1]}
1331							}
1332							@$inter
1333							) {
1334	2072					2805	$$_[1] = $compact_map{$$_[1]};
1335							}
1336	25					1704	return $inter;
1337							}
1338							}
1339
1340							sub nfa_resolve_anchors {
1341	10			10	0	18	my ($nfa) = @_;
1342
1343							# find state_inds reachable from the root by begin-anchor transitions
1344	10					21	my %begs = (0 => undef);
1345	10					91	my @todo = (0);
1346	10					34	while (defined(my $beg = pop(@todo))) {
1347	13					20	for (
1348	3					6	map { $$_[1] } # state_ind
	23					87
1349	13					26	grep { $$_[0][0][1] == -1 } # begin-anchor
1350							@{$$nfa[$beg][1]}
1351							) {
1352	3	50				8	if (!exists($begs{$_})) {
1353	3					3	$begs{$_} = undef;
1354	3					9	push(@todo, $_);
1355							}
1356							}
1357							}
1358
1359							# find state_inds leading to an accepting state by end-anchor transitions
1360	10					29	my @cur_livings;
1361							my %path_tr;
1362	10					26	for my $from_state_ind (0..$#$nfa) {
1363	38					39	for (@{$$nfa[$from_state_ind][1]}) {
	38					80
1364	46					125	$path_tr{$$_[1]}{$from_state_ind} = $$_[0];
1365							}
1366	38	100				103	if ($$nfa[$from_state_ind][0]) {
1367	10					25	push(@cur_livings, $from_state_ind);
1368							}
1369							}
1370	10					22	my %livings = map {($_ => undef)} @cur_livings;
	10					35
1371	10					36	while (defined(my $end = pop(@cur_livings))) {
1372	10					24	for (
1373	16					75	grep {
1374	10					39	$path_tr{$end}{$_}[0][0] == -3; # end-anchor
1375							}
1376							keys(%{$path_tr{$end}})
1377							) {
1378	0	0				0	if (!exists($livings{$_})) {
1379	0					0	push(@cur_livings, $livings{$_} = undef);
1380	0					0	$$nfa[$_][0] = 1;
1381							}
1382							}
1383							}
1384
1385	10					14	my $accept_empty;
1386	10	100				22	if (!($accept_empty = scalar(grep {$$nfa[$_][0]} keys(%begs)) ? 1 : 0)) {
	13	100				57
1387							# special case for $^ for and the like: empty string matches
1388	9					12	my %begends;
1389	9					24	my @todo = keys(%begs);
1390	9					27	while (defined(my $begend = pop(@todo))) {
1391	23					31	for (
1392	17					39	map { $$_[1] } # state_ind
	33					99
1393	23					46	grep { $$_[0][0][1] < 0 } # anchor
1394							@{$$nfa[$begend][1]}
1395							) {
1396	17	50	66			70	if (!exists($begs{$_}) && !exists($begends{$_})) {
1397	14	100				38	if ($$nfa[$_][0]) {
1398	3					5	$accept_empty = 1;
1399	3					15	@todo = ();
1400	3					14	last;
1401							}
1402	11					22	$begends{$_} = undef;
1403	11					48	push(@todo, $_);
1404							}
1405							}
1406							}
1407							}
1408
1409							# remove anchors
1410	10					28	for my $from_state_ind (
	46					113
1411							grep {
1412	38					55	grep { $$_[0][0][0] < 0 } # anchor
1413	38					39	@{$$nfa[$_][1]} # transitions
1414							}
1415							(0..$#$nfa)
1416							) {
1417	20					29	my $state = $$nfa[$from_state_ind];
1418							$$state[1] = [
1419							map {
1420	31	100				71	if ($$_[0][0][0] >= 0) {
	22	50				45
	20					32
1421	9					27	$_;
1422							}
1423							elsif ( @{$$_[0]} == 1 ) {
1424	22					45	delete($path_tr{$$_[1]}{$from_state_ind});
1425	22					74	();
1426							}
1427							else {
1428	0					0	$path_tr{$$_[1]}{$from_state_ind}
1429							= $$_[0]
1430	0					0	= interval_list_to_cc(@{$$_[0]}[1..$#{$$_[0]}]);
	0					0
1431	0					0	$_;
1432							}
1433							}
1434	20					22	@{$$state[1]} # transitions
1435							];
1436							}
1437
1438							# ensure that the initial state cannot be reached
1439	10	100				15	if (@{$$nfa[0][1]}) {
	10					38
1440							# proper init transitions (clone of the initial state needed)
1441
1442							# replace transitions to the initial state
1443							# with transitions to the cloned initial state
1444	8					12	my $new_state_ind = @$nfa;
1445	8					10	my $clone_reachable;
1446	8					16	for my $transition (
	22					45
1447	33					62	grep { $$_[1] == 0 } # to initial state
1448	33					35	map { @{$$_[1]} } # transitions
1449							@$nfa
1450							) {
1451	8					13	$$transition[1] = $new_state_ind;
1452	8					15	$clone_reachable = 1;
1453							}
1454
1455	8	50				25	if ($clone_reachable) {
1456	8					21	my $new_state = [
1457							$$nfa[0][0]
1458	8					14	, [@{$$nfa[0][1]}]
1459							];
1460	8					15	push(@$nfa, $new_state);
1461	8					19	$path_tr{$new_state_ind} = $path_tr{0};
1462	8					12	for (@{$$nfa[0][1]}) {
	8					18
1463	10					29	$path_tr{$$_[1]}{$new_state_ind} = $$_[0];
1464							}
1465	8	50				83	if ($$nfa[0][0]) {
1466	0					0	$livings{$new_state_ind} = undef;
1467							}
1468							}
1469							}
1470							else {
1471							# no proper init transitions
1472
1473							# drop transitions to the initial state
1474	2					6	for my $state (@$nfa) {
1475	5					7	@{$$state[1]} = grep { $$_[1] != 0 } @{$$state[1]};
	5					13
	2					5
	5					9
1476							}
1477							}
1478	10					18	delete($path_tr{0});
1479
1480							# extend initial state (merge all initial states of %begs)
1481	10	100				26	if (keys(%begs) > 1) {
1482	2					3	my %state_ind_to_char_classes;
1483	2					4	for ( map { @{$$nfa[$_][1]} } keys(%begs) ) {
	5					6
	5					9
1484	6					6	push(@{$state_ind_to_char_classes{$$_[1]}}, $$_[0]);
	6					12
1485							}
1486	2					9	@{$$nfa[0][1]}
	6					12
1487	2					5	= map { [
1488	6					7	$path_tr{$_}{0} = cc_union(@{$state_ind_to_char_classes{$_}})
1489							, int($_)
1490							] }
1491							keys(%state_ind_to_char_classes)
1492							;
1493							}
1494	10	100				34	if ($$nfa[0][0] = $accept_empty) {
1495	4					8	$livings{0} = undef;
1496							}
1497
1498							# remove unreachable states
1499	10					17	my @cur_reachables = (0);
1500	10					21	my %reachables = (0 => 0);
1501	10					28	while (@cur_reachables) {
1502	24					32	my $from_state_ind = shift(@cur_reachables);
1503	24					35	for (
1504	25					57	map { $$_[1] }
	24					56
1505							@{$$nfa[$from_state_ind][1]}
1506							) {
1507	25	100				75	if (!exists($reachables{$_})) {
1508	14					45	push(@cur_reachables, $reachables{$_} = $_);
1509							}
1510							}
1511							}
1512
1513							# remove dead ends
1514	10					27	delete(@livings{grep { !exists($reachables{$_}) } keys(%livings)});
	13					35
1515	10					22	@cur_livings = keys(%livings);
1516	10					28	while (@cur_livings) {
1517	11					13	for (
1518	10					19	grep { exists($reachables{$_}) }
	11					190
1519							keys(%{$path_tr{pop(@cur_livings)}})
1520							) {
1521	8	100				18	if (!exists($livings{$_})) {
1522	5					6	push(@cur_livings, $_);
1523	5					12	$livings{$_} = undef;
1524							}
1525							}
1526							}
1527
1528	10	100				38	if (keys(%livings) == 0) {
		50
1529	4					46	return [[0, []]];
1530							}
1531							elsif (keys(%livings) == @$nfa) {
1532	0					0	return $nfa;
1533							}
1534
1535							# compact renumbering
1536	6					19	my @sorted_keys = sort { $a <=> $b } keys(%livings);
	8					12
1537	6					12	my $i = 0;
1538	6					10	my %compact_map = map { ($_ => $i++) } @sorted_keys;
	11					25
1539
1540							return [
1541	11					28	map {
1542	6					14	@{$$_[1]}
	8					11
1543							= map {
1544	13					26	$$_[1] = $compact_map{$$_[1]};
1545	8					10	$_;
1546							}
1547	11					17	grep { exists($compact_map{$$_[1]}) }
1548	11					15	@{$$_[1]}
1549							;
1550	11					73	$_;
1551							}
1552							@$nfa[@sorted_keys]
1553							];
1554							}
1555
1556							=item C
1557
1558							Returns true if and only if C<$in_nfa> accepts C<$str>.
1559
1560							=cut
1561
1562							sub nfa_match {
1563	19			19	1	3630	my ($nfa, $str) = @_;
1564
1565	19					40	my %state_inds = (0 => 0);
1566	19					57	for my $c ( map { ord($_) } split('', $str) ) {
	119					165
1567	66					290	%state_inds
1568	105					194	= map { $$_[1] => $$_[1] }
1569	74					249	grep { cc_match($$_[0], $c) } # matching transition list
1570	119					193	map { @{$$_[1]} } # all transition list
	74					70
1571							@$nfa[values(%state_inds)] # current states
1572							;
1573							}
1574
1575	19					91	return grep { $$_[0] } @$nfa[values(%state_inds)];
	11					62
1576							}
1577
1578							sub nfa_dump {
1579	0			0	0	0	my ($nfa) = @_;
1580	0					0	my $dump = '';
1581	0					0	for my $i (0..$#$nfa) {
1582	0	0				0	$dump
1583							.= "$i:"
1584							. ($$nfa[$i][0] ? " (accepting)" : "")
1585							. "\n"
1586							;
1587	0					0	for my $transition (@{$$nfa[$i][1]}) {
	0					0
1588	0					0	$dump
1589							.= " "
1590							. cc_to_regex($$transition[0]) . " => $$transition[1]\n";
1591							}
1592							}
1593	0					0	return $dump;
1594							}
1595
1596							=item C
1597
1598							Returns true if and only if the labeled graphs represented by C<$nfa1>
1599							and C<$nfa2> are isomorph. While isomorph C<$nfa>s accept the same language,
1600							the converse is not true.
1601
1602							=cut
1603
1604							sub nfa_isomorph {
1605	98			98	1	1906	my ($nfa1, $nfa2) = @_;
1606
1607	98					230	my %nfa1_nfa2_indexes = (0 => 0);
1608	98					191	my %nfa2_nfa1_indexes = (0 => 0);
1609	98					173	my @nfa1_index_todo = (0);
1610
1611	98					380	while (defined(my $nfa1_index = pop(@nfa1_index_todo))) {
1612
1613	582					848	my $state1 = $$nfa1[$nfa1_index];
1614	582					825	my $state2 = $$nfa2[$nfa1_nfa2_indexes{$nfa1_index}];
1615
1616							# accepting
1617	582	50				1229	if ($$state1[0] != $$state2[0]) {
1618	0					0	return 0;
1619							}
1620
1621							# transitions
1622	582					633	my $transitions1 = [sort { $$a[0] <=> $$b[0] } @{$$state1[1]}];
	2045					3239
	582					1454
1623	582					699	my $transitions2 = [sort { $$a[0] <=> $$b[0] } @{$$state2[1]}];
	2058					2922
	582					1201
1624	582	50				1177	if (@$transitions1 != @$transitions2) {
1625	0					0	return 0;
1626							}
1627	582					1105	for my $i (0..$#$transitions1) {
1628	1661					1520	my ($cc1, $next_index1) = @{$$transitions1[$i]};
	1661					2731
1629	1661					1720	my ($cc2, $next_index2) = @{$$transitions2[$i]};
	1661					2451
1630	1661	50				3963	if ($cc1 ne $cc2) {
1631	0					0	return 0;
1632							}
1633	1661	100				3032	if (exists($nfa1_nfa2_indexes{$next_index1})) {
		50
1634	1177	50				4592	if ($nfa1_nfa2_indexes{$next_index1} != $next_index2) {
1635	0					0	return 0;
1636							}
1637							}
1638							elsif (exists($nfa2_nfa1_indexes{$next_index2})) {
1639							# $nfa2_nfa1_indexes{$next_index2} != $next_index1
1640							# because
1641							# - !exists($nfa1_nfa2_indexes{$next_index1})
1642							# - $nfa1_nfa2_indexes and $nfa2_nfa1_indexes
1643							# are reverse to each other by construction
1644	0					0	return 0;
1645							}
1646							else {
1647	484					835	$nfa1_nfa2_indexes{$next_index1} = $next_index2;
1648	484					666	$nfa2_nfa1_indexes{$next_index2} = $next_index1;
1649	484					1321	push(@nfa1_index_todo, $next_index1);
1650							}
1651							}
1652							}
1653	98					749	return 1;
1654							}
1655
1656
1657							##############################################################################
1658							# $dfa
1659							##############################################################################
1660
1661							# input X:
1662							# Arbitrary list of intervals.
1663							# output Y:
1664							# List of pairwise disjoint intervals spanning the same subset such that
1665							# for any intersections/unions of intervals of X
1666							# an equal union of intervals of Y exists.
1667							# In short, all boundaries of X are preserved.
1668							#
1669							# Motivation:
1670							# nfas use character classes as alphabet (instead of single code points).
1671							# dfa operations needs a common refinement of sets of character classes.
1672							#
1673							# Example:
1674							# interval_cases( [ [0, 5], [2, 8] ] )
1675							# = [ [0, 1], [2, 5], [6, 8] ]
1676							#
1677							# X: \|0 1 2 3 4 5\|
1678							# \|2 3 4 5 6 7 8\|
1679							# Y: \|0 1\|2 3 4 5\|6 7 8\|
1680							#
1681							sub interval_cases {
1682	1521			1521	0	1737	my ($interval_list) = @_;
1683							my @sorted
1684	45242	50				82992	= sort {
1685	1521					3209	$$a[0] <=> $$b[0]
1686							\|\| $$b[1] <=> $$a[1]
1687							}
1688							@$interval_list
1689							;
1690	1521					1575	my %los;
1691							my %his;
1692	1521					1550	my $i = 0;
1693	1521					3255	while ($i < @sorted) {
1694	4753					8650	$los{$sorted[$i][0]} = undef;
1695	4753					6564	$his{$sorted[$i][1]} = undef;
1696	4753					5053	my $j = $i + 1;
1697	4753		100			21717	while (
			100
1698							$j < @sorted
1699							&& $sorted[$j][0] == $sorted[$i][0]
1700							&& $sorted[$j][1] == $sorted[$i][1]
1701							) {
1702							# $sorted[$i] ---------
1703							# $sorted[$j] ---------
1704	2418					11594	$j++;
1705							}
1706	4753		100			18071	while (
			66
1707							$j < @sorted
1708							&& $sorted[$j][0] == $sorted[$i][0]
1709							&& $sorted[$j][1] < $sorted[$i][1]
1710							) {
1711							# $sorted[$i] ---------
1712							# $sorted[$j] -----
1713	1712					2613	$his{$sorted[$j][1]} = undef;
1714	1712					2393	$los{$sorted[$j][1]+1} = undef;
1715	1712					9414	$j++;
1716							}
1717							# $sorted[$j][0] > $sorted[$i][0]
1718	4753		100			16488	while (
1719							$j < @sorted
1720							&& $sorted[$j][1] < $sorted[$i][1]
1721							) {
1722							# $sorted[$i] ---------
1723							# $sorted[$j] -----
1724	3615					5238	$his{$sorted[$j][0]-1} = undef;
1725	3615					4233	$los{$sorted[$j][0]} = undef;
1726	3615					4083	$his{$sorted[$j][1]} = undef;
1727	3615					4606	$los{$sorted[$j][1]+1} = undef;
1728	3615					13411	$j++;
1729							}
1730	4753	100	100			15661	if (
1731							$j < @sorted
1732							&& $sorted[$j][0] <= $sorted[$i][1]
1733							) {
1734							# $sorted[$j][0] > $sorted[$i][0]
1735							# && $sorted[$j][0] <= $sorted[$i][1]
1736							# && $sorted[$j][1] >= $sorted[$i][1]
1737							#
1738							# $sorted[$i] ---------
1739							# $sorted[$j] -----
1740	446					696	$his{$sorted[$j][0]-1} = undef;
1741	446	50				1039	if ($sorted[$i][1] != $sorted[$j][1]) {
1742	0					0	$los{$sorted[$i][1]+1} = undef;
1743							}
1744							}
1745	4753					9420	$i = $j;
1746							}
1747	1521					4179	my @sorted_los = sort( { $a <=> $b } keys(%los));
	8891					10805
1748	1521					3768	my @sorted_his = sort( { $a <=> $b } keys(%his));
	8889					9914
1749	1521					3513	return [ map { [$sorted_los[$_], $sorted_his[$_]] } (0..$#sorted_los) ];
	5439					16082
1750							}
1751
1752							=item C
1753
1754							Compute a deterministic finite automaton from C<$in_nfa>
1755							(powerset construction).
1756
1757							The data structure of a deterministic finite automaton (dfa) is
1758							the same as that of a non-deterministic one, but it is further constrained:
1759							For each state and each unicode character there exist exactly one transition
1760							(i.e. a pair C<($char_class, $state_index)>) matching this character.
1761
1762							Note that the following constraint hold for both a C<$dfa> and a C<$nfa>:
1763							For each pair of state p1 and p2, there exists at most one transition
1764							from p1 to p2 (artefact of this implementation).
1765
1766							=cut
1767
1768							sub nfa_to_dfa {
1769	205			205	1	271	my ($nfa) = @_;
1770	205					334	my $dfa = [];
1771	205	50				487	if (!@$nfa) {
1772	0					0	return [[0, [$cc_any, 0]]];
1773							}
1774	205					251	my $trap_needed = 0;
1775	205					238	my $dfa_size = 0;
1776	205					524	my %dfa_indexes = ("0" => $dfa_size++);
1777	205					451	my @todo = ([0]);
1778	205					439	while (@todo) {
1779	1322					1852	my $nfa_indexes = pop(@todo);
1780	1322					2687	my $dfa_index = $dfa_indexes{join('.', @$nfa_indexes)};
1781	1322					3210	my @nfa_states = @$nfa[@$nfa_indexes];
1782
1783							# accepting
1784	1322	100				1731	$$dfa[$dfa_index][0] = scalar(grep { $$_[0] } @nfa_states) ? 1 : 0;
	2183					5837
1785
1786							# transitions
1787	4972					10025	my $cases = interval_cases([
1788	4972					4306	map { @{$$_[0]} }
	2183					4911
1789	1322					1934	map { @{$$_[1]} }
	2183					1979
1790							@nfa_states
1791							]);
1792	1322					2921	my %dfa_index_to_intervals;
1793	1322					2077	for my $interval (@$cases) {
1794							my @next_nfa_indexes
1795	4322					4539	= sort(keys(%{{
	6211					24575
1796	28238					46220	map { ($$_[1] => undef) }
1797	7101					14978	grep { cc_match($$_[0], $$interval[0]) }
1798	4322					5829	map { @{$$_[1]} }
	7101					6172
1799							@nfa_states
1800							}}))
1801							;
1802	4322					12055	my $next_index_key = join('.', @next_nfa_indexes);
1803	4322	100				9771	if (!exists($dfa_indexes{$next_index_key})) {
1804	1117					1944	$dfa_indexes{$next_index_key} = $dfa_size++;
1805	1117					1588	push(@todo, \@next_nfa_indexes);
1806							}
1807	4322					4150	push(@{$dfa_index_to_intervals{$dfa_indexes{$next_index_key}}},
	4322					14595
1808							$interval
1809							);
1810							}
1811
1812	1322					1669	my @any_ccs;
1813	3821					11035	$$dfa[$dfa_index][1] = [
1814							map {
1815	1322					3388	my $cc = interval_list_to_cc($dfa_index_to_intervals{$_});
1816	3821					5542	push(@any_ccs, $cc);
1817	3821					9343	[$cc, $_ ];
1818							}
1819							sort(keys(%dfa_index_to_intervals))
1820							];
1821	1322	100				3813	if ((my $all_cc = cc_union(@any_ccs)) != $cc_any) {
1822	1177					1358	$trap_needed = 1;
1823	1177					1088	push(@{$$dfa[$dfa_index][1]},
	1177					3189
1824							[ cc_neg($all_cc), -1 ]
1825							);
1826							}
1827							}
1828
1829	205	100				769	if ($trap_needed) {
1830	195					320	for (
1831	4889					7803	grep { $$_[1] == -1 }
	1284					3360
1832	1284					1656	map { @{$$_[1]} }
1833							@$dfa
1834							) {
1835	1177					1586	$$_[1] = $dfa_size;
1836							}
1837	195					826	$$dfa[$dfa_size] = [0, [[$cc_any, $dfa_size]]];
1838							}
1839
1840	205					1206	return $dfa;
1841							}
1842
1843
1844							=item C
1845
1846
1847							Computes a minimal deterministic C<$dfa> from the given C<$in_dfa>
1848							(Hopcroft's algorithm).
1849
1850							Note that the given C<$in_dfa> must be a C<$dfa>, as
1851							returned from C, and not a mere C<$nfa>.
1852
1853							Myhill-Nerode theorem: two minimal dfa accepting
1854							the same language are isomorph (i.e. C returns true).
1855
1856							=cut
1857
1858							sub dfa_to_min_dfa {
1859	205			205	1	294	my ($dfa) = @_;
1860	205					227	my @acceptings;
1861							my @non_acceptings;
1862	0					0	my @intervals;
1863	205					494	for my $index (0..$#$dfa) {
1864	1517	100				2629	if ($$dfa[$index][0]) {
1865	271					362	push(@acceptings, $index);
1866							}
1867							else {
1868	1246					1382	push(@non_acceptings, $index);
1869							}
1870	1517					1497	push(@intervals, map { @{$$_[0]} } @{$$dfa[$index][1]})
	5193					4469
	5193					9133
	1517					2209
1871							}
1872	205					322	my $partition;
1873	205	100				489	if (@non_acceptings) {
1874	199					547	$partition = [\@non_acceptings, \@acceptings];
1875	199					1033	my %todo = (join('.', @non_acceptings) => \@non_acceptings);
1876	199					433	my $cases = interval_cases(\@intervals);
1877	199					804	while (my ($todo_key) = keys(%todo)) {
1878	994					1353	my %indexes = map { ($_ => undef) } @{delete($todo{$todo_key})};
	2325					5319
	994					2482
1879	994					2075	for my $interval (@$cases) {
1880	21324					40839	my %prev_inds = (
1881	269076					291232	map { ($_ => undef) }
1882							grep {
1883	9909					27256	my $i = $_;
1884	914283	100				2253726	grep {
1885	269076					414302	exists($indexes{$$_[1]})
1886							&& cc_match($$_[0], $$interval[0])
1887							}
1888	269076					242458	@{$$dfa[$i][1]}
1889							}
1890							(0..$#$dfa)
1891							);
1892	9909					20366	my $refined_partition;
1893	9909					14003	for my $partition_indexes (@$partition) {
1894	127343					126988	my (@inter, @diff);
1895	127343					167912	for (@$partition_indexes) {
1896	269076	100				404960	if (exists($prev_inds{$_})) {
1897	21324					40617	push(@inter, $_);
1898							}
1899							else {
1900	247752					405475	push(@diff, $_);
1901							}
1902							}
1903	127343	100	100			344680	if (!@inter \|\| !@diff) {
1904	126548					259868	push(@$refined_partition, $partition_indexes);
1905							}
1906							else {
1907	795					1736	push(@$refined_partition, \@inter, \@diff);
1908	795					4505	my $prev_inds_key = join('.', sort(keys(%prev_inds)));
1909	795	50				2627	if ($todo{$prev_inds_key}) {
		100
1910	0					0	delete($todo{$prev_inds_key});
1911	0					0	$todo{join('.', @diff)} = \@diff;
1912	0					0	$todo{join('.', @inter)} = \@inter;
1913							}
1914							elsif (@diff < @inter) {
1915	177					733	$todo{join('.', @diff)} = \@diff;
1916							}
1917							else {
1918	618					2328	$todo{join('.', @inter)} = \@inter;
1919							}
1920							}
1921							}
1922	9909					35783	$partition = $refined_partition;
1923							}
1924							}
1925							}
1926							else {
1927	6					11	$partition = [\@acceptings];
1928							}
1929	205					296	my $state_ind_to_equiv;
1930	205					349	for (grep { @$_ != 1 } @$partition) {
	1199					2055
1931	149					781	@$state_ind_to_equiv{@$_[1..$#$_]} = ($$_[0]) x $#$_;
1932							}
1933	205					622	return _nfa_shrink_equiv($dfa, $state_ind_to_equiv);
1934							}
1935
1936
1937							##############################################################################
1938							# $tree
1939							##############################################################################
1940
1941							=back
1942
1943							=head2 Tree
1944
1945							$tree = [ $star, [ $alt_0, $alt_1, ... ] ]
1946							or $char_class # ref($char_class) eq CHAR_CLASS
1947							or undef # accepting nothing
1948							$alt = [ $tree_0, $tree_1, ... ]
1949
1950							A C<$tree> is a hierarchical data structure used as intermediate form for
1951							regular expression generation routines.
1952
1953							Similar to a parse tree, except that the C<$tree>s described here are not the
1954							direct result of the parsing routines C; indeed, the parsing
1955							routines generate a C<$nfa>, which then can be converted to a C<$tree>.
1956
1957							A string is spanned by C<$tree = [$star, [ $alt_0, $alt_1, ... ] ]> if it is
1958							spanned by one of the C<$alt_i> (if C<$star> is false) of a repetition thereof
1959							(if C<$star> is true).
1960
1961							A string is spanned by C<$alt = [ $tree_0, $tree_1, ...]> if it is the
1962							concatenation of C<@substrings>, each C<$substrings[$i]> being spanned by
1963							C<$$alt[$i]>.
1964
1965							=over 4
1966
1967							=item C
1968
1969							Converts a C<$nfa> to a C<$tree>.
1970							Returns C if the C<$nfa> accepts nothing (not even the empty string).
1971
1972							=cut
1973
1974							sub nfa_to_tree {
1975	130			130	1	185	my ($nfa) = @_;
1976
1977							# Warshall algorithm (Kleene's theorem)
1978							# with preliminary computations:
1979							# - words-paths (unbranched paths) are shrunken
1980							# - unique accepting state is ensured
1981							# - branches (with single parent) are skipped
1982
1983	130					205	my $path = {};
1984	130					194	my $path_tr = {};
1985	130					156	my %accepting_state_inds;
1986
1987							# Initialization of the paths
1988
1989	130					335	for my $i (0..$#$nfa) {
1990	922	100				2150	if ($$nfa[$i][0]) {
1991	152					310	$accepting_state_inds{$i} = $i;
1992							}
1993	922					978	for (@{$$nfa[$i][1]}) {
	922					1728
1994	1361					5655	$$path{$i}{$$_[1]}
1995							= $$path_tr{$$_[1]}{$i}
1996							= $$_[0];
1997							}
1998							}
1999
2000	130					352	if (TRACE_NFA_TO_TREE) {
2001							print STDERR "before word shrink\n";
2002							for my $i (sort {$a <=> $b} (keys(%$path))) {
2003							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2004							print STDERR "$i $j: " . cc_to_regex($$path{$i}{$j}) . "\n";
2005							}}
2006							}
2007
2008	130					288	my @tree_list;
2009							my @state_ind_path;
2010
2011							# word-paths (unbranched paths) are shrunken
2012	130					301	for my $first (0..$#$nfa) {
2013	922	100				2095	if (!exists($$path{$first})) { next; }
	411					531
2014							my @todo
2015	450					762	= sort {
2016	450	50				522	keys(%{$$path_tr{$b}}) <=> keys(%{$$path_tr{$a}})
	450					1462
	955					2330
2017							\|\| $b <=> $a
2018							}
2019	511					1396	grep { $_ != $first }
2020	511					564	keys(%{$$path{$first}})
2021							;
2022	511					677	my %todo_ctrl;
2023	511					602	my $todo_sorted = 1;
2024	511		100			1897	while (
			66
2025							@todo
2026							&& (
2027							!$todo_sorted
2028							\|\| keys(%{$$path_tr{$todo[-1]}}) == 1
2029							)
2030							) {
2031	461					829	$todo_ctrl{my $i = pop(@todo)} = undef;
2032	461	100				494	if (keys(%{$$path_tr{$i}}) != 1) {
	461					1191
2033	165	100	33			1098	if ($i != $first && !$todo_sorted && @todo) {
			66
2034							@todo
2035	84					127	= sort {
2036	84	50				98	keys(%{$$path_tr{$b}}) <=> keys(%{$$path_tr{$a}})
	84					333
	162					485
2037							\|\| $b <=> $a
2038							}
2039	69					92	keys%{{ map { ($_ => undef) } (@todo, $i) }}
	69					102
2040							;
2041	69					178	$todo_sorted = 1;
2042							}
2043	165					654	next;
2044							}
2045	296					373	$todo_sorted = 0;
2046
2047	296					654	my @tree_list = ($$path{$first}{$i});
2048	296					441	my @state_ind_path = ($i);
2049
2050	296		66			377	while (
2051	595					5537	keys(%{$$path{$i}}) == 1
	406					1910
2052							&& (my $j = (keys(%{$$path{$i}}))[0]) != $first
2053							) {
2054	406					657	push(@tree_list, $$path{$i}{$j});
2055	406					552	push(@state_ind_path, $i = $j);
2056	406	100				399	if (keys(%{$$path_tr{$j}}) != 1) {
	406					1392
2057	107					320	last;
2058							}
2059							}
2060
2061	296					331	if (TRACE_NFA_TO_TREE) {
2062							print STDERR "first, state_ind_path: $first, @state_ind_path\n";
2063							}
2064
2065	296	100				1339	if (@state_ind_path > 1) {
2066
2067	137					137	if (TRACE_NFA_TO_TREE) {
2068							print STDERR "delete head $first -> $state_ind_path[0]\n";
2069							}
2070	137					286	delete($$path{$first}{$state_ind_path[0]});
2071	137					375	for (@state_ind_path[0..$#state_ind_path-1]) {
2072	406					1016	delete($$path{$_});
2073	406					822	delete($$path_tr{$_});
2074	406					1103	if (TRACE_NFA_TO_TREE) {
2075							print STDERR "delete path $_ -> *\n";
2076							print STDERR "delete path * <- $_\n";
2077							}
2078							}
2079	137					485	delete($$path_tr{$state_ind_path[-1]}{$state_ind_path[-2]});
2080	137	100				344	if (!exists($todo_ctrl{$state_ind_path[-1]})) {
2081	122					180	$todo_ctrl{$state_ind_path[-1]} = undef;
2082	122					192	push(@todo, $state_ind_path[-1]);
2083							}
2084	137					127	if (TRACE_NFA_TO_TREE) {
2085							print STDERR "delete tail $state_ind_path[-1] <- $state_ind_path[-2]\n";
2086							}
2087
2088
2089							# $first -> $last
2090	137					176	my $last = $state_ind_path[-1];
2091	137	100				926	$$path{$first}{$last}
2092							= $$path_tr{$last}{$first}
2093							= exists($$path{$first}{$last})
2094							? tree_alt(
2095							$$path{$first}{$last}
2096							, tree_concat(@tree_list)
2097							)
2098							: tree_concat(@tree_list)
2099							;
2100
2101	137					215	if (TRACE_NFA_TO_TREE) {
2102							print STDERR
2103							"$first -> $last created (first ->last): "
2104							. join('', map {_tree_to_regex($_)} @tree_list) . "\n";
2105							}
2106
2107	137					326	for (0..$#state_ind_path-1) {
2108
2109							# $first -> accepting
2110	406	100				1716	if ($accepting_state_inds{
2111							my $state_ind = $state_ind_path[$_]
2112							}) {
2113	30	50				135	$$path{$first}{$state_ind}
2114							= $$path_tr{$state_ind}{$first}
2115							= exists($$path{$first}{$state_ind})
2116							? tree_alt(
2117							$$path{$first}{$state_ind}
2118							, tree_concat(@tree_list[0..$_])
2119							)
2120							: tree_concat(@tree_list[0..$_])
2121							;
2122	30					185	if (TRACE_NFA_TO_TREE) {
2123							print STDERR
2124							"$first -> $state_ind created (first -> accepting): "
2125							. join('', map {_tree_to_regex($_)} @tree_list[0..$_]) . "\n";
2126							}
2127							}
2128							}
2129							}
2130							}
2131							}
2132
2133	130					221	if (TRACE_NFA_TO_TREE) {
2134							print STDERR "after word shrink\n";
2135							for my $i (sort {$a <=> $b} (keys(%$path))) {
2136							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2137							print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n";
2138							}}
2139							for my $j (sort {$a <=> $b} (keys(%$path_tr))) {
2140							for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) {
2141							print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n";
2142							}}
2143							}
2144
2145							# unique accepting state is ensured
2146							# (pseudo-unique: the initial state may additionally be accepting)
2147	130					231	my $unique_accepting_state_ind = @$nfa;
2148	130	100	100			394	if (
		100
2149							keys(%accepting_state_inds) == 1
2150							) {
2151	113					328	$unique_accepting_state_ind = (keys(%accepting_state_inds))[0];
2152							}
2153							elsif (
2154							keys(%accepting_state_inds) == 2
2155							&& exists($accepting_state_inds{0})
2156							) {
2157	6					10	$unique_accepting_state_ind
2158	3					8	= (grep {$_} keys(%accepting_state_inds))[0];
2159							}
2160							else {
2161	14					30	$unique_accepting_state_ind = @$nfa;
2162	14					45	for my $to_state_ind (keys(%accepting_state_inds)) {
2163	33					33	for my $from_state_ind (keys(%{$$path_tr{$to_state_ind}})) {
	33					75
2164	58					178	push(
2165	58					49	@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}}
2166							, $$path_tr{$to_state_ind}{$from_state_ind}
2167							);
2168							}
2169							}
2170	14					24	for my $from_state_ind (
	14					51
2171							keys(%{$$path_tr{$unique_accepting_state_ind}})
2172							) {
2173	50					133	$$path_tr{$unique_accepting_state_ind}{$from_state_ind}
2174							= $$path{$from_state_ind}{$unique_accepting_state_ind}
2175							= tree_alt(
2176	50					49	@{$$path_tr{$unique_accepting_state_ind}{$from_state_ind}}
2177							);
2178							}
2179							}
2180
2181	130					171	if (TRACE_NFA_TO_TREE) {
2182							print STDERR "after unique state addition\n";
2183							for my $i (sort {$a <=> $b} (keys(%$path))) {
2184							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2185							print STDERR "$i $j: " . tree_dump($$path{$i}{$j}) . "\n";
2186							}}
2187							for my $j (sort {$a <=> $b} (keys(%$path_tr))) {
2188							for my $i (sort {$a <=> $b} (keys(%{$$path_tr{$j}}))) {
2189							print STDERR "$j <- $i: " . tree_dump($$path_tr{$j}{$i}) . "\n";
2190							}}
2191							}
2192
2193	130					259	for my $reversed (0, 1) {
2194	260	100				930	my ($tmp_path, $tmp_path_tr)
2195							= $reversed
2196							? ($path_tr, $path)
2197							: ($path, $path_tr)
2198							;
2199
2200							# branches (with single parent) are skipped
2201							my @branch_inds
2202	483					836	= $reversed
2203	784					1124	? sort {$a <=> $b} (keys(%$tmp_path))
2204	260	100				1046	: sort {$b <=> $a} (keys(%$tmp_path))
2205							;
2206	260					609	while (@branch_inds) {
2207	1148					1517	my $branch = pop(@branch_inds);
2208	1148	100	100			6377	if (
			100
			100
2209	642					2154	!exists($$tmp_path{$branch})
2210							# root cannot be un-branched
2211							\|\| $branch == 0
2212							# accepting states cannot be un-branched
2213							\|\| $branch == $unique_accepting_state_ind
2214							# single parent (non-root have one or more parents)
2215							\|\| keys(%{$$tmp_path_tr{$branch}}) != 1
2216							) {
2217	879					1994	next;
2218							}
2219
2220	269					270	if (TRACE_NFA_TO_TREE) {
2221							print STDERR "branch at $branch\n";
2222							}
2223	269					277	my ($parent) = keys(%{$$tmp_path_tr{$branch}}); # single parent
	269					588
2224	269	100	66			1010	if (
			66
2225							ref($$tmp_path{$parent}{$branch}) ne CHAR_CLASS
2226							&& (
2227							# starified parent
2228							$$tmp_path{$parent}{$branch}[0]
2229							# parent containing several paths
2230							\|\| @{$$tmp_path{$parent}{$branch}[1]} > 1
2231							)
2232							) {
2233	35					90	next;
2234							}
2235
2236	234					262	my (@children) = keys(%{$$tmp_path{$branch}});
	234					723
2237
2238	234					404	for my $child (@children) {
2239	533	100				2320	$$tmp_path{$parent}{$child}
		100
		100
2240							= $$tmp_path_tr{$child}{$parent}
2241							= exists($$tmp_path{$parent}{$child})
2242							? tree_alt(
2243							$$tmp_path{$parent}{$child}
2244							, tree_concat2(
2245							$reversed
2246							? (
2247							$$tmp_path{$branch}{$child}
2248							, $$tmp_path{$parent}{$branch}
2249							)
2250							: (
2251							$$tmp_path{$parent}{$branch}
2252							, $$tmp_path{$branch}{$child}
2253							)
2254							)
2255							)
2256							: tree_concat2(
2257							$reversed
2258							? (
2259							$$tmp_path{$branch}{$child}
2260							, $$tmp_path{$parent}{$branch}
2261							)
2262							: (
2263							$$tmp_path{$parent}{$branch}
2264							, $$tmp_path{$branch}{$child}
2265							)
2266							)
2267							;
2268	533					1248	delete($$tmp_path_tr{$child}{$branch});
2269
2270	533					790	if (TRACE_NFA_TO_TREE) {
2271							print STDERR
2272							"parent -> branch: "
2273							. tree_dump($$tmp_path{$parent}{$branch}) . "\n";
2274							print STDERR
2275							"branch -> child : "
2276							. tree_dump($$tmp_path{$branch}{$child}) . "\n";
2277							print STDERR
2278							"$parent -> $child created (un-branch): "
2279							. tree_dump($$tmp_path{$parent}{$child})
2280							. ($reversed ? " (reversed)" : "" ) . "\n";
2281							print STDERR
2282							"delete $child <- $branch\n";
2283							}
2284
2285							}
2286	234					531	delete($$tmp_path{$parent}{$branch});
2287	234					664	delete($$tmp_path{$branch});
2288	234					388	delete($$tmp_path_tr{$branch});
2289
2290	234					245	if (TRACE_NFA_TO_TREE) {
2291							print STDERR "delete $parent -> $branch\n";
2292							print STDERR "delete $branch -> *\n";
2293							print STDERR "delete $branch <- *\n";
2294							}
2295
2296	234					758	push(@branch_inds, $parent);
2297							}
2298
2299	260					594	if (TRACE_NFA_TO_TREE) {
2300							print STDERR "after branch skip\n";
2301							for my $i (sort {$a <=> $b} (keys(%$tmp_path))) {
2302							for my $j (sort {$a <=> $b} (keys(%{$$tmp_path{$i}}))) {
2303							if ($reversed) {
2304							print STDERR "$j $i: " . tree_dump($$tmp_path{$i}{$j}) . "\n";
2305							}
2306							else {
2307							print STDERR "$i $j: " . tree_dump($$tmp_path{$i}{$j}) . "\n";
2308							}
2309							}}
2310							for my $j (sort {$a <=> $b} (keys(%$tmp_path_tr))) {
2311							for my $i (sort {$a <=> $b} (keys(%{$$tmp_path_tr{$j}}))) {
2312							print STDERR
2313							($reversed ? "$i <- $j: " : "$j <- $i:")
2314							. tree_dump($$tmp_path_tr{$j}{$i}) . "\n";
2315							}}
2316							}
2317
2318							}
2319
2320
2321							# starify diagonal
2322	130					340	for (grep { exists($$path{$_}{$_}) } keys(%$path)) {
	299					785
2323	127					1891	$$path{$_}{$_}
2324							= $$path_tr{$_}{$_}
2325							= tree_starify($$path{$_}{$_});
2326							}
2327
2328	130					340	if (TRACE_NFA_TO_TREE) {
2329							print STDERR "after diagonal starification\n";
2330							for my $i (sort {$a <=> $b} (keys(%$path))) {
2331							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2332							print STDERR "$i $j: ";
2333							print STDERR tree_dump($$path{$i}{$j}) . "\n";
2334							}}
2335							}
2336
2337							# Warshall algorithm (Kleene's theorem)
2338	130					168	my %updates;
2339	299					532	my %weight = map {
2340	130					329	my $w = 0;
2341	299					368	for (values(%{$$path{$_}})) { $w += _tree_weight($_) }
	299					857
	473					1049
2342	299					801	($_ => $w);
2343							} keys(%$path);
2344	130	50				437	my @ks = sort { $weight{$a} <=> $weight{$b} \|\| $a <=> $b } keys(%$path);
	296					733
2345							# note that keys(%$path_tr) are not additionally needed
2346							# case i == k && k == j: nothing to do
2347							# case i != k && k != j: $$path{$k}{$j} must exist
2348							# case i == k && k != j: $$path{$k}{$k} must exist
2349							# case i != k && k == j: $$path{$k}{$k} must exist
2350	130					259	for my $k (@ks) {
2351	299					325	for my $i (keys(%{$$path_tr{$k}})) { # i -> k
	299					821
2352	480					482	for my $j (keys(%{$$path{$k}})) { # k -> j
	480					1142
2353	1335	100	100			3474	if ($i == $k && $k == $j) { next; }
	138					446
2354	1197					1071	my @trees;
2355	1197	100	100			5080	if (
			66
2356							exists($$path{$i}{$j})
2357							&& ($i != $k && $k != $j)
2358							) {
2359	533					806	push(@trees, $$path{$i}{$j});
2360							}
2361	1197	100				4512	my $new_tree
		100
		100
2362							= exists($$path{$k}{$k})
2363							? tree_concat(
2364							(
2365							$i != $k
2366							? $$path{$i}{$k}
2367							: ()
2368							)
2369							, $$path{$k}{$k}
2370							, (
2371							$k != $j
2372							? $$path{$k}{$j}
2373							: ()
2374							)
2375							)
2376							: tree_concat2($$path{$i}{$k}, $$path{$k}{$j})
2377							;
2378	1197	100				2185	push(@trees, $i == $j ? tree_starify($new_tree) : $new_tree);
2379
2380	1197	100				1864	if (@trees == 1) {
2381	664					1941	$updates{$i}{$j} = $trees[0];
2382							}
2383							else {
2384	533					797	$updates{$i}{$j} = tree_alt(@trees);
2385							}
2386							}
2387							}
2388	299					658	for my $i (keys(%updates)) {
2389	407					396	for my $j (keys(%{$updates{$i}})) {
	407					929
2390	1197					3040	$$path{$i}{$j} = $$path_tr{$j}{$i} = $updates{$i}{$j};
2391							}
2392							}
2393
2394	299					372	if (TRACE_NFA_TO_TREE) {
2395							my $num_of_updates = map {keys(%{$updates{$_}})} keys(%updates);
2396							print STDERR "k = $k ($num_of_updates updates)\n";
2397							if ($num_of_updates) {
2398							for my $i (sort {$a <=> $b} (keys(%$path))) {
2399							for my $j (sort {$a <=> $b} (keys(%{$$path{$i}}))) {
2400							print STDERR "$i $j: ";
2401							print STDERR tree_dump($$path{$i}{$j}) . "\n";
2402							}}
2403							}
2404							}
2405
2406	299					846	%updates = ();
2407							}
2408
2409	130					211	my $tree;
2410
2411							# accepting empty init
2412	130	100				313	if ($$nfa[0][0]) {
2413
2414	52	100				226	my $path_0_0 = exists($$path{0}{0}) ? $$path{0}{0} : $cc_none;
2415
2416	52	100				96	if ($unique_accepting_state_ind == 0) {
2417	47					80	$tree = $path_0_0;
2418							}
2419							else {
2420	5					9	my $path_0_end = $$path{0}{$unique_accepting_state_ind};
2421
2422	5	50	100			36	if (
			66
2423							$path_0_0 == $cc_none
2424							&& ref($path_0_end) ne CHAR_CLASS
2425							&& $$path_0_end[0]
2426							) {
2427							# starified expression e* does not need (\|e*)
2428	0					0	$tree = $path_0_end;
2429							}
2430							else {
2431							# non-starified expression e needs (\|e)
2432	5					13	$tree = tree_alt($path_0_0, $path_0_end);
2433							}
2434							}
2435							}
2436							else {
2437	78					195	$tree = $$path{0}{$unique_accepting_state_ind};
2438							}
2439
2440	130					160	if (TRACE_NFA_TO_TREE) {
2441							print STDERR "tree: " . tree_dump($tree) . "\n";
2442							}
2443
2444	130					334	_tree_factorize_fixes($tree);
2445
2446	130					226	if (TRACE_NFA_TO_TREE) {
2447							print STDERR "tree (after factorization): " . tree_dump($tree) . "\n";
2448							}
2449	130					1517	return $tree;
2450							}
2451
2452
2453							# Recursively (bottom up) factorizes prefixes and suffixes out from
2454							# alternations if at least one of them contains a sub-tree.
2455							#
2456							# Example 1: (ab1cd\|ab2cd\|ab3cd) -> ab(1\|2\|3)cd
2457							# Example 2: (ab1cd\|ab2cd\|ab3cd) remains the same (no sub-tree)
2458							#
2459							# Example 2 does not need to be factorized
2460							# because it can be represented by a drop-down list,
2461							# which is the primary purpose of this module;
2462							# in this case, a factorization may lead to counter-intuitive results,
2463							# like words cut in the middle.
2464							#
2465							# But example 1 (less common) could only be represented as mere free-text
2466							# if the common pre- and suf-fixes were not factorized out,
2467							# thus loosing information for the input helper (xxx_to_input_constraints).
2468							#
2469							# This behavior can be changed by setting our $FULL_FACTORIZE_FIXES = 1;
2470							# in this case, Example 2 would produce ab(1\|2\|3)cd.
2471							#
2472							# Modifies $tree in place
2473							#
2474							sub _tree_factorize_fixes {
2475	1198			1198		1286	my ($tree) = @_;
2476	1198	100	100			5218	if (
			66
			100
			66
			66
2477	627					2849	!defined($tree)
2478							\|\| ref($tree) eq CHAR_CLASS
2479							\|\| @{$$tree[1]} == 0
2480							\|\| !$FULL_FACTORIZE_FIXES
2481							&& (
2482							@{$$tree[1]} == 1
2483							\|\| !grep { ref($_) ne CHAR_CLASS } map { @$_ } @{$$tree[1]}
2484							)
2485							) {
2486	953					1868	return $tree;
2487							}
2488							else {
2489	245					292	for (grep { grep { ref($_) ne CHAR_CLASS } @$_ } @{$$tree[1]} ) {
	564					650
	1503					2579
	245					338
2490	1068					1669	my $tmp_tree =
2491	320					637	tree_concat(map { _tree_factorize_fixes($_) } @$_)
2492							;
2493	320	100	66			1417	if (
			66
2494	319					843	ref($tmp_tree) eq CHAR_CLASS
2495							\|\| $$tmp_tree[0]
2496							\|\| @{$$tmp_tree[1]} > 1
2497							) {
2498	1					3	$_ = [$tmp_tree];
2499							}
2500							else {
2501	319					960	$_ = $$tmp_tree[1][0];
2502							}
2503							}
2504
2505							# flatten
2506	245					641	@{$$tree[1]} = map {
2507	245					351	[map {
2508	1598					2283	ref($_) ne CHAR_CLASS
2509	1					4	&& !$$_[0] && @{$$_[1]} == 1
2510							# non-starified with single alternation
2511	1598	100	100			5965	? @{$$_[1][0]}
2512							: $_
2513	564					709	} grep { defined($_) } @$_]
2514	245					289	} @{$$tree[1]};
2515
2516	245	100				280	if (@{$$tree[1]} == 1) {
	245					539
2517	4					10	return $tree;
2518							}
2519
2520	241					229	my $fst_len = @{$$tree[1][0]};
	241					347
2521	241					305	my ($pre_len, $suf_len) = (0, 0);
2522	241					326	for (1, 0) {
2523	392					660	my ($len_ref, @range)
2524							= $_
2525							? (\$pre_len, (0..$fst_len-1))
2526	482	100				1207	: (\$suf_len, map {-$_} (1..$fst_len-$pre_len))
2527							;
2528	482					739	for my $i (@range) {
2529	532	100				541	if (
2530	713	100	100			4723	grep {
2531	532					697	$i >= @$_
2532							\|\| ref($$_[$i]) ne CHAR_CLASS
2533							\|\| $$tree[1][0][$i] != $$_[$i]
2534							}
2535	532					864	@{$$tree[1]}[1..$#{$$tree[1]}]
2536							) {
2537	415					895	last;
2538							}
2539	117					256	$$len_ref++;
2540							}
2541							}
2542	241	100	100			1022	if ($pre_len == 0 && $suf_len == 0) {
2543	172					428	return $tree;
2544							}
2545
2546	69					89	my $empty_seen = 0;
2547							my $mid_tree = [
2548							0
2549							, [
2550							map {
2551	157	100				350	if ($pre_len <= $#$_ - $suf_len) {
	69	50				126
2552	104					350	[ @$_[$pre_len..$#$_-$suf_len] ];
2553							}
2554							elsif (!$empty_seen++) {
2555	53					96	[];
2556							}
2557							else {
2558	0					0	();
2559							}
2560							}
2561	69					82	@{$$tree[1]}
2562							]
2563							];
2564	69					117	$$tree[1] = [[
2565	69					172	@{$$tree[1][0]}[0..$pre_len-1]
2566	69					157	, $empty_seen == @{$$tree[1]} ? () : $mid_tree
2567	69	50				135	, @{$$tree[1][0]}[$fst_len-$suf_len..$fst_len-1]
2568							]];
2569	69					234	return $tree;
2570							}
2571							}
2572
2573							=item C
2574
2575							Converts a C<$tree> to an C<$ere> (if C<$to_perlre> is false)
2576							or to a C<$perlre> (if C<$to_perlre> is true).
2577
2578							=cut
2579
2580							sub tree_to_regex {
2581	118	100		118	1	418	my $re = defined($_[0]) ? &_tree_to_regex : '$.';
2582	118	100				2327	return $_[1] ? qr/\A$re\z/ms : "^$re\$";
2583							}
2584
2585							{
2586							my %cc_to_regex_cache;
2587
2588							sub _tree_to_regex {
2589	929			929		1296	my ($tree, $to_perlre) = (@_, 0);
2590	929	100	100			1477	if (ref($tree) eq CHAR_CLASS) {
	905	50				1625
		100
2591							return
2592	24		100			145	$cc_to_regex_cache{$tree.$to_perlre}
2593							\|\|= cc_to_regex($tree, $to_perlre)
2594							;
2595							}
2596	905					2139	elsif (@{$$tree[1]} == 0) {
2597	0					0	return '';
2598							}
2599							elsif (
2600	486					1426	@{$$tree[1]} == 1 # single alteration
2601							&& @{$$tree[1][0]} == 1 # single atom
2602							) {
2603	311					406	my $atom = $$tree[1][0][0];
2604	311	50				604	if (ref($atom) eq CHAR_CLASS) {
2605	311	100	100			1899	return join('',
2606							$cc_to_regex_cache{$atom.$to_perlre}
2607							\|\|= cc_to_regex($atom, $to_perlre)
2608							, $$tree[0] ? '*' : ()
2609							);
2610							}
2611							else {
2612	0		0			0	return _tree_to_regex(
2613							[$$tree[0] \|\| $$atom[0], $$atom[1]]
2614							, $to_perlre
2615							);
2616							}
2617							}
2618							else {
2619							my $needs_parenthesis
2620							= @{$$tree[1]} > 1 # (a\|...)
2621	594		66			654	\|\| $$tree[0] && @{$$tree[1][0]} > 1 # (ab...)*
2622							;
2623
2624	2878	100	100			12751	return join(''
2625							, ($needs_parenthesis ? ($to_perlre ? '(?:' : '(') : ())
2626							, (
2627							join('\|',
2628							map {
2629	594					795	join('',
2630							map {
2631	1127					1728	ref($_) eq CHAR_CLASS
2632							? $cc_to_regex_cache{$_.$to_perlre}
2633							\|\|= cc_to_regex($_, $to_perlre)
2634							: _tree_to_regex($_, $to_perlre)
2635							}
2636							@$_ # alternation
2637							)
2638							}
2639	594	100				1216	@{$$tree[1]}
		100
		100
		100
2640							)
2641							)
2642							, ($needs_parenthesis ? ')' : ())
2643							, ($$tree[0] ? '*' : ())
2644							);
2645							}
2646							}
2647							}
2648
2649							# starification (regex)*
2650							sub tree_starify {
2651	284			284	0	375	my ($tree) = @_;
2652	284	100				739	if (ref($tree) eq CHAR_CLASS) {
2653	95					424	return [1, [[$tree]]];
2654							}
2655							else {
2656	189					570	return [1, $$tree[1]];
2657							}
2658							}
2659
2660							# The behavior of tree_concat2 can be altered
2661							# by setting $TREE_CONCAT_FULL_EXPAND = 1;
2662							sub tree_concat2 {
2663	2995			2995	0	17838	my ($tree_0, $tree_1) = @_;
2664	2995					2602	my $concat;
2665
2666							# main criteria:
2667							# CHAR_CLASS
2668							# @{$$tree_n[1]} == 0
2669							# $$tree_n[0]
2670							# @{$$tree_n[1]} == 1
2671
2672	2995	100				5038	if (ref($tree_0) eq CHAR_CLASS) {
	2155	100				5450
		100
		100
2673	840	100				2087	if (@$tree_0 == 0) {
		100
		100
		100
2674	5	100	100			31	if (
2675	3					13	ref($tree_1) ne CHAR_CLASS
2676							&& @{$$tree_1[1]} == 0
2677							) {
2678							# () -> empty
2679	1					2	$concat = $cc_none;
2680							}
2681							else {
2682							# ->
2683	4					5	$concat = $tree_1;
2684							}
2685							}
2686	514					1372	elsif (ref($tree_1) eq CHAR_CLASS) {
2687	321	100				550	if (@$tree_1 == 0) {
2688							# a -> a
2689	1					2	$concat = $tree_0;
2690							}
2691							else {
2692							# a b -> (ab)
2693	320					887	$concat = [0, [[ $tree_0, $tree_1 ]]];
2694							}
2695							}
2696							elsif (@{$$tree_1[1]} == 0) {
2697							# a () -> a
2698	1					2	$concat = $tree_0;
2699							}
2700							elsif ($$tree_1[0]) {
2701							# a (b)* -> (a(b)*)
2702	373					982	$concat = [0, [[ $tree_0, $tree_1 ]]];
2703							}
2704							else {
2705	140	100	100			325	if (
2706	534	100				1549	$FULL_FACTORIZE_FIXES
2707	246					436	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2708	138					198	map {@$_} @{$$tree_1[1]}
2709							) {
2710							# a (bc\|de) -> (a(bc\|de))
2711							# one of bcde is starified
2712	42					104	$concat = [0, [[ $tree_0, $tree_1 ]]];
2713							}
2714							else {
2715							# a (bc\|de) -> (abc\|ade)
2716							# none of bcde is starified
2717	171					524	$concat = [
2718							0
2719	98					115	, [ map { [ $tree_0, @$_ ] } @{$$tree_1[1]} ]
	98					155
2720							];
2721							}
2722							}
2723							}
2724							elsif (@{$$tree_0[1]} == 0) {
2725	5	100	100			22	if (
2726	3					13	ref($tree_1) ne CHAR_CLASS
2727							&& @{$$tree_1[1]} == 0
2728							) {
2729							# () () -> empty
2730	1					2	$concat = $cc_none;
2731							}
2732							else {
2733							# () ->
2734	4					8	$concat = $tree_1;
2735							}
2736							}
2737	1884					3019	elsif ($$tree_0[0]) {
2738	266	100				564	if (ref($tree_1) eq CHAR_CLASS) {
	176	100				437
		100
		100
2739	90	100				147	if (@$tree_1 == 0) {
2740							# (a)* -> (a)*
2741	1					3	$concat = $tree_0;
2742							}
2743							else {
2744							# (a)* b -> ((a)*b)
2745	89					288	$concat = [0, [[ $tree_0, $tree_1 ]]];
2746							}
2747							}
2748							elsif (@{$$tree_1[1]} == 0) {
2749							# (a)* () -> (a)*
2750	1					1	$concat = $tree_0;
2751							}
2752	174					362	elsif ($$tree_1[0]) {
2753							# (a)* (b)* -> ((a)(b))
2754	1					4	$concat = [0, [[ $tree_0, $tree_1 ]]];
2755							}
2756							elsif (@{$$tree_1[1]} == 1) {
2757							# (a)* (bcd) -> ((a)*bcd)
2758	100					332	$concat = [
2759							0
2760	100					194	, [[ $tree_0, @{$$tree_1[1][0]} ]]
2761							];
2762							}
2763							else {
2764							# (a)* (b\|c) -> ((a)*(b\|c))
2765	74					196	$concat = [0, [[ $tree_0, $tree_1 ]]];
2766							}
2767							}
2768							elsif (@{$$tree_0[1]} == 1) {
2769	1472	100				2352	if (ref($tree_1) eq CHAR_CLASS) {
	1074	100				2335
		100
		100
		100
2770	398	100				659	if (@$tree_1 == 0) {
2771							# (ab) -> (ab)
2772	1					3	$concat = $tree_0;
2773							}
2774							else {
2775							# (ab) c -> (abc)
2776	397					1247	$concat = [
2777							0
2778	397					511	, [[ @{$$tree_0[1][0]}, $tree_1 ]]
2779							];
2780							}
2781							}
2782							elsif (@{$$tree_1[1]} == 0) {
2783							# (ab) () -> (ab)
2784	1					2	$concat = $tree_0;
2785							}
2786	657					1099	elsif ($$tree_1[0]) {
2787							# (ab) (c)* -> (ab(c)*)
2788	416					424	$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]];
	416					1207
2789							}
2790	1090					2014	elsif (@{$$tree_1[1]} == 1) {
2791							# (ab) (cd) -> (abcd)
2792	298					378	$concat = [
2793							0
2794	298					321	, [[ @{$$tree_0[1][0]}, @{$$tree_1[1][0]} ]]
	298					1105
2795							];
2796							}
2797							elsif (
2798	359					573	!grep { ref($_) ne CHAR_CLASS } @{$$tree_0[1][0]}
2799							) {
2800	15	100	66			49	if (
2801	214	100				569	$FULL_FACTORIZE_FIXES
2802	46					82	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2803	15					28	map {@$_} @{$$tree_1[1]}
2804							) {
2805							# (ab) (cd\|ef) -> (ab(cd\|ef))
2806							# neither a nor b is a tree
2807							# one of cdef is starified
2808	5					6	$concat = [0, [[@{$$tree_0[1][0]}, $tree_1]]];
	5					17
2809							}
2810							else {
2811							# (ab) (cd\|ef) -> (abcd\|abef)
2812							# neither a nor b is a tree
2813							# none of cdef is starified
2814	32					111	$concat = [
2815							0
2816	10					20	, [ map { [ @{$$tree_0[1][0]}, @$_ ] } @{$$tree_1[1]} ]
	32					39
	10					19
2817							];
2818							}
2819							}
2820							else {
2821							# (ab) (cd\|ef) -> (ab(cd\|ef))
2822							# a or b is a tree
2823	344					356	$concat = [0, [[@{$$tree_0[1][0]} , $tree_1 ]]];
	344					1131
2824							}
2825							}
2826							else {
2827	412	100				703	if (ref($tree_1) eq CHAR_CLASS) {
	348	50				794
		100
		100
		50
2828	64	50				136	if (@$tree_1 == 0) {
2829							# (ab\|cd) -> (ab\|cd)
2830	0					0	$concat = $tree_0;
2831							}
2832							else {
2833	64	100	100			179	if (
2834	366	100				1154	$FULL_FACTORIZE_FIXES
2835	152					430	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2836	57					112	map {@$_} @{$$tree_0[1]}
2837							) {
2838							# (ab\|cd) e -> ((ab\|cd)e)
2839							# one of abcd is starified
2840	12					42	$concat = [0, [[ $tree_0, $tree_1 ]]];
2841							}
2842							else {
2843							# (ab\|cd) e -> (abe\|cde)
2844							# none of abcd is starified
2845	142					402	$concat = [
2846							0
2847	52					73	, [ map { [@$_, $tree_1] } @{$$tree_0[1]} ]
	52					82
2848							];
2849							}
2850							}
2851							}
2852							elsif (@{$$tree_1[1]} == 0) {
2853							# (ab\|cd) () -> (ab\|cd)
2854	0					0	$concat = $tree_0;
2855							}
2856	81					262	elsif ($$tree_1[0]) {
2857							# (ab\|cd) (e)* -> ((ab\|cd)(e)*)
2858	267					619	$concat = [0, [[ $tree_0, $tree_1 ]]];
2859							}
2860							elsif (
2861							@{$$tree_1[1]} == 1
2862							) {
2863	13	100				27	if (!grep { ref($_) ne CHAR_CLASS } @{$$tree_1[1][0]}) {
	50					111
	13					32
2864	11	100	66			50	if (
2865	145	100				404	$FULL_FACTORIZE_FIXES
2866	30					57	\|\| grep { ref($_) ne CHAR_CLASS && $$_[0] }
2867	9					19	map {@$_} @{$$tree_0[1]}
2868							) {
2869							# (ab\|cd) (ef) -> ((ab\|cd)ef)
2870							# e and f both CHAR_CLASS
2871							# one of abcd is starified
2872	2					3	$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]];
	2					8
2873							}
2874							else {
2875							# (ab\|cd) (ef) -> (acef\|cdef)
2876							# e and f both CHAR_CLASS
2877							# none of abcd is starified
2878	30					254	$concat = [
2879							0
2880	9					16	, [ map { [@$_, @{$$tree_1[1][0]}] } @{$$tree_0[1]} ]
	30					48
	9					17
2881							];
2882							}
2883							}
2884							else {
2885							# (ab\|cd) (ef) -> ((ab\|cd)ef)
2886							# e or f is a tree
2887	2					4	$concat = [0, [[$tree_0, @{$$tree_1[1][0]}]]];
	2					17
2888							}
2889							}
2890							elsif ($TREE_CONCAT_FULL_EXPAND) {
2891							# (ab\|cd) (ef\|gh) -> (abef\|abgh\|cdef\|cdgh)
2892	0					0	$concat = [
2893							0
2894							, [
2895							map {
2896	0					0	my $alt_0 = $_;
2897	0					0	map { [@$alt_0, @$_] }
	0					0
2898	0					0	@{$$tree_1[1]}
2899							}
2900	0					0	@{$$tree_0[1]}
2901							]
2902							];
2903							}
2904							else {
2905							# (ab\|cd) (ef\|gh) -> ((ab\|cd)(ef\|gh))
2906	68					227	$concat = [0, [[ $tree_0, $tree_1 ]]];
2907							}
2908							}
2909	2995					7746	return $concat;
2910							}
2911
2912							# concatenation regex0regex1...
2913							sub tree_concat {
2914	1476	50		1476	0	7674	if (@_ == 0) {
		100
		50
2915	0					0	return $cc_none; # neutral element: accepting empty string
2916							}
2917	4156					7283	elsif (@_ == 1) {
2918	31					103	return $_[0];
2919							}
2920							elsif (grep {!defined($_)} @_) {
2921	0					0	return undef; # one accepting nothing -> concat accepting nothing
2922							}
2923
2924							# resolve words first
2925	1445					1318	my @word;
2926							my @trees;
2927	1445					2041	for (@_) {
2928	4156	100				6393	if (ref($_) eq CHAR_CLASS) {
2929	1452					2226	push(@word, $_);
2930							}
2931							else {
2932	2704	100				5727	if (@word > 1) {
		100
2933	30					76	push(@trees, [0, [[ @word ]] ] );
2934	30					48	@word = ();
2935							}
2936							elsif (@word) {
2937	438					478	push(@trees, $word[0]);
2938	438					546	@word = ();
2939							}
2940	2704					3940	push(@trees, $_);
2941							}
2942							}
2943	1445	100				3266	if (@word > 1) {
		100
2944	151					437	push(@trees, [0, [[ @word ]] ] );
2945							}
2946							elsif (@word) {
2947	351					417	push(@trees, $word[0]);
2948							}
2949
2950	1445					1630	my $concat = $trees[0];
2951	1445					2490	for my $tree (@trees[1..$#trees]) {
2952	2229					3268	$concat = tree_concat2($concat, $tree);
2953							}
2954
2955	1445					3176	return $concat;
2956							}
2957
2958							# alternation regex0\|regex1\|...
2959							sub tree_alt {
2960	1007			1007	0	976	my @starified_alts;
2961							my @non_starified_alts;
2962	0					0	my $has_empty;
2963
2964	1007					1271	for (grep { defined($_) } @_) {
	1972					3418
2965	1972	100				3388	if (ref($_) eq CHAR_CLASS) {
	1547	50				3555
		100
2966	425					825	push(@non_starified_alts, [$_]);
2967							}
2968							elsif (!@{$$_[1]}) {
2969	0					0	$has_empty = 1;
2970							}
2971							elsif ($$_[0]) {
2972	265					251	push(@starified_alts, @{$$_[1]});
	265					527
2973							}
2974							else {
2975	1282					1098	push(@non_starified_alts, @{$$_[1]});
	1282					2500
2976							}
2977							}
2978
2979	1007	100				1946	if (!@starified_alts) {
		100
2980	874	100	66			2438	if (
		50	66
			66
2981	43					152	@non_starified_alts > 1
2982							\|\| $has_empty
2983							\|\| @non_starified_alts && @{$non_starified_alts[0]} > 1
2984							) {
2985							return [
2986	835	50				4322	0
2987							, [
2988							@non_starified_alts
2989							, ($has_empty ? [[0, []]] : ())
2990							]
2991							];
2992							}
2993							elsif (!@non_starified_alts) {
2994	0					0	return undef; # neutral element: accepting nothing
2995							}
2996							else {
2997	39					184	return $non_starified_alts[0][0];
2998							}
2999
3000							}
3001							elsif (!@non_starified_alts) {
3002	132					701	return [1, \@starified_alts];
3003							}
3004							else {
3005							return [
3006	1					5	0
3007							, [
3008							@non_starified_alts
3009							, [[1, \@starified_alts]]
3010							]
3011							];
3012							}
3013							}
3014
3015
3016							# returns an unanchored $ere having exactly the same structure
3017							# as the given $tree. Intended for tracing/debugging.
3018							sub tree_dump {
3019	131			131	0	316	my ($tree) = @_;
3020	131	50				274	if (!defined($_[0])) {
3021							# nothing accepted (not even the empty string)
3022	0					0	return '$.';
3023							}
3024	131	100				272	if (ref($tree) eq CHAR_CLASS) {
	83	100				187
3025	48					90	return cc_to_regex($tree);
3026							}
3027							elsif (@{$$tree[1]} == 0) {
3028	20					79	return '()';
3029							}
3030							else {
3031	63					82	return join(''
3032							, '('
3033							, (
3034							join('\|',
3035							map {
3036	63					115	my $alt = $_;
3037	132					147	join('',
3038							map {
3039	63					89	my $atom = $_;
3040	132	100				345	if (ref($atom) eq CHAR_CLASS) {
3041	126					223	cc_to_regex($atom);
3042							}
3043							else {
3044	6					17	tree_dump($atom);
3045							}
3046							}
3047							@$alt
3048							)
3049							}
3050	63	100				90	@{$$tree[1]}
3051							)
3052							)
3053							, ')'
3054							, ($$tree[0] ? '*' : ())
3055							);
3056							}
3057							}
3058
3059							# Heuristic weight function for the processing order of the warshall algorithm
3060							sub _tree_weight {
3061	1739			1739		1819	my ($tree) = @_;
3062	1739					2347	my $weight = 0;
3063	1739	100				3529	if (ref($tree) eq CHAR_CLASS) {
		50
3064	1475					2407	for (@$tree) {
3065	1691	100				5758	$weight += ($$_[0] == $$_[1] ? 1 : 2);
3066							}
3067							}
3068							elsif (defined($tree)) {
3069	264					455	for (map { @$_ } @{$$tree[1]}) {
	346					706
	264					590
3070	1266					1914	$weight += _tree_weight($_);
3071							}
3072							}
3073	1739					4002	return $weight;
3074							}
3075
3076
3077							##############################################################################
3078							# $input_constraints
3079							##############################################################################
3080
3081							use constant {
3082	8					37906	FREE_TEXT => 'free text'
3083	8			8		142395	};
	8					22
3084
3085							=back
3086
3087							=head2 Input constraints
3088
3089							$input_constraints = [ $input_constraint_0, $input_constraint_1, ... ]
3090							$input_constraint = [ 'word_0', 'word_1', ... ] (drop down)
3091							or 'free_text' (free text)
3092
3093
3094							=over 4
3095
3096							=item C
3097
3098							Converts a C<$tree> to a pair C<($input_constraints, $split_str)>.
3099
3100							C<$split_perlre> is a compiled perl regular expression splitting a string
3101							accordingly to C<$input_constraints>. This C<$perlre> matches if and only if
3102							each drop down can be assigned a value; then C<$str =~ $perlre> in list
3103							context returns as many values as C<@$input_constraints>.
3104
3105							=cut
3106
3107							sub tree_to_input_constraints {
3108	12			12	1	27	my ($input_constraints, $perlres) = &_tree_to_input_constraints;
3109
3110							# concatenate free texts and stronger underlying regexs
3111	12					17	my @previous_undefs;
3112							my @kept;
3113	12					31	for my $i (0..$#$input_constraints) {
3114	36	100				72	if ($$input_constraints[$i] eq FREE_TEXT) {
3115	15					24	push(@previous_undefs, $i);
3116							}
3117							else {
3118	21	100				52	if (@previous_undefs) {
3119	9					13	push(@kept, $i-1);
3120	9	100				22	if (@previous_undefs > 1) {
3121	6					21	$$perlres[$i-1] = join('',
3122	2					3	map { '(?:' . $$perlres[$_] . ')' }
3123							@previous_undefs
3124							);
3125							}
3126	9					15	@previous_undefs = ();
3127							}
3128	21					49	push(@kept, $i);
3129							}
3130							}
3131	12	100				41	if (@previous_undefs) {
3132	1					3	push(@kept, $#$input_constraints);
3133	1	50				4	if (@previous_undefs > 1) {
3134	2					7	$$perlres[$#$input_constraints] = join('',
3135	1					2	map { '(?:' . $$perlres[$_] . ')' }
3136							@previous_undefs
3137							);
3138							}
3139							}
3140	12					35	@$input_constraints = @$input_constraints[@kept];
3141	12					35	@$perlres = @$perlres[@kept];
3142
3143							# sort words, remove duplicates
3144	12					31	for (grep { $_ ne FREE_TEXT } @$input_constraints) {
	31					69
3145	21					24	$_ = [ sort(keys(%{ { map { ($_ => $_) } @$_ } })) ];
	21					59
	37					191
3146							}
3147
3148							# remove empty words
3149							# concatenate single words
3150	12					17	my @previous_singles;
3151	12					21	@kept = ();
3152	12					29	for my $i (0..$#$input_constraints) {
3153	31	100	100			112	if (
		50	33
3154	21					68	$$input_constraints[$i] eq FREE_TEXT
3155							\|\| @{$$input_constraints[$i]} > 1
3156	11					80	) {
3157	20	100				35	if (@previous_singles) {
3158	6					10	push(@kept, $i-1);
3159	6	50				14	if (@previous_singles > 1) {
3160	0					0	$$perlres[$i-1] = join('',
3161	0					0	map { $$perlres[$_] }
3162							@previous_singles
3163							);
3164	0					0	$$input_constraints[$i-1] = join('',
3165	0					0	map { $$input_constraints[$_][0] }
3166							@previous_singles
3167							);
3168							}
3169	6					11	@previous_singles = ();
3170							}
3171	20					37	push(@kept, $i);
3172							}
3173							elsif (
3174							@{$$input_constraints[$i]} == 1
3175							&& length($$input_constraints[$i][0])
3176							) {
3177	11					26	push(@previous_singles, $i);
3178							}
3179							}
3180	12	100				28	if (@previous_singles) {
3181	5					8	push(@kept, $#$input_constraints);
3182	5	50				25	if (@previous_singles > 1) {
3183	0					0	$$perlres[$#$input_constraints] = join('',
3184	0					0	map { $$perlres[$_] }
3185							@previous_singles
3186							);
3187	0					0	$$input_constraints[$#$input_constraints] = join('',
3188	0					0	map { $$input_constraints[$_][0] }
3189							@previous_singles
3190							);
3191							}
3192							}
3193	12					143	@$input_constraints = @$input_constraints[@kept];
3194	12					31	@$perlres = @$perlres[@kept];
3195
3196	12	50				29	if (!@$input_constraints) {
3197	0					0	@$input_constraints = (['']);
3198	0					0	@$perlres = ('');
3199							}
3200
3201	31	100				125	my $split_perlre
3202							= join('',
3203							map {
3204	12					32	$$input_constraints[$_] eq FREE_TEXT
3205							? "($$perlres[$_]\|.*?)"
3206							: "($$perlres[$_])"
3207							}
3208							(0..$#$perlres)
3209							)
3210							;
3211	12					829	return ($input_constraints, qr/\A$split_perlre\z/ms);
3212							}
3213
3214							{
3215
3216							my %cc_to_input_constraint_cache;
3217
3218							# returns ($input_constraints, $perlres)
3219							# two references to arrays of the same size.
3220							sub _tree_to_input_constraints {
3221	38			38		44	my ($tree) = @_;
3222	38					43	my $input_constraints;
3223							my $perlres;
3224	38	50				164	if (!defined($tree)) {
		100
		50
		100
		100
3225							# regex accepting nothing -> free text (always rejected)
3226
3227	0					0	$input_constraints = [FREE_TEXT];
3228	0					0	$perlres = ['$.'];
3229							}
3230	35					98	elsif (ref($tree) eq CHAR_CLASS) {
3231							# single character class -> drop down
3232
3233	3		66			14	$input_constraints = [
3234							$cc_to_input_constraint_cache{$tree}
3235							\|\|= cc_to_input_constraint($tree)
3236							];
3237	3					8	$perlres = [_tree_to_regex($tree, 1)];
3238							}
3239							elsif (@{$$tree[1]} == 0) {
3240							# no top-level alternation
3241
3242	0					0	$input_constraints = [['']];
3243	0					0	$perlres = [_tree_to_regex($tree, 1)];
3244							}
3245	29					64	elsif ($$tree[0]) {
3246							# starified regex -> free text
3247
3248	6					14	$input_constraints = [FREE_TEXT];
3249	6					17	$perlres = [_tree_to_regex($tree, 1)];
3250							}
3251							elsif (@{$$tree[1]} == 1) {
3252							# single top-level alternation -> mixed results
3253							# example: ab*c(d\|e)f
3254
3255	14					17	$input_constraints = [];
3256	14					21	$perlres = [];
3257
3258	14					20	my $i = 0;
3259	14					17	while ($i != @{$$tree[1][0]}) {
	43					113
3260	29					36	my $beg = $i;
3261	29					52	my @expanded_words = ('');
3262	29					29	my $cc;
3263	29		100			34	while (
			66
			66
3264	85					575	$i != @{$$tree[1][0]}
3265							&& ref($cc = $$tree[1][0][$i]) eq CHAR_CLASS
3266							&& (!@$cc \|\| $$cc[-1][1] != MAX_CHAR)
3267							) {
3268	56		66			179	my $input_constraint
3269							= $cc_to_input_constraint_cache{$cc}
3270							\|\|= cc_to_input_constraint($cc)
3271							;
3272
3273							@expanded_words
3274	57					79	= map {
3275	56					75	my $letter = $_;
3276	57					68	map { $_ . $letter }
	61					193
3277							@expanded_words
3278							}
3279							@$input_constraint
3280							;
3281	56					80	$i++;
3282							}
3283	29	100	66			104	if ($beg < $i && length($expanded_words[0])) {
3284	12					47	my $wrd_perlre = _tree_to_regex(
3285							[
3286							0
3287	12					26	, [[ @{$$tree[1][0]}[$beg..$i-1] ]]
3288							]
3289							, 1
3290							);
3291	12					39	push(@$input_constraints, \@expanded_words);
3292	12					20	push(@$perlres, $wrd_perlre);
3293							}
3294	29	100				74	if ($i < @{$$tree[1][0]}) {
	29					75
3295	23					82	my ($sub_input_constraints, $sub_perlres)
3296							= _tree_to_input_constraints($$tree[1][0][$i]);
3297	23	50	66			106	if (
			33
3298							@$sub_input_constraints
3299							&& (
3300							$$sub_input_constraints[0] eq FREE_TEXT
3301							\|\| grep { length($_) } @{$$sub_input_constraints[0]}
3302							)
3303							) {
3304	23					43	push(@$input_constraints, @$sub_input_constraints);
3305	23					32	push(@$perlres, @$sub_perlres);
3306							}
3307	23					61	$i++;
3308							}
3309							}
3310							}
3311							else {
3312							# multiple top-level alternations
3313
3314	15	100				19	if (
3315	279	100	66			1282	grep { grep {
	15					25
3316	38					58	ref($_) ne CHAR_CLASS
3317							\|\| (@$_ && $$_[$#$_][1] == MAX_CHAR)
3318							} @$_ }
3319							@{$$tree[1]}
3320							) {
3321							# some alternation contains a sub-tree -> mixed results
3322							# example: abd\|ab*d
3323							# common pre/suf-fixes are factorized out
3324							# example: a(bd\|b*)d
3325
3326	6					8	my $fst_len = @{$$tree[1][0]};
	6					12
3327	6					10	my ($pre_len, $suf_len) = (0, 0);
3328	6					14	for (1, 0) {
3329	16					24	my ($len_ref, @range)
3330							= $_
3331							? (\$pre_len, (0..$fst_len-1))
3332	12	100				44	: (\$suf_len, map {-$_} (1..$fst_len-$pre_len))
3333							;
3334	12					28	for my $i (@range) {
3335	13	100				19	if (
3336	28	100	66			185	grep {
3337	13					21	$i >= @$_
3338							\|\| ref($$_[$i]) ne CHAR_CLASS
3339							\|\| $$tree[1][0][$i] != $$_[$i]
3340							}
3341	13					21	@{$$tree[1]}[0..$#{$$tree[1]}]
3342							) {
3343	3					9	last;
3344							}
3345	10					20	$$len_ref++;
3346							}
3347							}
3348	6	100				18	if ($pre_len) {
3349	2					24	my ($pre_input_constraints, $pre_perlres)
3350							= _tree_to_input_constraints(
3351							[
3352							0
3353	2					7	, [[ @{$$tree[1][0]}[0..$pre_len-1] ]]
3354							]
3355							);
3356	2					5	push(@$input_constraints, @$pre_input_constraints);
3357	2					5	push(@$perlres, @$pre_perlres);
3358							}
3359
3360	6	50				13	if (
3361							my @mid_alts
3362	14					49	= map { [ @$_[$pre_len..$#$_-$suf_len] ] }
	6					10
3363							@{$$tree[1]}
3364							) {
3365	6					10	push(@$input_constraints, FREE_TEXT);
3366	6					17	push(@$perlres, _tree_to_regex([ 0, \@mid_alts ] , 1));
3367							}
3368
3369	6	100				24	if ($suf_len) {
3370	1					5	my ($suf_input_constraints, $suf_perlres)
3371							= _tree_to_input_constraints(
3372							[
3373							0
3374							, [[
3375	1					4	@{$$tree[1][0]}
3376							[$fst_len-$suf_len..$fst_len-1]
3377							]]
3378							]
3379							);
3380	1					4	push(@$input_constraints, @$suf_input_constraints);
3381	1					4	push(@$perlres, @$suf_perlres);
3382							}
3383							}
3384							else {
3385							# each alternation contains only non negated char classes
3386							# -> drop down
3387
3388	9					22	$perlres = [_tree_to_regex($tree, 1)];
3389	9					16	for my $word (@{$$tree[1]}) {
	9					21
3390	24					44	my @expanded_words = ('');
3391	24		66			33	for my $input_constraint (
	224					546
3392							map {
3393							$cc_to_input_constraint_cache{$_}
3394							\|\|= cc_to_input_constraint($_);
3395							}
3396							@$word
3397							) {
3398	224	50				338	if (@$input_constraint == 1) {
3399	224					270	for (@expanded_words) {
3400	224					435	$_ .= $$input_constraint[0];
3401							}
3402							}
3403							else {
3404							@expanded_words
3405	0					0	= map {
3406	0					0	my $letter = $_;
3407	0					0	map { $_ . $letter }
	0					0
3408							@expanded_words
3409							}
3410							@$input_constraint
3411							;
3412							}
3413							}
3414	24					34	push(@{$$input_constraints[0]}, @expanded_words);
	24					65
3415							}
3416							}
3417							}
3418	38					82	return ($input_constraints, $perlres);
3419							}
3420							}
3421
3422							sub cc_to_input_constraint {
3423	28			28	0	34	my ($cc) = @_;
3424	28	50				81	if (@$cc == 0) {
		100
3425	0					0	return [''];
3426							}
3427							elsif ($$cc[$#$cc][1] == MAX_CHAR) {
3428	1					5	return FREE_TEXT;
3429							}
3430							else {
3431							return [
3432	27					41	map { map { chr($_) } ($$_[0]..$$_[1]) }
	27					41
	28					138
3433							@$cc
3434							];
3435							}
3436							}
3437
3438
3439							##############################################################################
3440							# $ere
3441							##############################################################################
3442
3443							=back
3444
3445							=head2 Ere
3446
3447							An C<$ere> is a perl string.
3448
3449							The syntax an C<$ere> is assumed to follow is based on POSIX ERE
3450							(else the C routines will C).
3451
3452							Unsupported POSIX features:
3453							back-references,
3454							equivalence classes C<[[=a=]]>,
3455							character class C<[[:digit:]]>,
3456							collating symbols C<[[.ch.]]>.
3457
3458							C<)> is always a special character. POSIX says that C<)> is a normal
3459							character if there is no matching C<(>.
3460
3461							There is no escape sequences such as C<\t> for tab or C<\n> for line feed.
3462							POSIX does not specify such escape sequences neither.
3463
3464							C<\> before a non-special character is ignored
3465							(except in bracket expressions). POSIX does not allow it.
3466
3467							The empty string is legal in alternations (C<(\|a)> is equivalent to C<(a?)>).
3468							POSIX does not allow it.
3469							The C<(\|a)> form is generated by the C routines
3470							(avoiding quantifiers other than C<*>).
3471
3472							C<[a-l-z]> is interpreted as C<([a-l] \| - \| z)> (but it is discouraged to
3473							rely upon this implementation artefact). POSIX says that the interpretation
3474							of this construct is undefined.
3475
3476							In bracket expressions, C<\> is a normal character,
3477							thus C<]> as character must occur first, or second after a C<^>
3478							(POSIX compliant, but possibly surprising for perl programmers).
3479
3480							All unicode characters supported by perl are allowed as literal characters.
3481
3482							=over 4
3483
3484							=item C
3485
3486							Parses an C<$ere> to a C<$nfa>.
3487
3488							WARNING: the parsing routines, in particular C,
3489							C on syntax errors; thus the caller may want to eval-trap such errors.
3490
3491							=cut
3492
3493							sub ere_to_nfa {
3494	213			213	1	37148	my ($ere, $has_anchor_ref) = @_;
3495
3496							# optimize very first and very last anchors
3497	213					1190	my $has_beg_anchor = $ere =~ s/^\^+//;
3498	213					892	my $has_end_anchor = $ere =~ s/\$+$//;
3499
3500	213					349	$$has_anchor_ref = 0;
3501	213					254	my @alternation_nfas;
3502	213					234	do {
3503	213					544	push(@alternation_nfas, parse_alternation(\$ere, $has_anchor_ref));
3504							} while($ere =~ /\G \\| /xmsgc);
3505
3506	213	50	100			727	if ((pos($ere) \|\| 0) != length($ere)) {
3507	0					0	parse_die("unexpected character", \$ere);
3508							}
3509
3510	213					246	my $nfa;
3511	213	100	100			563	if (!$has_beg_anchor && !$has_end_anchor) {
3512							# a\|b\|c => ^.(a\|b\|c).$
3513
3514	10	50				82	$nfa = nfa_concat(
3515							[[1, [[$cc_any, 0]]]]
3516							, @alternation_nfas == 1
3517							? $alternation_nfas[0]
3518							: nfa_union(@alternation_nfas)
3519							, [[1, [[$cc_any, 0]]]]
3520							);
3521							}
3522							else {
3523	203					620	for my $alternation_nfa (@alternation_nfas[1..$#alternation_nfas-1]) {
3524	0					0	$alternation_nfa = nfa_concat(
3525							[[1, [[$cc_any, 0]]]]
3526							, $alternation_nfa
3527							, [[1, [[$cc_any, 0]]]]
3528							);
3529							}
3530	203	100	66			866	if (!$has_beg_anchor \|\| @alternation_nfas > 1) {
3531	6	50				45	$alternation_nfas[0] = nfa_concat(
		50
3532							!$has_beg_anchor ? [[1, [[$cc_any, 0]]]] : ()
3533							, $alternation_nfas[0]
3534							, @alternation_nfas > 1 ? [[1, [[$cc_any, 0]]]] : ()
3535							);
3536							}
3537	203	100	66			838	if (!$has_end_anchor \|\| @alternation_nfas > 1) {
3538	2	50				17	$alternation_nfas[-1] = nfa_concat(
		50
3539							@alternation_nfas > 1 ? [[1, [[$cc_any, 0]]]] : ()
3540							, $alternation_nfas[-1]
3541							, !$has_end_anchor ? [[1, [[$cc_any, 0]]]] : ()
3542							);
3543							}
3544							$nfa
3545	203	50				445	= @alternation_nfas == 1
3546							? $alternation_nfas[0]
3547							: nfa_union(@alternation_nfas)
3548							;
3549							}
3550
3551	213	100				1005	return $$has_anchor_ref ? nfa_resolve_anchors($nfa) : $nfa;
3552							}
3553
3554							sub _ere_to_nfa {
3555	237			237		277	my ($str_ref, $has_anchor_ref) = @_;
3556
3557	237					228	my @alternation_nfas;
3558	237					270	do {
3559	384					739	push(@alternation_nfas, parse_alternation($str_ref, $has_anchor_ref));
3560							} while($$str_ref =~ /\G \\| /xmsgc);
3561
3562							return
3563	237	100				671	@alternation_nfas == 1
3564							? $alternation_nfas[0]
3565							: nfa_union(@alternation_nfas)
3566							;
3567							}
3568
3569							sub bracket_expression_to_cc {
3570	102			102	0	117	my ($str_ref) = @_;
3571	102					192	my $neg = $$str_ref =~ /\G \^/xmsgc;
3572	102					133	my $interval_list = [];
3573
3574							# anything is allowed a first char, in particular ']' and '-'
3575	102	100				411	if ($$str_ref =~ /\G (.) - ([^]]) /xmsgc) {
		50
3576	16					50	push(@$interval_list, [ord($1), ord($2)]);
3577							}
3578							elsif ($$str_ref =~ /\G (.) /xmsgc) {
3579	86					237	push(@$interval_list, [ord($1), ord($1)]);
3580							}
3581
3582	102					117	my $loop = 1;
3583	102					189	while ($loop) {
3584	177	50				520	if ($$str_ref =~ /\G ([^]]) - ([^]]) /xmsgc) {
		100
3585	0					0	push(@$interval_list, [ord($1), ord($2)]);
3586							}
3587							elsif ($$str_ref =~ /\G ([^]]) /xmsgc) {
3588	75					229	push(@$interval_list, [ord($1), ord($1)]);
3589							}
3590							else {
3591	102					237	$loop = 0;
3592							}
3593							}
3594
3595							return
3596	102	100				236	$neg
3597							? cc_neg(interval_list_to_cc($interval_list))
3598							: interval_list_to_cc($interval_list)
3599							;
3600							}
3601
3602							# Returns:
3603							# - the empty list iff no quantification has been parsed
3604							# - a 2-tuple ($min, $max)
3605							# either $max is the empty string
3606							# or $min <= $max
3607							sub parse_quant {
3608	268			268	0	339	my ($str_ref) = @_;
3609	268	100				887	if ($$str_ref =~ /\G \* /xmsgc) {
		100
		100
		50
3610	235					697	return (0, '');
3611							}
3612							elsif ($$str_ref =~ /\G \+ /xmsgc) {
3613	16					48	return (1, '');
3614							}
3615							elsif ($$str_ref =~ /\G \? /xmsgc) {
3616	8					23	return (0, 1);
3617							}
3618							elsif ($$str_ref =~ /\G \{ /xmsgc) {
3619	9					14	my ($min, $max);
3620	9	50				37	if ($$str_ref =~ /\G ( [0-9]+ ) /xmsgc) {
3621	9					26	$min = $1;
3622	9	100				32	if ($$str_ref =~ /\G , ([0-9]*) /xmsgc) {
3623	8					17	$max = $1; # may be ''
3624	8	50	66			51	if (length($max) && $min > $max) {
3625	0					0	parse_die("$min > $max", $str_ref);
3626							}
3627							}
3628							else {
3629	1					2	$max = $min;
3630							}
3631							}
3632							else {
3633	0					0	parse_die('number expected', $str_ref);
3634							}
3635
3636	9	50				36	if ($$str_ref !~ /\G \} /xmsgc) {
3637	0					0	parse_die('} expected', $str_ref);
3638							}
3639	9					36	return ($min, $max);
3640							}
3641							else {
3642	0					0	return;
3643							}
3644							}
3645
3646							=item quote($string)
3647
3648							Returns $string with escaped special characters.
3649
3650							=cut
3651
3652							sub quote {
3653	0			0	1	0	my ($str) = @_;
3654	0					0	$str =~ s/([.\[\\(*+?{\|^\$])/\\$1/xsmg;
3655	0					0	return $str;
3656							}
3657
3658							{
3659							my %char_to_cc_cache;
3660							sub parse_alternation {
3661	597			597	0	707	my ($str_ref, $has_anchor_ref) = @_;
3662	597					639	my @all_nfas;
3663							my $loop;
3664	0					0	my @quants;
3665	597					601	do {
3666	999					1070	$loop = 0;
3667	999					1346	my $nfa = [];
3668	999					1074	my $next_state_index = 1;
3669	999					874	while (1) {
3670	1622	100				8069	if ($$str_ref =~ /\G ( $ERE_literal + ) /xmsogc) {
		100
		100
		100
		100
		100
3671	1144		66			5160	push(@$nfa,
3672							map {
3673	435					1467	[ 0, [[
3674							$char_to_cc_cache{$_} \|\|= char_to_cc($_)
3675							, $next_state_index++
3676							]]]
3677							}
3678							split('', $1)
3679							);
3680							}
3681							elsif ($$str_ref =~ /\G ( \. + ) /xmsgc) {
3682	52					236	push(@$nfa,
3683							map {
3684	51					162	[ 0, [[
3685							$cc_any
3686							, $next_state_index++
3687							]]]
3688							}
3689							(1..length($1))
3690							);
3691							}
3692							elsif ($$str_ref =~ /\G ( \[ ) /xmsgc) {
3693	102					221	push(@$nfa,
3694							[ 0, [[
3695							bracket_expression_to_cc($str_ref)
3696							, $next_state_index++
3697							]]]
3698							);
3699	102	50				419	if ($$str_ref !~ /\G ] /xmsgc) {
3700	0					0	parse_die('] expected', $str_ref);
3701							}
3702							}
3703							elsif ($$str_ref =~ /\G \\ (.) /xmsgc) {
3704	17		66			93	push(@$nfa,
3705							[ 0, [[
3706							$char_to_cc_cache{$1} \|\|= char_to_cc($1)
3707							, $next_state_index++
3708							]]]
3709							);
3710							}
3711							elsif ($$str_ref =~ /\G \^ /xmsgc) {
3712	9					32	push(@$nfa,
3713							[ 0, [[
3714							$cc_beg
3715							, $next_state_index++
3716							]]]
3717							);
3718	9		100			33	$$has_anchor_ref \|\|= 1;
3719							}
3720							elsif ($$str_ref =~ /\G \$ /xmsgc) {
3721	9					39	push(@$nfa,
3722							[ 0, [[
3723							$cc_end
3724							, $next_state_index++
3725							]]]
3726							);
3727	9		100			46	$$has_anchor_ref \|\|= 1;
3728							}
3729							else {
3730	999					1166	last;
3731							}
3732							}
3733
3734	999	100				2057	if (@$nfa) {
3735	497	100				1276	if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) {
3736	173					342	my $last_char_class = $$nfa[$#$nfa][1][0][0];
3737	173	100				357	if (@$nfa > 1) {
3738	82					124	@{$$nfa[$#$nfa]} = (1, []);
	82					216
3739	82					130	push(@all_nfas, $nfa);
3740							}
3741	173					359	push(@quants, [scalar(@all_nfas), parse_quant($str_ref)]);
3742	173					664	push(@all_nfas, [[0, [[$last_char_class, 1 ]]], [1, []]]);
3743	173					268	$loop = 1;
3744							}
3745							else {
3746	324					735	push(@$nfa, [1, []]);
3747	324					457	push(@all_nfas, $nfa);
3748							}
3749							}
3750
3751	999	100				3440	if ($$str_ref =~ /\G \( /xmsgc) {
3752	237					552	$nfa = _ere_to_nfa($str_ref, $has_anchor_ref);
3753	237	50				859	if ($$str_ref !~ /\G \) /xmsgc) {
3754	0					0	parse_die(') expected', $str_ref);
3755							}
3756	237	100				639	if ($$str_ref =~ /\G (?= [*+?{] ) /xmsgc) {
3757	95					205	push(@quants, [scalar(@all_nfas), parse_quant($str_ref)]);
3758							}
3759	237					335	push(@all_nfas, $nfa);
3760	237					567	$loop = 1;
3761							}
3762							} while ($loop);
3763
3764	597					1017	for (@quants) {
3765	268					444	my ($i, $min, $max) = @$_;
3766	268		66			1378	$all_nfas[$i] = nfa_quant(
			66
3767							$all_nfas[$i]
3768							, $min, $max
3769							, $min && $i != 0 && _nfa_has_suffix($all_nfas[$i-1])
3770							, $min && $i != $#all_nfas && _nfa_has_prefix($all_nfas[$i+1])
3771							);
3772							}
3773
3774	597	100				1301	if (@all_nfas > 1) {
		100
3775	208					484	return nfa_concat(@all_nfas);
3776							}
3777							elsif (@all_nfas) {
3778	308					1373	return $all_nfas[0];
3779							}
3780							else {
3781	81					491	return [[1, []]];
3782							}
3783							}
3784							}
3785
3786							sub _nfa_has_prefix {
3787	19			19		31	my ($nfa) = @_;
3788							# initial state non-accepting or no loop back to it
3789	19		33			118	!$$nfa[0][0] \|\| !grep { $$_[1] == 0 } map { @{$$_[1]} } @$nfa;
3790							}
3791
3792							sub _nfa_has_suffix {
3793	23			23		33	my ($nfa) = @_;
3794							# all accepting states are final
3795	23	100				37	!grep { $$_[0] && @{$$_[1]} } @$nfa
	55					127
	23					214
3796							}
3797
3798							sub parse_die {
3799	0			0	0	0	my ($msg, $str_ref) = @_;
3800	0		0			0	die("malformed regex: $msg at "
3801							. (pos($$str_ref) \|\| 0) . " in $$str_ref");
3802							}
3803
3804
3805							##############################################################################
3806							# Shorthands
3807							##############################################################################
3808
3809							=back
3810
3811							=head2 Shorthands
3812
3813							=over 4
3814
3815							=item C
3816							:= C
3817
3818							=cut
3819
3820							sub ere_to_tree {
3821	0			0	1	0	my ($ere) = @_;
3822	0					0	return nfa_to_tree(ere_to_nfa($ere));
3823							}
3824
3825							=item C
3826							:= C
3827
3828							=cut
3829
3830							sub ere_to_regex {
3831	0			0	1	0	my ($ere, $to_perlre) = (@_, 0);
3832	0					0	return tree_to_regex(ere_to_tree($ere), $to_perlre);
3833							}
3834
3835							=item C
3836							:= C
3837
3838							=cut
3839
3840							sub nfa_to_regex {
3841	118			118	1	347	my ($nfa, $to_perlre) = (@_, 0);
3842	118					330	return tree_to_regex(nfa_to_tree($nfa), $to_perlre);
3843							}
3844
3845							=item C
3846							:= C
3847
3848							=cut
3849
3850							sub ere_to_input_constraints {
3851	0			0	1	0	my ($ere) = @_;
3852	0					0	return tree_to_input_constraints(ere_to_tree($ere));
3853							}
3854
3855							=item C
3856							:= C
3857
3858							=cut
3859
3860							sub nfa_to_input_constraints {
3861	12			12	1	98	my ($nfa) = @_;
3862	12					37	return tree_to_input_constraints(nfa_to_tree($nfa));
3863							}
3864
3865							=item C
3866							:= C
3867
3868							=cut
3869
3870							sub nfa_to_min_dfa {
3871	205			205	1	41945	my ($nfa) = @_;
3872	205					521	return dfa_to_min_dfa(nfa_to_dfa($nfa));
3873							}
3874
3875							1;
3876
3877							=back
3878
3879							=head1 AUTHOR
3880
3881							Loïc Jonas Etienne
3882
3883							=head1 COPYRIGHT and LICENSE
3884
3885							Artistic License 2.0
3886							http://www.perlfoundation.org/artistic_license_2_0