File Coverage

blib/lib/Regexp/Assemble.pm

Criterion	Covered	Total	%
statement	1141	1155	98.7
branch	777	796	97.6
condition	174	177	98.3
subroutine	97	98	98.9
pod	49	49	100.0
total	2238	2275	98.3

line	stmt	bran	cond	sub	pod	time	code
1							# Regexp::Assemple.pm
2							#
3							# Copyright (c) 2004-2011 David Landgren
4							# All rights reserved
5
6							package Regexp::Assemble;
7
8	11			11		35637	use strict;
	11					25
	11					266
9	11			11		49	use warnings;
	11					21
	11					293
10
11	11			11		57	use constant DEBUG_ADD => 1;
	11					29
	11					1054
12	11			11		64	use constant DEBUG_TAIL => 2;
	11					24
	11					425
13	11			11		56	use constant DEBUG_LEX => 4;
	11					24
	11					406
14	11			11		70	use constant DEBUG_TIME => 8;
	11					30
	11					495
15
16	11			11		69	use vars qw/$have_Storable $Current_Lexer $Default_Lexer $Single_Char $Always_Fail/;
	11					23
	11					38857
17
18							# The following patterns were generated with examples/naive.
19
20							$Default_Lexer = qr/(?![[(\\]).(?:[+?]\??\|\{\d+(?:,\d)?\}\??)?\|\\(?:[bABCEGLQUXZ]\|[lu].\|(?:[^\w]\|[aefnrtdDwWsS]\|c.\|0\d{2}\|x(?:[\da-fA-F]{2}\|{[\da-fA-F]{4}})\|N\{\w+\}\|[Pp](?:\{\w+\}\|.))(?:[+?]\??\|\{\d+(?:,\d)?\}\??)?)\|\[.*?(?
21
22							$Single_Char = qr/^(?:\\(?:[aefnrtdDwWsS]\|c.\|[^\w\/{\|}-]\|0\d{2}\|x(?:[\da-fA-F]{2}\|{[\da-fA-F]{4}}))\|[^\$^])$/;
23
24							# The pattern to return when nothing has been added (and thus not match anything)
25
26							$Always_Fail = "^\\b\0";
27
28							our $VERSION = '0.38';
29
30							# ------------------------------------------------
31
32							sub new {
33	2194			2194	1	1890739	my $class = shift;
34	2194					5767	my %args = @_;
35
36	2194					3536	my $anc;
37	2194					4382	for $anc (qw(word line string)) {
38	6582	100				16900	if (exists $args{"anchor_$anc"}) {
39	135					329	my $val = delete $args{"anchor_$anc"};
40	135					398	for my $anchor ("anchor_${anc}_begin", "anchor_${anc}_end") {
41	270	100				907	$args{$anchor} = $val unless exists $args{$anchor};
42							}
43							}
44							}
45
46							# anchor_string_absolute sets anchor_string_begin and anchor_string_end_absolute
47	2194	100				5316	if (exists $args{anchor_string_absolute}) {
48	3					6	my $val = delete $args{anchor_string_absolute};
49	3					6	for my $anchor (qw(anchor_string_begin anchor_string_end_absolute)) {
50	6	100				17	$args{$anchor} = $val unless exists $args{$anchor};
51							}
52							}
53
54	2194		100			37419	exists $args{$_} or $args{$_} = 0 for qw(
55							anchor_word_begin
56							anchor_word_end
57							anchor_line_begin
58							anchor_line_end
59							anchor_string_begin
60							anchor_string_end
61							anchor_string_end_absolute
62							debug
63							dup_warn
64							indent
65							lookahead
66							mutable
67							track
68							unroll_plus
69							);
70
71	2194		100			12637	exists $args{$_} or $args{$_} = 1 for qw(
72							fold_meta_pairs
73							reduce
74							chomp
75							);
76
77	2194					6540	@args{qw(re str path)} = (undef, undef, []);
78
79	2194		100			15086	$args{flags} \|\|= delete $args{modifiers} \|\| '';
			100
80	2194	100				4959	$args{lex} = $Current_Lexer if defined $Current_Lexer;
81
82	2194					4623	my $self = bless \%args, $class;
83
84	2194	100				5633	if ($self->_debug(DEBUG_TIME)) {
85	1					5	$self->_init_time_func();
86	1					5	$self->{_begin_time} = $self->{_time_func}->();
87							}
88							$self->{input_record_separator} = delete $self->{rs}
89	2194	100				5203	if exists $self->{rs};
90	2194	100				4929	exists $self->{file} and $self->add_file($self->{file});
91
92	2193					5859	return $self;
93							}
94
95							sub _init_time_func {
96	9			9		16	my $self = shift;
97	9	100				31	return if exists $self->{_time_func};
98
99							# attempt to improve accuracy
100	6	100				18	if (!defined($self->{_use_time_hires})) {
101	5					25	eval {require Time::HiRes};
	5					960
102	5					2158	$self->{_use_time_hires} = $@;
103							}
104							$self->{_time_func} = length($self->{_use_time_hires}) > 0
105	4			4		12	? sub { time }
106	6	100				30	: \&Time::HiRes::time
107							;
108							}
109
110							sub clone {
111	55			55	1	418	my $self = shift;
112	55					91	my $clone;
113	55					287	my @attr = grep {$_ ne 'path'} keys %$self;
	1314					2162
114	55					150	@{$clone}{@attr} = @{$self}{@attr};
	55					422
	55					136
115	55					149	$clone->{path} = _path_clone($self->_path);
116	55					328	bless $clone, ref($self);
117							}
118
119							sub _fastlex {
120	884			884		1480	my $self = shift;
121	884					1401	my $record = shift;
122	884					1283	my $len = 0;
123	884					1631	my @path = ();
124	884					1340	my $case = '';
125	884					1287	my $qm = '';
126
127	884					1499	my $debug = $self->{debug} & DEBUG_LEX;
128	884					1333	my $unroll_plus = $self->{unroll_plus};
129
130	884					1450	my $token;
131							my $qualifier;
132	884	100				3364	$debug and print "# _lex <$record>\n";
133	884					1685	my $modifier = q{(?:[+?]\\??\|\\{(?:\\d+(?:,\d)?\|,\d+)\\}\\??)?};
134	884					2519	my $class_matcher = qr/\[(?:\[:[a-z]+:\]\|\\?.)*?\]/;
135	884					3385	my $paren_matcher = qr/\(.*?(?
136	884					3310	my $misc_matcher = qr/(?:(c)(.)\|(0)(\d{2}))($modifier)/;
137	884					2779	my $regular_matcher = qr/([^\\[(])($modifier)/;
138	884					1926	my $qm_matcher = qr/(\\?.)/;
139
140	884					1441	my $matcher = $regular_matcher;
141							{
142	884	100				1259	if ($record =~ /\G$matcher/gc) {
	5960	100				25785
		100
		100
143							# neither a \\ nor [ nor ( followed by a modifer
144	3344	100	100			11141	if ($1 eq '\\E') {
		100	66
145	12	100				106	$debug and print "# E\n";
146	12					28	$case = $qm = '';
147	12					20	$matcher = $regular_matcher;
148	12					22	redo;
149							}
150							elsif ($qm and ($1 eq '\\L' or $1 eq '\\U')) {
151	5	100				27	$debug and print "# ignore \\L, \\U\n";
152	5					13	redo;
153							}
154	3327					5253	$token = $1;
155	3327	100				6090	$qualifier = defined $2 ? $2 : '';
156	3327	100				8584	$debug and print "# token <$token> <$qualifier>\n";
157	3327	100				5351	if ($qm) {
158	90					141	$token = quotemeta($token);
159	90					173	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{}\/])$/$1/;
160							}
161							else {
162	3237					5539	$token =~ s{\A([][{}*+?@\\/])\Z}{\\$1};
163							}
164	3327	100	100			7963	if ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/) {
165	22	100				71	$1 and $qualifier .= $1;
166	22	100				692	$debug and print " unroll <$token><$token><$qualifier>\n";
167	22	100				78	$case and $token = $case eq 'L' ? lc($token) : uc($token);
		100
168	22					60	push @path, $token, "$token$qualifier";
169							}
170							else {
171	3305	100				7572	$debug and print " clean <$token>\n";
172	3305	100				8433	push @path,
		100
173							$case eq 'L' ? lc($token).$qualifier
174							: $case eq 'U' ? uc($token).$qualifier
175							: $token.$qualifier
176							;
177							}
178	3327					5227	redo;
179							}
180
181							elsif ($record =~ /\G\\/gc) {
182	1680	100				4494	$debug and print "# backslash\n";
183							# backslash
184	1680	100				12887	if ($record =~ /\G([sdwSDW])($modifier)/gc) {
		100
		100
		100
		100
		100
		100
		100
185	443					989	($token, $qualifier) = ($1, $2);
186	443	100				1049	$debug and print "# meta <$token> <$qualifier>\n";
187	443	100	100			1369	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
188							? ("\\$token", "\\$token$qualifier" . (defined $1 ? $1 : ''))
189							: "\\$token$qualifier";
190							}
191							elsif ($record =~ /\Gx([\da-fA-F]{2})($modifier)/gc) {
192	9	100				249	$debug and print "# x $1\n";
193	9					52	$token = quotemeta(chr(hex($1)));
194	9					23	$qualifier = $2;
195	9	100				239	$debug and print "# cooked <$token>\n";
196	9					41	$token =~ s/^\\([^\w$()*+.?\[\\\]^\|{\/])$/$1/; # } balance
197	9	100				242	$debug and print "# giving <$token>\n";
198	9	100	100			72	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
199							? ($token, "$token$qualifier" . (defined $1 ? $1 : ''))
200							: "$token$qualifier";
201							}
202							elsif ($record =~ /\GQ/gc) {
203	26	100				209	$debug and print "# Q\n";
204	26					52	$qm = 1;
205	26					41	$matcher = $qm_matcher;
206							}
207							elsif ($record =~ /\G([LU])/gc) {
208	15	100				286	$debug and print "# case $1\n";
209	15					41	$case = $1;
210							}
211							elsif ($record =~ /\GE/gc) {
212	6	100				18	$debug and print "# E\n";
213	6					12	$case = $qm = '';
214	6					9	$matcher = $regular_matcher;
215							}
216							elsif ($record =~ /\G([lu])(.)/gc) {
217	6	100				22	$debug and print "# case $1 to <$2>\n";
218	6	100				23	push @path, $1 eq 'l' ? lc($2) : uc($2);
219							}
220	30					63	elsif (my @arg = grep {defined} $record =~ /\G$misc_matcher/gc) {
221	6	50				17	if ($] < 5.007) {
222	0					0	my $len = 0;
223	0					0	$len += length($_) for @arg;
224	0	0				0	$debug and print "# pos ", pos($record), " fixup add $len\n";
225	0					0	pos($record) = pos($record) + $len;
226							}
227	6					12	my $directive = shift @arg;
228	6	100				13	if ($directive eq 'c') {
229	3	100				12	$debug and print "# ctrl <@arg>\n";
230	3					11	push @path, "\\c" . uc(shift @arg);
231							}
232							else { # elsif ($directive eq '0') {
233	3	100				11	$debug and print "# octal <@arg>\n";
234	3					8	my $ascii = oct(shift @arg);
235	3	100				26	push @path, ($ascii < 32)
236							? "\\c" . chr($ascii+64)
237							: chr($ascii)
238							;
239							}
240	6					12	$path[-1] .= join( '', @arg ); # if @arg;
241	6					15	redo;
242							}
243							elsif ($record =~ /\G(.)/gc) {
244	1165					2673	$token = $1;
245	1165					3037	$token =~ s{[AZabefnrtz\[\]{}()\\\$*+.?@\|/^]}{\\$token};
246	1165	100				2795	$debug and print "# meta <$token>\n";
247	1165					2339	push @path, $token;
248							}
249							else {
250	4	100				23	$debug and print "# ignore char at ", pos($record), " of <$record>\n";
251							}
252	1674					3274	redo;
253							}
254
255							elsif ($record =~ /\G($class_matcher)($modifier)/gc) {
256							# [class] followed by a modifer
257	39					103	my $class = $1;
258	39	50				100	my $qualifier = defined $2 ? $2 : '';
259	39	100				341	$debug and print "# class begin <$class> <$qualifier>\n";
260	39	100				147	if ($class =~ /\A\[\\?(.)]\Z/) {
261	12					28	$class = quotemeta $1;
262	12					35	$class =~ s{\A\\([!@%])\Z}{$1};
263	12	100				110	$debug and print "# class unwrap $class\n";
264							}
265	39	100				337	$debug and print "# class end <$class> <$qualifier>\n";
266	39	100	100			183	push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
		100
267							? ($class, "$class$qualifier" . (defined $1 ? $1 : ''))
268							: "$class$qualifier";
269	39					84	redo;
270							}
271
272							elsif ($record =~ /\G($paren_matcher)/gc) {
273	13	100				37	$debug and print "# paren <$1>\n";
274							# (paren) followed by a modifer
275	13					29	push @path, $1;
276	13					21	redo;
277							}
278
279							}
280	884					3823	return \@path;
281							}
282
283							sub _lex {
284	211			211		415	my $self = shift;
285	211					335	my $record = shift;
286	211					313	my $len = 0;
287	211					364	my @path = ();
288	211					326	my $case = '';
289	211					310	my $qm = '';
290							my $re = defined $self->{lex} ? $self->{lex}
291	211	50				541	: defined $Current_Lexer ? $Current_Lexer
		100
292							: $Default_Lexer;
293	211					340	my $debug = $self->{debug} & DEBUG_LEX;
294	211	100				1949	$debug and print "# _lex <$record>\n";
295	211					370	my ($token, $next_token, $diff, $token_len);
296	211					2729	while( $record =~ /($re)/g ) {
297	480					1132	$token = $1;
298	480					731	$token_len = length($token);
299	480	100				8049	$debug and print "# lexed <$token> len=$token_len\n";
300	480	100				1273	if( pos($record) - $len > $token_len ) {
301	15					30	$next_token = $token;
302	15					44	$token = substr( $record, $len, $diff = pos($record) - $len - $token_len );
303	15	100				798	$debug and print "# recover <", substr( $record, $len, $diff ), "> as <$token>, save <$next_token>\n";
304	15					43	$len += $diff;
305							}
306	480					698	$len += $token_len;
307							TOKEN: {
308	480	100				685	if( substr( $token, 0, 1 ) eq '\\' ) {
	495					1153
309	226	100				843	if( $token =~ /^\\([ELQU])$/ ) {
		100
		100
310	51	100				160	if( $1 eq 'E' ) {
		100
311							$qm and $re = defined $self->{lex} ? $self->{lex}
312	12	50				80	: defined $Current_Lexer ? $Current_Lexer
		100
		100
313							: $Default_Lexer;
314	12					29	$case = $qm = '';
315							}
316							elsif( $1 eq 'Q' ) {
317	19					59	$qm = $1;
318							# switch to a more precise lexer to quotemeta individual characters
319	19					80	$re = qr/\\?./;
320							}
321							else {
322	20					36	$case = $1;
323							}
324	51	100				1531	$debug and print "# state change qm=<$qm> case=<$case>\n";
325	51					430	goto NEXT_TOKEN;
326							}
327							elsif( $token =~ /^\\([lu])(.)$/ ) {
328	3	100				85	$debug and print "# apply case=<$1> to <$2>\n";
329	3	100				22	push @path, $1 eq 'l' ? lc($2) : uc($2);
330	3					28	goto NEXT_TOKEN;
331							}
332							elsif( $token =~ /^\\x([\da-fA-F]{2})$/ ) {
333	41					124	$token = quotemeta(chr(hex($1)));
334	41	100				164	$debug and print "# cooked <$token>\n";
335	41					105	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{\/])$/$1/; # } balance
336	41	100				167	$debug and print "# giving <$token>\n";
337							}
338							else {
339	131					255	$token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{\/])$/$1/; # } balance
340	131	100				597	$debug and print "# backslashed <$token>\n";
341							}
342							}
343							else {
344	269	100				604	$case and $token = $case eq 'U' ? uc($token) : lc($token);
		100
345	269	100				570	$qm and $token = quotemeta($token);
346	269	100				593	$token = '\\/' if $token eq '/';
347							}
348							# undo quotemeta's brute-force escapades
349	441	100				907	$qm and $token =~ s/^\\([^\w$()*+.?@\[\\\]^\|{}\/])$/$1/;
350	441	100				7310	$debug and print "# <$token> case=<$case> qm=<$qm>\n";
351	441					1034	push @path, $token;
352
353							NEXT_TOKEN:
354	495	100				3408	if( defined $next_token ) {
355	15	100				831	$debug and print "# redo <$next_token>\n";
356	15					59	$token = $next_token;
357	15					32	$next_token = undef;
358	15					61	redo TOKEN;
359							}
360							}
361							}
362	211	100				487	if( $len < length($record) ) {
363							# NB: the remainder only arises in the case of degenerate lexer,
364							# and if \Q is operative, the lexer will have been switched to
365							# /\\?./, which means there can never be a remainder, so we
366							# don't have to bother about quotemeta. In other words:
367							# $qm will never be true in this block.
368	7					20	my $remain = substr($record,$len);
369	7	100				23	$case and $remain = $case eq 'U' ? uc($remain) : lc($remain);
		100
370	7	100				554	$debug and print "# add remaining <$remain> case=<$case> qm=<$qm>\n";
371	7					36	push @path, $remain;
372							}
373	211	100				1821	$debug and print "# _lex out <@path>\n";
374	211					1200	return \@path;
375							}
376
377							sub add {
378	1059			1059	1	14157	my $self = shift;
379	1059					1523	my $record;
380	1059					1970	my $debug = $self->{debug} & DEBUG_LEX;
381	1059					2804	while( defined( $record = shift @_ )) {
382	2263	100				5532	CORE::chomp($record) if $self->{chomp};
383	2263	100	100			5770	next if $self->{pre_filter} and not $self->{pre_filter}->($record);
384	2262	100				7581	$debug and print "# add <$record>\n";
385	2262					4095	$self->{stats_raw} += length $record;
386							my $list = $record =~ /[+*?(\\\[{]/ # }]) restore equilibrium
387	2262	100				10559	? $self->{lex} ? $self->_lex($record) : $self->_fastlex($record)
		100
388							: [split //, $record]
389							;
390	2262	100	100			6691	next if $self->{filter} and not $self->{filter}->(@$list);
391	2261					5028	$self->_insertr( $list );
392							}
393	1059					2777	return $self;
394							}
395
396							sub add_file {
397	13			13	1	25	my $self = shift;
398	13					27	my $rs;
399							my @file;
400	13	100				31	if (ref($_[0]) eq 'HASH') {
401	6					11	my $arg = shift;
402							$rs = $arg->{rs}
403							\|\| $arg->{input_record_separator}
404							\|\| $self->{input_record_separator}
405	6		100			36	\|\| $/;
406							@file = ref($arg->{file}) eq 'ARRAY'
407	4					12	? @{$arg->{file}}
408	6	100				16	: $arg->{file};
409							}
410							else {
411	7		100			31	$rs = $self->{input_record_separator} \|\| $/;
412	7					17	@file = @_;
413							}
414	13					41	local $/ = $rs;
415	13					18	my $file;
416	13					24	for $file (@file) {
417	15	100				500	open my $fh, '<', $file or do {
418	1					8	require Carp;
419	1					152	Carp::croak("cannot open $file for input: $!");
420							};
421	14					126	while (defined (my $rec = <$fh>)) {
422	55					114	$self->add($rec);
423							}
424	14					109	close $fh;
425							}
426	12					58	return $self;
427							}
428
429							sub insert {
430	3398			3398	1	12089	my $self = shift;
431	3398	100	100			8684	return if $self->{filter} and not $self->{filter}->(@_);
432	3397					11218	$self->_insertr( [@_] );
433	3397					7593	return $self;
434							}
435
436							sub _insertr {
437	7068			7068		10664	my $self = shift;
438	7068		100			23280	my $dup = $self->{stats_dup} \|\| 0;
439	7068					14432	$self->{path} = $self->_insert_path( $self->_path, $self->_debug(DEBUG_ADD), $_[0] );
440	7068	100	100			21051	if( not defined $self->{stats_dup} or $dup == $self->{stats_dup} ) {
		50
441	7063					12044	++$self->{stats_add};
442	7063	100				9899	$self->{stats_cooked} += defined($_) ? length($_) : 0 for @{$_[0]};
	7063					32387
443							}
444							elsif( $self->{dup_warn} ) {
445	0	0				0	if( ref $self->{dup_warn} eq 'CODE' ) {
446	0					0	$self->{dup_warn}->($self, $_[0]);
447							}
448							else {
449	0					0	my $pattern = join( '', @{$_[0]} );
	0					0
450	0					0	require Carp;
451	0					0	Carp::carp("duplicate pattern added: /$pattern/");
452							}
453							}
454	7068					18302	$self->{str} = $self->{re} = undef;
455							}
456
457							sub lexstr {
458	2			2	1	10	return shift->_lex(shift);
459							}
460
461							sub pre_filter {
462	3			3	1	843	my $self = shift;
463	3					6	my $pre_filter = shift;
464	3	100	100			26	if( defined $pre_filter and ref($pre_filter) ne 'CODE' ) {
465	1					6	require Carp;
466	1					60	Carp::croak("pre_filter method not passed a coderef");
467							}
468	2					5	$self->{pre_filter} = $pre_filter;
469	2					6	return $self;
470							}
471
472
473							sub filter {
474	4			4	1	382	my $self = shift;
475	4					8	my $filter = shift;
476	4	100	100			24	if( defined $filter and ref($filter) ne 'CODE' ) {
477	1					9	require Carp;
478	1					157	Carp::croak("filter method not passed a coderef");
479							}
480	3					7	$self->{filter} = $filter;
481	3					10	return $self;
482							}
483
484							sub as_string {
485	800			800	1	1889	my $self = shift;
486	800	100				2081	if( not defined $self->{str} ) {
487	798	100				1754	if( $self->{track} ) {
488	8					13	$self->{m} = undef;
489	8					16	$self->{mcount} = 0;
490	8					17	$self->{mlist} = [];
491	8					18	$self->{str} = _re_path_track($self, $self->_path, '', '');
492							}
493							else {
494	790	100	100			4636	$self->_reduce unless ($self->{mutable} or not $self->{reduce});
495	790					1614	my $arg = {@_};
496							$arg->{indent} = $self->{indent}
497	790	100	100			3647	if not exists $arg->{indent} and $self->{indent} > 0;
498	790	100	100			2855	if( exists $arg->{indent} and $arg->{indent} > 0 ) {
		100
499	42					95	$arg->{depth} = 0;
500	42					112	$self->{str} = _re_path_pretty($self, $self->_path, $arg);
501							}
502							elsif( $self->{lookahead} ) {
503	35					100	$self->{str} = _re_path_lookahead($self, $self->_path);
504							}
505							else {
506	713					1540	$self->{str} = _re_path($self, $self->_path);
507							}
508							}
509	798	100				2162	if (not length $self->{str}) {
510							# explicitly fail to match anything if no pattern was generated
511	9					35	$self->{str} = $Always_Fail;
512							}
513							else {
514							my $begin =
515							$self->{anchor_word_begin} ? '\\b'
516							: $self->{anchor_line_begin} ? '^'
517	789	100				2650	: $self->{anchor_string_begin} ? '\A'
		100
		100
518							: ''
519							;
520							my $end =
521							$self->{anchor_word_end} ? '\\b'
522							: $self->{anchor_line_end} ? '$'
523							: $self->{anchor_string_end} ? '\Z'
524	789	100				2658	: $self->{anchor_string_end_absolute} ? '\z'
		100
		100
		100
525							: ''
526							;
527	789					1869	$self->{str} = "$begin$self->{str}$end";
528							}
529	798	100				2328	$self->{path} = [] unless $self->{mutable};
530							}
531	800					4265	return $self->{str};
532							}
533
534							sub re {
535	122			122	1	1381	my $self = shift;
536	122	100				468	$self->_build_re($self->as_string(@_)) unless defined $self->{re};
537	122					591	return $self->{re};
538							}
539
540							use overload '""' => sub {
541	2131			2131		537440	my $self = shift;
542	2131	100				14235	return $self->{re} if $self->{re};
543	489					1217	$self->_build_re($self->as_string());
544	489					5658	return $self->{re};
545	11			11		10252	};
	11					8917
	11					99
546
547							sub _build_re {
548	618			618		1012	my $self = shift;
549	618					966	my $str = shift;
550	618	100				1395	if( $self->{track} ) {
551	11			11		843	use re 'eval';
	11					25
	11					86485
552							$self->{re} = length $self->{flags}
553	8	100				856	? qr/(?$self->{flags}:$str)/
554							: qr/$str/
555							;
556							}
557							else {
558							# how could I not repeat myself?
559							$self->{re} = length $self->{flags}
560	610	100				12142	? qr/(?$self->{flags}:$str)/
561							: qr/$str/
562							;
563							}
564							}
565
566							sub match {
567	29			29	1	5318	my $self = shift;
568	29					49	my $target = shift;
569	29	100				108	$self->_build_re($self->as_string(@_)) unless defined $self->{re};
570	29					71	$self->{m} = undef;
571	29					56	$self->{mvar} = [];
572	29	100				599	if( not $target =~ /$self->{re}/ ) {
573	8					20	$self->{mbegin} = [];
574	8					13	$self->{mend} = [];
575	8					39	return undef;
576							}
577	21	50				65	$self->{m} = $^R if $] >= 5.009005;
578	21					92	$self->{mbegin} = _path_copy([@-]);
579	21					85	$self->{mend} = _path_copy([@+]);
580	21					40	my $n = 0;
581	21					55	for( my $n = 0; $n < @-; ++$n ) {
582	43	100	66			199	push @{$self->{mvar}}, substr($target, $-[$n], $+[$n] - $-[$n])
	33					168
583							if defined $-[$n] and defined $+[$n];
584							}
585	21	100				49	if( $self->{track} ) {
586	20	50				115	return defined $self->{m} ? $self->{mlist}[$self->{m}] : 1;
587							}
588							else {
589	1					6	return 1;
590							}
591							}
592
593							sub source {
594	4			4	1	400	my $self = shift;
595	4	100				17	return unless $self->{track};
596	3	100				11	defined($_[0]) and return $self->{mlist}[$_[0]];
597	2	100				9	return unless defined $self->{m};
598	1					5	return $self->{mlist}[$self->{m}];
599							}
600
601							sub mbegin {
602	3			3	1	10	my $self = shift;
603	3	100				19	return exists $self->{mbegin} ? $self->{mbegin} : [];
604							}
605
606							sub mend {
607	3			3	1	7	my $self = shift;
608	3	100				18	return exists $self->{mend} ? $self->{mend} : [];
609							}
610
611							sub mvar {
612	19			19	1	35	my $self = shift;
613	19	100				54	return undef unless exists $self->{mvar};
614	18	100				99	return defined($_[0]) ? $self->{mvar}[$_[0]] : $self->{mvar};
615							}
616
617							sub capture {
618	5			5	1	14	my $self = shift;
619	5	100				18	if( $self->{mvar} ) {
620	4					6	my @capture = @{$self->{mvar}};
	4					12
621	4					7	shift @capture;
622	4					15	return @capture;
623							}
624	1					4	return ();
625							}
626
627							sub matched {
628	9			9	1	412	my $self = shift;
629	9	100				55	return defined $self->{m} ? $self->{mlist}[$self->{m}] : undef;
630							}
631
632							sub stats_add {
633	2			2	1	9	my $self = shift;
634	2		100			12	return $self->{stats_add} \|\| 0;
635							}
636
637							sub stats_dup {
638	2			2	1	5	my $self = shift;
639	2		100			13	return $self->{stats_dup} \|\| 0;
640							}
641
642							sub stats_raw {
643	2			2	1	3	my $self = shift;
644	2		100			13	return $self->{stats_raw} \|\| 0;
645							}
646
647							sub stats_cooked {
648	2			2	1	4	my $self = shift;
649	2		100			14	return $self->{stats_cooked} \|\| 0;
650							}
651
652							sub stats_length {
653	6			6	1	2669	my $self = shift;
654	6	100	100			44	return (defined $self->{str} and $self->{str} ne $Always_Fail) ? length $self->{str} : 0;
655							}
656
657							sub dup_warn {
658	0			0	1	0	my $self = shift;
659	0	0				0	$self->{dup_warn} = defined($_[0]) ? $_[0] : 1;
660	0					0	return $self;
661							}
662
663							sub anchor_word_begin {
664	5			5	1	9	my $self = shift;
665	5	100				13	$self->{anchor_word_begin} = defined($_[0]) ? $_[0] : 1;
666	5					16	return $self;
667							}
668
669							sub anchor_word_end {
670	4			4	1	4	my $self = shift;
671	4	100				11	$self->{anchor_word_end} = defined($_[0]) ? $_[0] : 1;
672	4					10	return $self;
673							}
674
675							sub anchor_word {
676	2			2	1	4	my $self = shift;
677	2					3	my $state = shift;
678	2					6	$self->anchor_word_begin($state)->anchor_word_end($state);
679	2					6	return $self;
680							}
681
682							sub anchor_line_begin {
683	4			4	1	7	my $self = shift;
684	4	100				12	$self->{anchor_line_begin} = defined($_[0]) ? $_[0] : 1;
685	4					11	return $self;
686							}
687
688							sub anchor_line_end {
689	2			2	1	4	my $self = shift;
690	2	100				4	$self->{anchor_line_end} = defined($_[0]) ? $_[0] : 1;
691	2					4	return $self;
692							}
693
694							sub anchor_line {
695	2			2	1	4	my $self = shift;
696	2					4	my $state = shift;
697	2					4	$self->anchor_line_begin($state)->anchor_line_end($state);
698	2					6	return $self;
699							}
700
701							sub anchor_string_begin {
702	277			277	1	441	my $self = shift;
703	277	100				680	$self->{anchor_string_begin} = defined($_[0]) ? $_[0] : 1;
704	277					750	return $self;
705							}
706
707							sub anchor_string_end {
708	276			276	1	518	my $self = shift;
709	276	100				623	$self->{anchor_string_end} = defined($_[0]) ? $_[0] : 1;
710	276					481	return $self;
711							}
712
713							sub anchor_string_end_absolute {
714	3			3	1	4	my $self = shift;
715	3	100				7	$self->{anchor_string_end_absolute} = defined($_[0]) ? $_[0] : 1;
716	3					6	return $self;
717							}
718
719							sub anchor_string {
720	274			274	1	493	my $self = shift;
721	274	100				638	my $state = defined($_[0]) ? $_[0] : 1;
722	274					706	$self->anchor_string_begin($state)->anchor_string_end($state);
723	274					997	return $self;
724							}
725
726							sub anchor_string_absolute {
727	2			2	1	3	my $self = shift;
728	2	100				7	my $state = defined($_[0]) ? $_[0] : 1;
729	2					6	$self->anchor_string_begin($state)->anchor_string_end_absolute($state);
730	2					7	return $self;
731							}
732
733							sub debug {
734	605			605	1	2687	my $self = shift;
735	605	100				1398	$self->{debug} = defined($_[0]) ? $_[0] : 0;
736	605	100				1183	if ($self->_debug(DEBUG_TIME)) {
737							# hmm, debugging time was switched on after instantiation
738	4					14	$self->_init_time_func;
739	4					18	$self->{_begin_time} = $self->{_time_func}->();
740							}
741	605					1048	return $self;
742							}
743
744							sub dump {
745	9			9	1	1000	return _dump($_[0]->_path);
746							}
747
748							sub chomp {
749	22			22	1	841	my $self = shift;
750	22	100				101	$self->{chomp} = defined($_[0]) ? $_[0] : 1;
751	22					61	return $self;
752							}
753
754							sub fold_meta_pairs {
755	5			5	1	9	my $self = shift;
756	5	100				16	$self->{fold_meta_pairs} = defined($_[0]) ? $_[0] : 1;
757	5					13	return $self;
758							}
759
760							sub indent {
761	4			4	1	755	my $self = shift;
762	4	100				13	$self->{indent} = defined($_[0]) ? $_[0] : 0;
763	4					10	return $self;
764							}
765
766							sub lookahead {
767	22			22	1	47	my $self = shift;
768	22	100				80	$self->{lookahead} = defined($_[0]) ? $_[0] : 1;
769	22					59	return $self;
770							}
771
772							sub flags {
773	24			24	1	1780	my $self = shift;
774	24	100				126	$self->{flags} = defined($_[0]) ? $_[0] : '';
775	24					93	return $self;
776							}
777
778							sub modifiers {
779	4			4	1	1507	my $self = shift;
780	4					10	return $self->flags(@_);
781							}
782
783							sub track {
784	5			5	1	1537	my $self = shift;
785	5	100				15	$self->{track} = defined($_[0]) ? $_[0] : 1;
786	5					12	return $self;
787							}
788
789							sub unroll_plus {
790	2			2	1	859	my $self = shift;
791	2	100				7	$self->{unroll_plus} = defined($_[0]) ? $_[0] : 1;
792	2					8	return $self;
793							}
794
795							sub lex {
796	1			1	1	2	my $self = shift;
797	1					49	$self->{lex} = qr($_[0]);
798	1					5	return $self;
799							}
800
801							sub reduce {
802	19			19	1	1182	my $self = shift;
803	19	100				73	$self->{reduce} = defined($_[0]) ? $_[0] : 1;
804	19					53	return $self;
805							}
806
807							sub mutable {
808	5			5	1	1158	my $self = shift;
809	5	100				26	$self->{mutable} = defined($_[0]) ? $_[0] : 1;
810	5					17	return $self;
811							}
812
813							sub reset {
814							# reinitialise the internal state of the object
815	19			19	1	2371	my $self = shift;
816	19					46	$self->{path} = [];
817	19					58	$self->{re} = undef;
818	19					39	$self->{str} = undef;
819	19					75	return $self;
820							}
821
822							sub Default_Lexer {
823	4	100		4	1	3916	if( $_[0] ) {
824	3	100				13	if( my $refname = ref($_[0]) ) {
825	1					9	require Carp;
826	1					132	Carp::croak("Cannot pass a $refname to Default_Lexer");
827							}
828	2					6	$Current_Lexer = $_[0];
829							}
830	3	100				16	return defined $Current_Lexer ? $Current_Lexer : $Default_Lexer;
831							}
832
833							# --- no user serviceable parts below ---
834
835							# -- debug helpers
836
837							sub _debug {
838	12297			12297		18291	my $self = shift;
839	12297	100				37581	return $self->{debug} & shift() ? 1 : 0;
840							}
841
842							# -- helpers
843
844							sub _path {
845							# access the path
846	10011			10011		25316	return $_[0]->{path};
847							}
848
849							# -- the heart of the matter
850
851							$have_Storable = do {
852							eval {
853							require Storable;
854							import Storable 'dclone';
855							};
856							$@ ? 0 : 1;
857							};
858
859							sub _path_clone {
860	55	100		55		2951	$have_Storable ? dclone($_[0]) : _path_copy($_[0]);
861							}
862
863							sub _path_copy {
864	80			80		695	my $path = shift;
865	80					124	my $new = [];
866	80					188	for( my $p = 0; $p < @$path; ++$p ) {
867	201	100				443	if( ref($path->[$p]) eq 'HASH' ) {
		100
868	9					19	push @$new, _node_copy($path->[$p]);
869							}
870							elsif( ref($path->[$p]) eq 'ARRAY' ) {
871	3					6	push @$new, _path_copy($path->[$p]);
872							}
873							else {
874	189					426	push @$new, $path->[$p];
875							}
876							}
877	80					218	return $new;
878							}
879
880							sub _node_copy {
881	11			11		19	my $node = shift;
882	11					17	my $new = {};
883	11					40	while( my( $k, $v ) = each %$node ) {
884	27	100				70	$new->{$k} = defined($v)
885							? _path_copy($v)
886							: undef
887							;
888							}
889	11					32	return $new;
890							}
891
892							sub _insert_path {
893	7142			7142		10789	my $self = shift;
894	7142					10212	my $list = shift;
895	7142					10382	my $debug = shift;
896	7142					10150	my @in = @{shift()}; # create a new copy
	7142					18998
897	7142	100				17376	if( @$list == 0 ) { # special case the first time
898	2030	100	100			11221	if( @in == 0 or (@in == 1 and (not defined $in[0] or $in[0] eq ''))) {
			100
			100
899	33					124	return [{'' => undef}];
900							}
901							else {
902	1997					5720	return \@in;
903							}
904							}
905	5112	100				10759	$debug and print "# _insert_path @{[_dump(\@in)]} into @{[_dump($list)]}\n";
	151					328
	151					301
906	5112					7655	my $path = $list;
907	5112					7298	my $offset = 0;
908	5112					6948	my $token;
909	5112	100				10877	if( not @in ) {
910	2	100				7	if( ref($list->[0]) ne 'HASH' ) {
911	1					5	return [ { '' => undef, $list->[0] => $list } ];
912							}
913							else {
914	1					2	$list->[0]{''} = undef;
915	1					3	return $list;
916							}
917							}
918	5110					12363	while( defined( $token = shift @in )) {
919	17713	100				36509	if( ref($token) eq 'HASH' ) {
920	282	100				643	$debug and print "# p0=", _dump($path), "\n";
921	282					796	$path = $self->_insert_node( $path, $offset, $token, $debug, @in );
922	282	100				663	$debug and print "# p1=", _dump($path), "\n";
923	282					487	last;
924							}
925	17431	100				37113	if( ref($path->[$offset]) eq 'HASH' ) {
926	3703	100				7474	$debug and print "# at (off=$offset len=@{[scalar @$path]}) ", _dump($path->[$offset]), "\n";
	54					202
927	3703					5669	my $node = $path->[$offset];
928	3703	100				7414	if( exists( $node->{$token} )) {
929	2624	100				5114	if ($offset < $#$path) {
930							my $new = {
931							$token => [$token, @in],
932	1					5	_re_path($self, [$node]) => [@{$path}[$offset..$#$path]],
	1					4
933							};
934	1					4	splice @$path, $offset, @$path-$offset, $new;
935	1					2	last;
936							}
937							else {
938	2623	100				5333	$debug and print "# descend key=$token @{[_dump($node->{$token})]}\n";
	31					71
939	2623					4081	$path = $node->{$token};
940	2623					3732	$offset = 0;
941	2623					4327	redo;
942							}
943							}
944							else {
945	1079	100				2219	$debug and print "# add path ($token:@{[_dump(\@in)]}) into @{[_dump($path)]} at off=$offset to end=@{[scalar $#$path]}\n";
	23					51
	23					48
	23					515
946	1079	100				2286	if( $offset == $#$path ) {
947	1072					3323	$node->{$token} = [ $token, @in ];
948							}
949							else {
950							my $new = {
951							_node_key($token) => [ $token, @in ],
952	7					19	_node_key($node) => [@{$path}[$offset..$#{$path}]],
	7					22
	7					13
953							};
954	7					23	splice( @$path, $offset, @$path - $offset, $new );
955	7	100				20	$debug and print "# fused node=@{[_dump($new)]} path=@{[_dump($path)]}\n";
	1					3
	1					2
956							}
957	1079					2022	last;
958							}
959							}
960
961	13728	100				26428	if( $debug ) {
962	306					456	my $msg = '';
963	306					438	my $n;
964	306					667	for( $n = 0; $n < @$path; ++$n ) {
965	1093	100				2035	$msg .= ' ' if $n;
966							my $atom = ref($path->[$n]) eq 'HASH'
967	1093	100				2018	? '{'.join( ' ', keys(%{$path->[$n]})).'}'
	81					229
968							: $path->[$n]
969							;
970	1093	100				2706	$msg .= $n == $offset ? "<$atom>" : $atom;
971							}
972	306					2857	print "# at path ($msg)\n";
973							}
974
975	13728	100				37709	if( $offset >= @$path ) {
		100
		100
976	732					2553	push @$path, { $token => [ $token, @in ], '' => undef };
977	732	100				1833	$debug and print "# added remaining @{[_dump($path)]}\n";
	21					48
978	732					1306	last;
979							}
980							elsif( $token ne $path->[$offset] ) {
981	2099	100				4634	$debug and print "# token $token not present\n";
982							splice @$path, $offset, @$path-$offset, {
983							length $token
984							? ( _node_key($token) => [$token, @in])
985							: ( '' => undef )
986							,
987	2099	100				6269	$path->[$offset] => [@{$path}[$offset..$#{$path}]],
	2099					8005
	2099					3945
988							};
989	2099	100				5716	$debug and print "# path=@{[_dump($path)]}\n";
	79					159
990	2099					3468	last;
991							}
992							elsif( not @in ) {
993	917	100				2014	$debug and print "# last token to add\n";
994	917	100				1932	if( defined( $path->[$offset+1] )) {
995	912					1300	++$offset;
996	912	100				1841	if( ref($path->[$offset]) eq 'HASH' ) {
997	118	100				262	$debug and print "# add sentinel to node\n";
998	118					248	$path->[$offset]{''} = undef;
999							}
1000							else {
1001	794	100				1835	$debug and print "# convert <$path->[$offset]> to node for sentinel\n";
1002							splice @$path, $offset, @$path-$offset, {
1003							'' => undef,
1004	794					1563	$path->[$offset] => [ @{$path}[$offset..$#{$path}] ],
	794					2736
	794					1429
1005							};
1006							}
1007							}
1008							else {
1009							# already seen this pattern
1010	5					11	++$self->{stats_dup};
1011							}
1012	917					1786	last;
1013							}
1014							# if we get here then @_ still contains a token
1015	9980					22185	++$offset;
1016							}
1017	5110					11620	$list;
1018							}
1019
1020							sub _insert_node {
1021	282			282		484	my $self = shift;
1022	282					420	my $path = shift;
1023	282					442	my $offset = shift;
1024	282					437	my $token = shift;
1025	282					446	my $debug = shift;
1026	282					511	my $path_end = [@{$path}[$offset..$#{$path}]];
	282					656
	282					524
1027							# NB: $path->[$offset] and $[path_end->[0] are equivalent
1028	282					840	my $token_key = _re_path($self, [$token]);
1029	282	100				880	$debug and print "# insert node(@{[_dump($token)]}:@{[_dump(\@_)]}) (key=$token_key)",
	26					58
	26					66
1030	26					51	" at path=@{[_dump($path_end)]}\n";
1031	282	100				784	if( ref($path_end->[0]) eq 'HASH' ) {
1032	195	100				665	if( exists($path_end->[0]{$token_key}) ) {
		100
1033	25	100				66	if( @$path_end > 1 ) {
1034	2					8	my $path_key = _re_path($self, [$path_end->[0]]);
1035	2					11	my $new = {
1036							$path_key => [ @$path_end ],
1037							$token_key => [ $token, @_ ],
1038							};
1039	2	100				8	$debug and print "# +bifurcate new=@{[_dump($new)]}\n";
	1					3
1040	2					8	splice( @$path, $offset, @$path_end, $new );
1041							}
1042							else {
1043	23					44	my $old_path = $path_end->[0]{$token_key};
1044	23					43	my $new_path = [];
1045	23		100			105	while( @$old_path and _node_eq( $old_path->[0], $token )) {
1046	30	100				89	$debug and print "# identical nodes in sub_path ",
		100
1047							ref($token) ? _dump($token) : $token, "\n";
1048	30					72	push @$new_path, shift(@$old_path);
1049	30					96	$token = shift @_;
1050							}
1051	23	50				59	if( @$new_path ) {
1052	23					38	my $new;
1053	23					36	my $token_key = $token;
1054	23	100				93	if( @_ ) {
1055	6					15	$new = {
1056							_re_path($self, $old_path) => $old_path,
1057							$token_key => [$token, @_],
1058							};
1059	6	100				19	$debug and print "# insert_node(bifurc) n=@{[_dump([$new])]}\n";
	1					3
1060							}
1061							else {
1062	17	100				61	$debug and print "# insert $token into old path @{[_dump($old_path)]}\n";
	5					12
1063	17	100				48	if( @$old_path ) {
1064	11					50	$new = ($self->_insert_path( $old_path, $debug, [$token] ))->[0];
1065							}
1066							else {
1067	6					19	$new = { '' => undef, $token => [$token] };
1068							}
1069							}
1070	23					51	push @$new_path, $new;
1071							}
1072	23					49	$path_end->[0]{$token_key} = $new_path;
1073	23	100				53	$debug and print "# +_insert_node result=@{[_dump($path_end)]}\n";
	6					15
1074	23					70	splice( @$path, $offset, @$path_end, @$path_end );
1075							}
1076							}
1077							elsif( not _node_eq( $path_end->[0], $token )) {
1078	70	100				198	if( @$path_end > 1 ) {
1079	11					44	my $path_key = _re_path($self, [$path_end->[0]]);
1080	11					70	my $new = {
1081							$path_key => [ @$path_end ],
1082							$token_key => [ $token, @_ ],
1083							};
1084	11	100				40	$debug and print "# path->node1 at $path_key/$token_key @{[_dump($new)]}\n";
	1					4
1085	11					39	splice( @$path, $offset, @$path_end, $new );
1086							}
1087							else {
1088	59	100				188	$debug and print "# next in path is node, trivial insert at $token_key\n";
1089	59					208	$path_end->[0]{$token_key} = [$token, @_];
1090	59					162	splice( @$path, $offset, @$path_end, @$path_end );
1091							}
1092							}
1093							else {
1094	100		100			403	while( @$path_end and _node_eq( $path_end->[0], $token )) {
1095	131	100				345	$debug and print "# identical nodes @{[_dump([$token])]}\n";
	9					24
1096	131					233	shift @$path_end;
1097	131					292	$token = shift @_;
1098	131					396	++$offset;
1099							}
1100	100	100				241	if( @$path_end ) {
1101	57	100				151	$debug and print "# insert at $offset $token:@{[_dump(\@_)]} into @{[_dump($path_end)]}\n";
	4					11
	4					9
1102	57					262	$path_end = $self->_insert_path( $path_end, $debug, [$token, @_] );
1103	57	100				166	$debug and print "# got off=$offset s=@{[scalar @_]} path_add=@{[_dump($path_end)]}\n";
	4					11
	4					14
1104	57					166	splice( @$path, $offset, @$path - $offset, @$path_end );
1105	57	100				156	$debug and print "# got final=@{[_dump($path)]}\n";
	4					8
1106							}
1107							else {
1108	43					106	$token_key = _node_key($token);
1109	43					164	my $new = {
1110							'' => undef,
1111							$token_key => [ $token, @_ ],
1112							};
1113	43	100				117	$debug and print "# convert opt @{[_dump($new)]}\n";
	3					8
1114	43					134	push @$path, $new;
1115							}
1116							}
1117							}
1118							else {
1119	87	100				210	if( @$path_end ) {
1120	74					338	my $new = {
1121							$path_end->[0] => [ @$path_end ],
1122							$token_key => [ $token, @_ ],
1123							};
1124	74	100				195	$debug and print "# atom->node @{[_dump($new)]}\n";
	5					12
1125	74					222	splice( @$path, $offset, @$path_end, $new );
1126	74	100				200	$debug and print "# out=@{[_dump($path)]}\n";
	5					11
1127							}
1128							else {
1129	13	100				34	$debug and print "# add opt @{[_dump([$token,@_])]} via $token_key\n";
	4					13
1130	13					70	push @$path, {
1131							'' => undef,
1132							$token_key => [ $token, @_ ],
1133							};
1134							}
1135							}
1136	282					668	$path;
1137							}
1138
1139							sub _reduce {
1140	810			810		1331	my $self = shift;
1141	810					1828	my $context = { debug => $self->_debug(DEBUG_TAIL), depth => 0 };
1142
1143	810	100				1806	if ($self->_debug(DEBUG_TIME)) {
1144	4					11	$self->_init_time_func;
1145	4					12	my $now = $self->{_time_func}->();
1146	4	100				10	if (exists $self->{_begin_time}) {
1147	3					89	printf "# load=%0.6f\n", $now - $self->{_begin_time};
1148							}
1149							else {
1150	1					37	printf "# load-epoch=%0.6f\n", $now;
1151							}
1152	4					19	$self->{_begin_time} = $self->{_time_func}->();
1153							}
1154
1155	810					1899	my ($head, $tail) = _reduce_path( $self->_path, $context );
1156	810	100				2063	$context->{debug} and print "# final head=", _dump($head), ' tail=', _dump($tail), "\n";
1157	810	100				1704	if( !@$head ) {
1158	651					1233	$self->{path} = $tail;
1159							}
1160							else {
1161							$self->{path} = [
1162	159					379	@{_unrev_path( $tail, $context )},
1163	159					244	@{_unrev_path( $head, $context )},
	159					320
1164							];
1165							}
1166
1167	810	100				2116	if ($self->_debug(DEBUG_TIME)) {
1168	4					12	my $now = $self->{_time_func}->();
1169	4	50				15	if (exists $self->{_begin_time}) {
1170	4					95	printf "# reduce=%0.6f\n", $now - $self->{_begin_time};
1171							}
1172							else {
1173	0					0	printf "# reduce-epoch=%0.6f\n", $now;
1174							}
1175	4					20	$self->{_begin_time} = $self->{_time_func}->();
1176							}
1177
1178	810	100				1913	$context->{debug} and print "# final path=", _dump($self->{path}), "\n";
1179	810					1865	return $self;
1180							}
1181
1182							sub _remove_optional {
1183	1710	100		1710		4133	if( exists $_[0]->{''} ) {
1184	350					668	delete $_[0]->{''};
1185	350					672	return 1;
1186							}
1187	1360					2283	return 0;
1188							}
1189
1190							sub _reduce_path {
1191	810			810		1730	my ($path, $ctx) = @_;
1192	810					2029	my $indent = ' ' x $ctx->{depth};
1193	810					1492	my $debug = $ctx->{debug};
1194	810	100				1880	$debug and print "#$indent _reduce_path $ctx->{depth} ", _dump($path), "\n";
1195	810					1208	my $new;
1196	810					1381	my $head = [];
1197	810					1326	my $tail = [];
1198	810					2418	while( defined( my $p = pop @$path )) {
1199	1159	100				2628	if( ref($p) eq 'HASH' ) {
1200	514					1189	my ($node_head, $node_tail) = _reduce_node($p, _descend($ctx) );
1201	514	100				1613	$debug and print "#$indent\| head=", _dump($node_head), " tail=", _dump($node_tail), "\n";
1202	514	100				1429	push @$head, @$node_head if scalar @$node_head;
1203	514	100				3526	push @$tail, ref($node_tail) eq 'HASH' ? $node_tail : @$node_tail;
1204							}
1205							else {
1206	645	100				1332	if( @$head ) {
1207	125	100				305	$debug and print "#$indent\| push $p leaves @{[_dump($path)]}\n";
	7					15
1208	125					412	push @$tail, $p;
1209							}
1210							else {
1211	520	100				1102	$debug and print "#$indent\| unshift $p\n";
1212	520					1769	unshift @$tail, $p;
1213							}
1214							}
1215							}
1216	20					86	$debug and print "#$indent\| tail nr=@{[scalar @$tail]} t0=", ref($tail->[0]),
1217	810	100				1877	(ref($tail->[0]) eq 'HASH' ? " n=" . scalar(keys %{$tail->[0]}) : '' ),
	18	100				77
1218							"\n";
1219	810	100	100			2652	if( @$tail > 1
			100
1220							and ref($tail->[0]) eq 'HASH'
1221	97					435	and keys %{$tail->[0]} == 2
1222							) {
1223	72					149	my $opt;
1224							my $fixed;
1225	72					126	while( my ($key, $path) = each %{$tail->[0]} ) {
	216					660
1226	144	100				371	$debug and print "#$indent\| scan k=$key p=@{[_dump($path)]}\n";
	14					28
1227	144	100				361	next unless $path;
1228	123	100	100			437	if (@$path == 1 and ref($path->[0]) eq 'HASH') {
1229	8					22	$opt = $path->[0];
1230							}
1231							else {
1232	115					268	$fixed = $path;
1233							}
1234							}
1235	72	100				223	if( exists $tail->[0]{''} ) {
1236	21					46	my $path = [@{$tail}[1..$#{$tail}]];
	21					61
	21					46
1237	21					56	$tail = $tail->[0];
1238	21					73	($head, $tail, $path) = _slide_tail( $head, $tail, $path, _descend($ctx) );
1239	21					75	$tail = [$tail, @$path];
1240							}
1241							}
1242	810	100				1863	$debug and print "#$indent _reduce_path $ctx->{depth} out head=", _dump($head), ' tail=', _dump($tail), "\n";
1243	810					1811	return ($head, $tail);
1244							}
1245
1246							sub _reduce_node {
1247	1080			1080		2262	my ($node, $ctx) = @_;
1248	1080					2174	my $indent = ' ' x $ctx->{depth};
1249	1080					1710	my $debug = $ctx->{debug};
1250	1080					2057	my $optional = _remove_optional($node);
1251	1080	100				2499	$debug and print "#$indent _reduce_node $ctx->{depth} in @{[_dump($node)]} opt=$optional\n";
	57					120
1252	1080	100	100			3034	if( $optional and scalar keys %$node == 1 ) {
1253	70					142	my $path = (values %$node)[0];
1254	70	100				142	if( not grep { ref($_) eq 'HASH' } @$path ) {
	117					344
1255							# if we have removed an optional, and there is only one path
1256							# left then there is nothing left to compare. Because of the
1257							# optional it cannot participate in any further reductions.
1258							# (unless we test for equality among sub-trees).
1259	61					160	my $result = {
1260							'' => undef,
1261							$path->[0] => $path
1262							};
1263	61	100				159	$debug and print "#$indent\| fast fail @{[_dump($result)]}\n";
	2					7
1264	61					164	return [], $result;
1265							}
1266							}
1267
1268	1019					1903	my( $fail, $reduce ) = _scan_node( $node, _descend($ctx) );
1269
1270	1019	100				2767	$debug and print "#$indent\|_scan_node done opt=$optional reduce=@{[_dump($reduce)]} fail=@{[_dump($fail)]}\n";
	55					112
	55					113
1271
1272							# We now perform tail reduction on each of the nodes in the reduce
1273							# hash. If we have only one key, we know we will have a successful
1274							# reduction (since everything that was inserted into the node based
1275							# on the value of the last token of each path all mapped to the same
1276							# value).
1277
1278	1019	100	100			5758	if( @$fail == 0 and keys %$reduce == 1 and not $optional) {
			100
1279							# every path shares a common path
1280	506					1058	my $path = (values %$reduce)[0];
1281	506					1065	my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1282	506	100				1453	$debug and print "#$indent\|_reduce_node $ctx->{depth} common=@{[_dump($common)]} tail=", _dump($tail), "\n";
	50					110
1283	506					2281	return( $common, $tail );
1284							}
1285
1286							# this node resulted in a list of paths, game over
1287	513					1115	$ctx->{indent} = $indent;
1288	513					1067	return _reduce_fail( $reduce, $fail, $optional, _descend($ctx) );
1289							}
1290
1291							sub _reduce_fail {
1292	513			513		1066	my( $reduce, $fail, $optional, $ctx ) = @_;
1293	513					922	my( $debug, $depth, $indent ) = @{$ctx}{qw(debug depth indent)};
	513					1130
1294	513					877	my %result;
1295	513	100				1218	$result{''} = undef if $optional;
1296	513					737	my $p;
1297	513					1259	for $p (keys %$reduce) {
1298	1181					2054	my $path = $reduce->{$p};
1299	1181	100				2294	if( scalar @$path == 1 ) {
1300	1114					1833	$path = $path->[0];
1301	1114	100				2464	$debug and print "#$indent\| -simple opt=$optional unrev @{[_dump($path)]}\n";
	7					15
1302	1114					2066	$path = _unrev_path($path, _descend($ctx) );
1303	1114					2684	$result{_node_key($path->[0])} = $path;
1304							}
1305							else {
1306	67	100				167	$debug and print "#$indent\| _do_reduce(@{[_dump($path)]})\n";
	1					3
1307	67					159	my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1308							$path = [
1309							(
1310							ref($tail) eq 'HASH'
1311							? _unrev_node($tail, _descend($ctx) )
1312							: _unrev_path($tail, _descend($ctx) )
1313							),
1314	67	100				310	@{_unrev_path($common, _descend($ctx) )}
	67					165
1315							];
1316	67	100				248	$debug and print "#$indent\| +reduced @{[_dump($path)]}\n";
	1					4
1317	67					156	$result{_node_key($path->[0])} = $path;
1318							}
1319							}
1320	513					878	my $f;
1321	513					983	for $f( @$fail ) {
1322	219	100				441	$debug and print "#$indent\| +fail @{[_dump($f)]}\n";
	3					8
1323	219					424	$result{$f->[0]} = $f;
1324							}
1325	513	100				1187	$debug and print "#$indent _reduce_fail $depth fail=@{[_dump(\%result)]}\n";
	5					32
1326	513					2246	return ( [], \%result );
1327							}
1328
1329							sub _scan_node {
1330	1019			1019		1991	my( $node, $ctx ) = @_;
1331	1019					2115	my $indent = ' ' x $ctx->{depth};
1332	1019					1681	my $debug = $ctx->{debug};
1333
1334							# For all the paths in the node, reverse them. If the first token
1335							# of the path is a scalar, push it onto an array in a hash keyed by
1336							# the value of the scalar.
1337							#
1338							# If it is a node, call _reduce_node on this node beforehand. If we
1339							# get back a common head, all of the paths in the subnode shared a
1340							# common tail. We then store the common part and the remaining node
1341							# of paths (which is where the paths diverged from the end and install
1342							# this into the same hash. At this point both the common and the tail
1343							# are in reverse order, just as simple scalar paths are.
1344							#
1345							# On the other hand, if there were no common path returned then all
1346							# the paths of the sub-node diverge at the end character. In this
1347							# case the tail cannot participate in any further reductions and will
1348							# appear in forward order.
1349							#
1350							# certainly the hurgliest function in the whole file :(
1351
1352							# $debug = 1 if $depth >= 8;
1353	1019					2433	my @fail;
1354							my %reduce;
1355
1356	1019					0	my $n;
1357	1019					2586	for $n(
1358	2743					7088	map { substr($_, index($_, '#')+1) }
1359							sort
1360							map {
1361							join( '\|' =>
1362	9029					17887	scalar(grep {ref($_) eq 'HASH'} @{$node->{$_}}),
	2743					4951
1363							_node_offset($node->{$_}),
1364	2743					4145	scalar @{$node->{$_}},
	2743					10212
1365							)
1366							. "#$_"
1367							}
1368							keys %$node ) {
1369	2743					4189	my( $end, @path ) = reverse @{$node->{$n}};
	2743					7030
1370	2743	100				5715	if( ref($end) ne 'HASH' ) {
1371	2137	100				4408	$debug and print "# $indent\|_scan_node push reduce ($end:@{[_dump(\@path)]})\n";
	87					182
1372	2137					3079	push @{$reduce{$end}}, [ $end, @path ];
	2137					7679
1373							}
1374							else {
1375	606	100				1385	$debug and print "# $indent\|_scan_node head=", _dump(\@path), ' tail=', _dump($end), "\n";
1376	606					959	my $new_path;
1377							# deal with sing, singing => s(?:ing)?ing
1378	606	100	100			2633	if( keys %$end == 2 and exists $end->{''} ) {
1379	94					231	my ($key, $opt_path) = each %$end;
1380	94	100				303	($key, $opt_path) = each %$end if $key eq '';
1381	94					151	$opt_path = [reverse @{$opt_path}];
	94					228
1382	94	100				237	$debug and print "# $indent\| check=", _dump($opt_path), "\n";
1383	94					289	my $end = { '' => undef, $opt_path->[0] => [@$opt_path] };
1384	94					178	my $head = [];
1385	94					205	my $path = [@path];
1386	94					250	($head, my $slide, $path) = _slide_tail( $head, $end, $path, $ctx );
1387	94	100				334	if( @$head ) {
1388	40					171	$new_path = [ @$head, $slide, @$path ];
1389							}
1390							}
1391	606	100				1209	if( $new_path ) {
1392	40	100				95	$debug and print "# $indent\|_scan_node slid=", _dump($new_path), "\n";
1393	40					63	push @{$reduce{$new_path->[0]}}, $new_path;
	40					128
1394							}
1395							else {
1396	566					1168	my( $common, $tail ) = _reduce_node( $end, _descend($ctx) );
1397	566	100				1575	if( not @$common ) {
1398	219	100				552	$debug and print "# $indent\| +failed $n\n";
1399	219					703	push @fail, [reverse(@path), $tail];
1400							}
1401							else {
1402	347					792	my $path = [@path];
1403	347	100				835	$debug and print "# $indent\|_scan_node ++recovered common=@{[_dump($common)]} tail=",
	34					69
1404	34					65	_dump($tail), " path=@{[_dump($path)]}\n";
1405	347	100	100			1720	if( ref($tail) eq 'HASH'
1406							and keys %$tail == 2
1407							) {
1408	287	100				846	if( exists $tail->{''} ) {
1409	121					289	($common, $tail, $path) = _slide_tail( $common, $tail, $path, $ctx );
1410							}
1411							}
1412	347	100				596	push @{$reduce{$common->[0]}}, [
	347					1916
1413							@$common,
1414							(ref($tail) eq 'HASH' ? $tail : @$tail ),
1415							@$path
1416							];
1417							}
1418							}
1419							}
1420							}
1421	1019	100				2825	$debug and print
1422	55					182	"# $indent\|_scan_node counts: reduce=@{[scalar keys %reduce]} fail=@{[scalar @fail]}\n";
	55					209
1423	1019					2640	return( \@fail, \%reduce );
1424							}
1425
1426							sub _do_reduce {
1427	573			573		1110	my ($path, $ctx) = @_;
1428	573					1225	my $indent = ' ' x $ctx->{depth};
1429	573					896	my $debug = $ctx->{debug};
1430	573					1499	my $ra = Regexp::Assemble->new(chomp=>0);
1431	573					1603	$ra->debug($debug);
1432	573	100				1302	$debug and print "# $indent\| do @{[_dump($path)]}\n";
	51					100
1433	573					1821	$ra->_insertr( $_ ) for
1434							# When nodes come into the picture, we have to be careful
1435							# about how we insert the paths into the assembly.
1436							# Paths with nodes first, then closest node to front
1437							# then shortest path. Merely because if we can control
1438							# order in which paths containing nodes get inserted,
1439							# then we can make a couple of assumptions that simplify
1440							# the code in _insert_node.
1441							sort {
1442	6000					10572	scalar(grep {ref($_) eq 'HASH'} @$a)
1443	1113	50	100			2054	<=> scalar(grep {ref($_) eq 'HASH'} @$b)
	6578					12440
1444							\|\|
1445							_node_offset($b) <=> _node_offset($a)
1446							\|\|
1447							scalar @$a <=> scalar @$b
1448							}
1449							@$path
1450							;
1451	573					1328	$path = $ra->_path;
1452	573					1012	my $common = [];
1453	573					3127	push @$common, shift @$path while( ref($path->[0]) ne 'HASH' );
1454	573	100				1430	my $tail = scalar( @$path ) > 1 ? [@$path] : $path->[0];
1455	573	100				1263	$debug and print "# $indent\| _do_reduce common=@{[_dump($common)]} tail=@{[_dump($tail)]}\n";
	51					104
	51					101
1456	573					2717	return ($common, $tail);
1457							}
1458
1459							sub _node_offset {
1460							# return the offset that the first node is found, or -ve
1461							# optimised for speed
1462	4422			4422		6385	my $nr = @{$_[0]};
	4422					6891
1463	4422					6401	my $atom = -1;
1464	4422		100			31167	ref($_[0]->[$atom]) eq 'HASH' and return $atom while ++$atom < $nr;
1465	3458					7646	return -1;
1466							}
1467
1468							sub _slide_tail {
1469	240			240		20123	my $head = shift;
1470	240					354	my $tail = shift;
1471	240					361	my $path = shift;
1472	240					354	my $ctx = shift;
1473	240					515	my $indent = ' ' x $ctx->{depth};
1474	240					394	my $debug = $ctx->{debug};
1475	240	100				574	$debug and print "# $indent\| slide in h=", _dump($head),
1476							' t=', _dump($tail), ' p=', _dump($path), "\n";
1477	240					553	my $slide_path = (each %$tail)[-1];
1478	240	100				623	$slide_path = (each %$tail)[-1] unless defined $slide_path;
1479	240	100				560	$debug and print "# $indent\| slide potential ", _dump($slide_path), " over ", _dump($path), "\n";
1480	240		100			1108	while( defined $path->[0] and $path->[0] eq $slide_path->[0] ) {
1481	154	100				429	$debug and print "# $indent\| slide=tail=$slide_path->[0]\n";
1482	154					271	my $slide = shift @$path;
1483	154					238	shift @$slide_path;
1484	154					327	push @$slide_path, $slide;
1485	154					595	push @$head, $slide;
1486							}
1487	240	100				585	$debug and print "# $indent\| slide path ", _dump($slide_path), "\n";
1488	240					590	my $slide_node = {
1489							'' => undef,
1490							_node_key($slide_path->[0]) => $slide_path,
1491							};
1492	240	100				624	$debug and print "# $indent\| slide out h=", _dump($head),
1493							' s=', _dump($slide_node), ' p=', _dump($path), "\n";
1494	240					652	return ($head, $slide_node, $path);
1495							}
1496
1497							sub _unrev_path {
1498	2690			2690		4844	my ($path, $ctx) = @_;
1499	2690					5097	my $indent = ' ' x $ctx->{depth};
1500	2690					4107	my $debug = $ctx->{debug};
1501	2690					3622	my $new;
1502	2690	100				4347	if( not grep { ref($_) } @$path ) {
	6383					13257
1503	2196	100				4657	$debug and print "# ${indent}_unrev path fast ", _dump($path);
1504	2196					5847	$new = [reverse @$path];
1505	2196	100				4667	$debug and print "# -> ", _dump($new), "\n";
1506	2196					4551	return $new;
1507							}
1508	494	100				1144	$debug and print "# ${indent}unrev path in ", _dump($path), "\n";
1509	494					1285	while( defined( my $p = pop @$path )) {
1510	1483	100				4801	push @$new,
		100
1511							ref($p) eq 'HASH' ? _unrev_node($p, _descend($ctx) )
1512							: ref($p) eq 'ARRAY' ? _unrev_path($p, _descend($ctx) )
1513							: $p
1514							;
1515							}
1516	494	100				1101	$debug and print "# ${indent}unrev path out ", _dump($new), "\n";
1517	494					975	return $new;
1518							}
1519
1520							sub _unrev_node {
1521	630			630		1236	my ($node, $ctx ) = @_;
1522	630					1227	my $indent = ' ' x $ctx->{depth};
1523	630					960	my $debug = $ctx->{debug};
1524	630					1159	my $optional = _remove_optional($node);
1525	630	100				1423	$debug and print "# ${indent}unrev node in ", _dump($node), " opt=$optional\n";
1526	630					916	my $new;
1527	630	100				1446	$new->{''} = undef if $optional;
1528	630					905	my $n;
1529	630					1409	for $n( keys %$node ) {
1530	1167					2323	my $path = _unrev_path($node->{$n}, _descend($ctx) );
1531	1167					2794	$new->{_node_key($path->[0])} = $path;
1532							}
1533	630	100				1506	$debug and print "# ${indent}unrev node out ", _dump($new), "\n";
1534	630					2422	return $new;
1535							}
1536
1537							sub _node_key {
1538	4653			4653		7893	my $node = shift;
1539	4653	100				9977	return _node_key($node->[0]) if ref($node) eq 'ARRAY';
1540	4642	100				15860	return $node unless ref($node) eq 'HASH';
1541	205					335	my $key = '';
1542	205					301	my $k;
1543	205					500	for $k( keys %$node ) {
1544	452	100				988	next if $k eq '';
1545	374	100	100			1318	$key = $k if $key eq '' or $key gt $k;
1546							}
1547	205					645	return $key;
1548							}
1549
1550							sub _descend {
1551							# Take a context object, and increase the depth by one.
1552							# By creating a fresh hash each time, we don't have to
1553							# bother adding make-work code to decrease the depth
1554							# when we return from what we called.
1555	6190			6190		9293	my $ctx = shift;
1556	6190					23621	return {%$ctx, depth => $ctx->{depth}+1};
1557							}
1558
1559							#####################################################################
1560
1561							sub _make_class {
1562	650			650		1089	my $self = shift;
1563	650					1171	my %set = map { ($_,1) } @_;
	1675					3854
1564	650	100				1813	delete $set{'\\d'} if exists $set{'\\w'};
1565	650	100				1450	delete $set{'\\D'} if exists $set{'\\W'};
1566							return '.' if exists $set{'.'}
1567							or ($self->{fold_meta_pairs} and (
1568							(exists $set{'\\d'} and exists $set{'\\D'})
1569							or (exists $set{'\\s'} and exists $set{'\\S'})
1570	650	100	100			4895	or (exists $set{'\\w'} and exists $set{'\\W'})
			100
			100
1571							))
1572							;
1573	632					1184	for my $meta( q/\\d/, q/\\D/, q/\\s/, q/\\S/, q/\\w/, q/\\W/ ) {
1574	3792	100				7692	if( exists $set{$meta} ) {
1575	28					175	my $re = qr/$meta/;
1576	28					47	my @delete;
1577	28		66			308	$_ =~ /^$re$/ and push @delete, $_ for keys %set;
1578	28	100				124	delete @set{@delete} if @delete;
1579							}
1580							}
1581	632	100				1451	return (keys %set)[0] if keys %set == 1;
1582	625					1117	for my $meta( '.', '+', '*', '?', '(', ')', '^', '@', '$', '[', '/', ) {
1583	6875	100				14335	exists $set{"\\$meta"} and $set{$meta} = delete $set{"\\$meta"};
1584							}
1585	625	100				1386	my $dash = exists $set{'-'} ? do { delete($set{'-'}), '-' } : '';
	20					53
1586	625	100				1259	my $caret = exists $set{'^'} ? do { delete($set{'^'}), '^' } : '';
	7					25
1587	625					2160	my $class = join( '' => sort keys %set );
1588	625	100	100			2051	$class =~ s/0123456789/\\d/ and $class eq '\\d' and return $class;
1589	622					3782	return "[$dash$class$caret]";
1590							}
1591
1592							sub _re_sort {
1593	998		100	998		4593	return length $b <=> length $a \|\| $a cmp $b
1594							}
1595
1596							sub _combine {
1597	140			140		211	my $self = shift;
1598	140					225	my $type = shift;
1599							# print "c in = @{[_dump(\@_)]}\n";
1600							# my $combine =
1601							return '('
1602							. $type
1603	140					241	. do {
1604	140					221	my( @short, @long );
1605	140	100				260	push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
	377					1953
1606	140	100				408	if( @short == 1 ) {
		100
1607	31					88	@long = sort _re_sort @long, @short;
1608							}
1609							elsif( @short > 1 ) {
1610							# yucky but true
1611	77					179	my @combine = (_make_class($self, @short), sort _re_sort @long);
1612	77					188	@long = @combine;
1613							}
1614							else {
1615	32					101	@long = sort _re_sort @long;
1616							}
1617	140					469	join( '\|', @long );
1618							}
1619							. ')';
1620							# print "combine <$combine>\n";
1621							# $combine;
1622							}
1623
1624							sub _combine_new {
1625	1738			1738		2919	my $self = shift;
1626	1738					2648	my( @short, @long );
1627	1738	100				3133	push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
	3244					17480
1628	1738	100	100			8318	if( @short == 1 and @long == 0 ) {
		100	100
1629	365					1782	return $short[0];
1630							}
1631							elsif( @short > 1 and @short == @_ ) {
1632	494					1106	return _make_class($self, @short);
1633							}
1634							else {
1635	879	100				4983	return '(?:'
1636							. join( '\|' =>
1637							@short > 1
1638							? ( _make_class($self, @short), sort _re_sort @long)
1639							: ( (sort _re_sort( @long )), @short )
1640							)
1641							. ')';
1642							}
1643							}
1644
1645							sub _re_path {
1646	4772			4772		7336	my $self = shift;
1647							# in shorter assemblies, _re_path() is the second hottest
1648							# routine. after insert(), so make it fast.
1649
1650	4772	100				10506	if ($self->{unroll_plus}) {
1651							# but we can't easily make this blockless
1652	72					113	my @arr = @{$_[0]};
	72					163
1653	72					110	my $str = '';
1654	72					99	my $skip = 0;
1655	72					147	for my $i (0..$#arr) {
1656	127	100	100			1162	if (ref($arr[$i]) eq 'ARRAY') {
		100
		100
		100
1657	1					4	$str .= _re_path($self, $arr[$i]);
1658							}
1659							elsif (ref($arr[$i]) eq 'HASH') {
1660							$str .= exists $arr[$i]->{''}
1661							? _combine_new( $self,
1662	7					22	map { _re_path( $self, $arr[$i]->{$_} ) } grep { $_ ne '' } keys %{$arr[$i]}
	14					39
	7					22
1663							) . '?'
1664	28	100				62	: _combine_new($self, map { _re_path( $self, $arr[$i]->{$_} ) } keys %{$arr[$i]})
	42					107
	21					55
1665							;
1666							}
1667							elsif ($i < $#arr and $arr[$i+1] =~ /\A$arr[$i]\*(\??)\Z/) {
1668	7	50				34	$str .= "$arr[$i]+" . (defined $1 ? $1 : '');
1669	7					16	++$skip;
1670							}
1671							elsif ($skip) {
1672	7					19	$skip = 0;
1673							}
1674							else {
1675	84					182	$str .= $arr[$i];
1676							}
1677							}
1678	72					232	return $str;
1679							}
1680
1681	4700	50				7743	return join( '', @_ ) unless grep { length ref $_ } @_;
	4700					11983
1682	4700					6609	my $p;
1683							return join '', map {
1684							ref($_) eq '' ? $_
1685	9506	100				25949	: ref($_) eq 'HASH' ? do {
		100
1686							# In the case of a node, see whether there's a '' which
1687							# indicates that the whole thing is optional and thus
1688							# requires a trailing ?
1689							# Unroll the two different paths to avoid the needless
1690							# grep when it isn't necessary.
1691	1710					2601	$p = $_;
1692							exists $_->{''}
1693							? _combine_new( $self,
1694	801					1790	map { _re_path( $self, $p->{$_} ) } grep { $_ ne '' } keys %$_
	1508					3071
1695							) . '?'
1696	1710	100				5033	: _combine_new($self, map { _re_path( $self, $p->{$_} ) } keys %$_ )
	2394					5124
1697							}
1698							: _re_path($self, $_) # ref($_) eq 'ARRAY'
1699	4700					6570	} @{$_[0]}
	4700					7834
1700							}
1701
1702							sub _lookahead {
1703	132			132		256	my $in = shift;
1704	132					223	my %head;
1705							my $path;
1706	132					282	for $path( keys %$in ) {
1707	328	100				716	next unless defined $in->{$path};
1708							# print "look $path: ", ref($in->{$path}[0]), ".\n";
1709	267	100				659	if( ref($in->{$path}[0]) eq 'HASH' ) {
		100
1710	15					27	my $next = 0;
1711	15		100			52	while( ref($in->{$path}[$next]) eq 'HASH' and @{$in->{$path}} > $next + 1 ) {
	16					71
1712	11	100				30	if( exists $in->{$path}[$next]{''} ) {
1713	5					15	++$head{$in->{$path}[$next+1]};
1714							}
1715	11					33	++$next;
1716							}
1717	15					50	my $inner = _lookahead( $in->{$path}[0] );
1718	15					66	@head{ keys %$inner } = (values %$inner);
1719							}
1720							elsif( ref($in->{$path}[0]) eq 'ARRAY' ) {
1721	2					6	my $subpath = $in->{$path}[0];
1722	2					9	for( my $sp = 0; $sp < @$subpath; ++$sp ) {
1723	3	100				12	if( ref($subpath->[$sp]) eq 'HASH' ) {
1724	2					8	my $follow = _lookahead( $subpath->[$sp] );
1725	2					12	@head{ keys %$follow } = (values %$follow);
1726	2	100				12	last unless exists $subpath->[$sp]{''};
1727							}
1728							else {
1729	1					4	++$head{$subpath->[$sp]};
1730	1					4	last;
1731							}
1732							}
1733							}
1734							else {
1735	250					522	++$head{ $in->{$path}[0] };
1736							}
1737							}
1738							# print "_lookahead ", _dump($in), '==>', _dump([keys %head]), "\n";
1739	132					301	return \%head;
1740							}
1741
1742							sub _re_path_lookahead {
1743	265			265		400	my $self = shift;
1744	265					380	my $in = shift;
1745							# print "_re_path_la in ", _dump($in), "\n";
1746	265					395	my $out = '';
1747	265					605	for( my $p = 0; $p < @$in; ++$p ) {
1748	573	100				13685	if( ref($in->[$p]) eq '' ) {
		100
1749	462					715	$out .= $in->[$p];
1750	462					1011	next;
1751							}
1752							elsif( ref($in->[$p]) eq 'ARRAY' ) {
1753	2					10	$out .= _re_path_lookahead($self, $in->[$p]);
1754	2					8	next;
1755							}
1756							# print "$p ", _dump($in->[$p]), "\n";
1757							my $path = [
1758	228					527	map { _re_path_lookahead($self, $in->[$p]{$_} ) }
1759	275					550	grep { $_ ne '' }
1760	109					180	keys %{$in->[$p]}
	109					327
1761							];
1762	109					297	my $ahead = _lookahead($in->[$p]);
1763	109					168	my $more = 0;
1764	109	100	100			418	if( exists $in->[$p]{''} and $p + 1 < @$in ) {
1765	12					23	my $next = 1;
1766	12					31	while( $p + $next < @$in ) {
1767	14	100				43	if( ref( $in->[$p+$next] ) eq 'HASH' ) {
1768	2					8	my $follow = _lookahead( $in->[$p+$next] );
1769	2					5	@{$ahead}{ keys %$follow } = (values %$follow);
	2					7
1770							}
1771							else {
1772	12					29	++$ahead->{$in->[$p+$next]};
1773	12					20	last;
1774							}
1775	2					6	++$next;
1776							}
1777	12					23	$more = 1;
1778							}
1779	109					220	my $nr_one = grep { /^$Single_Char$/ } @$path;
	228					1188
1780	109					194	my $nr = @$path;
1781	109	100	100			383	if( $nr_one > 1 and $nr_one == $nr ) {
1782	18					48	$out .= _make_class($self, @$path);
1783	18	100				95	$out .= '?' if exists $in->[$p]{''};
1784							}
1785							else {
1786							my $zwla = keys(%$ahead) > 1
1787	91	100				280	? _combine($self, '?=', grep { s/\+$//; $_ } keys %$ahead )
	191					312
	191					365
1788							: '';
1789	91	100				262	my $patt = $nr > 1 ? _combine($self, '?:', @$path ) : $path->[0];
1790							# print "have nr=$nr n1=$nr_one n=", _dump($in->[$p]), ' a=', _dump([keys %$ahead]), " zwla=$zwla patt=$patt @{[_dump($path)]}\n";
1791	91	100				220	if( exists $in->[$p]{''} ) {
1792	44	100				220	$out .= $more ? "$zwla(?:$patt)?" : "(?:$zwla$patt)?";
1793							}
1794							else {
1795	47					211	$out .= "$zwla$patt";
1796							}
1797							}
1798							}
1799	265					674	return $out;
1800							}
1801
1802							sub _re_path_track {
1803	33			33		48	my $self = shift;
1804	33					45	my $in = shift;
1805	33					50	my $normal = shift;
1806	33					43	my $augmented = shift;
1807	33					43	my $o;
1808	33					51	my $simple = '';
1809	33					44	my $augment = '';
1810	33					109	for( my $n = 0; $n < @$in; ++$n ) {
1811	114	100				216	if( ref($in->[$n]) eq '' ) {
1812	104					149	$o = $in->[$n];
1813	104					140	$simple .= $o;
1814	104					165	$augment .= $o;
1815	104	100	100			627	if( (
			100
			100
1816							$n < @$in - 1
1817							and ref($in->[$n+1]) eq 'HASH' and exists $in->[$n+1]{''}
1818							)
1819							or $n == @$in - 1
1820							) {
1821	24					36	push @{$self->{mlist}}, $normal . $simple ;
	24					55
1822	24	50				62	$augment .= $] < 5.009005
1823							? "(?{\$self->{m}=$self->{mcount}})"
1824							: "(?{$self->{mcount}})"
1825							;
1826	24					61	++$self->{mcount};
1827							}
1828							}
1829							else {
1830							my $path = [
1831	25					80	map { $self->_re_path_track( $in->[$n]{$_}, $normal.$simple , $augmented.$augment ) }
1832	26					56	grep { $_ ne '' }
1833	10					17	keys %{$in->[$n]}
	10					30
1834							];
1835	10					48	$o = '(?:' . join( '\|' => sort _re_sort @$path ) . ')';
1836	10	100				26	$o .= '?' if exists $in->[$n]{''};
1837	10					20	$simple .= $o;
1838	10					30	$augment .= $o;
1839							}
1840							}
1841	33					107	return $augment;
1842							}
1843
1844							sub _re_path_pretty {
1845	411			411		638	my $self = shift;
1846	411					603	my $in = shift;
1847	411					610	my $arg = shift;
1848	411					882	my $pre = ' ' x (($arg->{depth}+0) * $arg->{indent});
1849	411					800	my $indent = ' ' x (($arg->{depth}+1) * $arg->{indent});
1850	411					615	my $out = '';
1851	411					628	$arg->{depth}++;
1852	411					597	my $prev_was_paren = 0;
1853	411					1114	for( my $p = 0; $p < @$in; ++$p ) {
1854	1084	100				2262	if( ref($in->[$p]) eq '' ) {
		100
1855	910	100				1808	$out .= "\n$pre" if $prev_was_paren;
1856	910					1402	$out .= $in->[$p];
1857	910					2266	$prev_was_paren = 0;
1858							}
1859							elsif( ref($in->[$p]) eq 'ARRAY' ) {
1860	3					11	$out .= _re_path($self, $in->[$p]);
1861							}
1862							else {
1863							my $path = [
1864	369					932	map { _re_path_pretty($self, $in->[$p]{$_}, $arg ) }
1865	419					864	grep { $_ ne '' }
1866	171					271	keys %{$in->[$p]}
	171					461
1867							];
1868	171					382	my $nr = @$path;
1869	171					278	my( @short, @long );
1870	171	100				332	push @{/^$Single_Char$/ ? \@short : \@long}, $_ for @$path;
	369					2162
1871	171	100				419	if( @short == $nr ) {
1872	37	100				118	$out .= $nr == 1 ? $path->[0] : _make_class($self, @short);
1873	37	100				174	$out .= '?' if exists $in->[$p]{''};
1874							}
1875							else {
1876	134	100				355	$out .= "\n" if length $out;
1877	134	100				318	$out .= $pre if $p;
1878	134					291	$out .= "(?:\n$indent";
1879	134	100				286	if( @short < 2 ) {
1880	133					234	my $r = 0;
1881							$out .= join( "\n$indent\|" => map {
1882	133	100				481	$r++ and $_ =~ s/^\(\?:/\n$indent(?:/;
	298					732
1883	298					734	$_
1884							}
1885							sort _re_sort @$path
1886							);
1887							}
1888							else {
1889	1					5	$out .= join( "\n$indent\|" => ( (sort _re_sort @long), _make_class($self, @short) ));
1890							}
1891	134					350	$out .= "\n$pre)";
1892	134	100				311	if( exists $in->[$p]{''} ) {
1893	37					86	$out .= "\n$pre?";
1894	37					134	$prev_was_paren = 0;
1895							}
1896							else {
1897	97					333	$prev_was_paren = 1;
1898							}
1899							}
1900							}
1901							}
1902	411					662	$arg->{depth}--;
1903	411					1100	return $out;
1904							}
1905
1906							sub _node_eq {
1907	425	100	100	425		1832	return 0 if not defined $_[0] or not defined $_[1];
1908	422	100				1251	return 0 if ref $_[0] ne ref $_[1];
1909							# Now that we have determined that the reference of each
1910							# argument are the same, we only have to test the first
1911							# one, which gives us a nice micro-optimisation.
1912	381	100				865	if( ref($_[0]) eq 'HASH' ) {
		100
1913	305					676	keys %{$_[0]} == keys %{$_[1]}
	305					950
1914							and
1915							# does this short-circuit to avoid _re_path() cost more than it saves?
1916	305	100	100			453	join( '\|' => sort keys %{$_[0]}) eq join( '\|' => sort keys %{$_[1]})
	272					870
	272					1518
1917							and
1918							_re_path(undef, [$_[0]] ) eq _re_path(undef, [$_[1]] );
1919							}
1920							elsif( ref($_[0]) eq 'ARRAY' ) {
1921	9	100				12	scalar @{$_[0]} == scalar @{$_[1]}
	9					16
	9					32
1922							and
1923							_re_path(undef, $_[0]) eq _re_path(undef, $_[1]);
1924							}
1925							else {
1926	67					251	$_[0] eq $_[1];
1927							}
1928							}
1929
1930							sub _pretty_dump {
1931	7			7		25	return sprintf "\\x%02x", ord(shift);
1932							}
1933
1934							sub _dump {
1935	5579			5579		8986	my $path = shift;
1936	5579	100				11914	return _dump_node($path) if ref($path) eq 'HASH';
1937	5057					7330	my $dump = '[';
1938	5057					6645	my $d;
1939	5057					6892	my $nr = 0;
1940	5057					7822	for $d( @$path ) {
1941	11036	100				21262	$dump .= ' ' if $nr++;
1942	11036	100				24932	if( ref($d) eq 'HASH' ) {
		100
		100
1943	1340					2502	$dump .= _dump_node($d);
1944							}
1945							elsif( ref($d) eq 'ARRAY' ) {
1946	242					465	$dump .= _dump($d);
1947							}
1948							elsif( defined $d ) {
1949							# D::C indicates the second test is redundant
1950							# $dump .= ( $d =~ /\s/ or not length $d )
1951	9453	100				25879	$dump .= (
		100
1952							$d =~ /\s/ ? qq{'$d'} :
1953							$d =~ /^[\x00-\x1f]$/ ? _pretty_dump($d) :
1954							$d
1955							);
1956							}
1957							else {
1958	1					3	$dump .= '*';
1959							}
1960							}
1961	5057					25902	return $dump . ']';
1962							}
1963
1964							sub _dump_node {
1965	1862			1862		2807	my $node = shift;
1966	1862					2629	my $dump = '{';
1967	1862					2506	my $nr = 0;
1968	1862					2486	my $n;
1969	1862					4770	for $n (sort keys %$node) {
1970	3899	100				8141	$dump .= ' ' if $nr++;
1971							# Devel::Cover shows this to test to be redundant
1972							# $dump .= ( $n eq '' and not defined $node->{$n} )
1973							$dump .= $n eq ''
1974							? '*'
1975							: ($n =~ /^[\x00-\x1f]$/ ? _pretty_dump($n) : $n)
1976	3899	100				11559	. "=>" . _dump($node->{$n})
		100
1977							;
1978							}
1979	1862					9728	return $dump . '}';
1980							}
1981
1982							=pod
1983
1984							=head1 NAME
1985
1986							Regexp::Assemble - Assemble multiple Regular Expressions into a single RE
1987
1988							=head1 SYNOPSIS
1989
1990							use Regexp::Assemble;
1991
1992							my $ra = Regexp::Assemble->new;
1993							$ra->add( 'ab+c' );
1994							$ra->add( 'ab+-' );
1995							$ra->add( 'a\w\d+' );
1996							$ra->add( 'a\d+' );
1997							print $ra->re; # prints a(?:\w?\d+\|b+[-c])
1998
1999							=head1 DESCRIPTION
2000
2001							Regexp::Assemble takes an arbitrary number of regular expressions
2002							and assembles them into a single regular expression (or RE) that
2003							matches all that the individual REs match.
2004
2005							As a result, instead of having a large list of expressions to loop
2006							over, a target string only needs to be tested against one expression.
2007							This is interesting when you have several thousand patterns to deal
2008							with. Serious effort is made to produce the smallest pattern possible.
2009
2010							It is also possible to track the original patterns, so that you can
2011							determine which, among the source patterns that form the assembled
2012							pattern, was the one that caused the match to occur.
2013
2014							You should realise that large numbers of alternations are processed
2015							in perl's regular expression engine in O(n) time, not O(1). If you
2016							are still having performance problems, you should look at using a
2017							trie. Note that Perl's own regular expression engine will implement
2018							trie optimisations in perl 5.10 (they are already available in
2019							perl 5.9.3 if you want to try them out). C will
2020							do the right thing when it knows it's running on a trie'd perl.
2021							(At least in some version after this one).
2022
2023							Some more examples of usage appear in the accompanying README. If
2024							that file is not easy to access locally, you can find it on a web
2025							repository such as
2026							L or
2027							L.
2028
2029							See also L.
2030
2031							=head1 Methods
2032
2033							=head2 add(LIST)
2034
2035							Takes a string, breaks it apart into a set of tokens (respecting
2036							meta characters) and inserts the resulting list into the C
2037							object. It uses a naive regular expression to lex the string
2038							that may be fooled complex expressions (specifically, it will
2039							fail to lex nested parenthetical expressions such as
2040							C correctly). If this is the case, the end of
2041							the string will not be tokenised correctly and returned as one
2042							long string.
2043
2044							On the one hand, this may indicate that the patterns you are
2045							trying to feed the C object are too complex. Simpler
2046							patterns might allow the algorithm to work more effectively and
2047							perform more reductions in the resulting pattern.
2048
2049							On the other hand, you can supply your own pattern to perform the
2050							lexing if you need. The test suite contains an example of a lexer
2051							pattern that will match one level of nested parentheses.
2052
2053							Note that there is an internal optimisation that will bypass a
2054							much of the lexing process. If a string contains no C<\>
2055							(backslash), C<[> (open square bracket), C<(> (open paren),
2056							C (question mark), C<+> (plus), C<*> (star) or C<{> (open
2057							curly), a character split will be performed directly.
2058
2059							A list of strings may be supplied, thus you can pass it a file
2060							handle of a file opened for reading:
2061
2062							$re->add( '\d+-\d+-\d+-\d+\.example\.com' );
2063							$re->add( );
2064
2065							If the file is very large, it may be more efficient to use a
2066							C loop, to read the file line-by-line:
2067
2068							$re->add($_) while ;
2069
2070							The C method will chomp the lines automatically. If you
2071							do not want this to occur (you want to keep the record
2072							separator), then disable Cing.
2073
2074							$re->chomp(0);
2075							$re->add($_) while ;
2076
2077							This method is chainable.
2078
2079							=head2 add_file(FILENAME [...])
2080
2081							Takes a list of file names. Each file is opened and read
2082							line by line. Each line is added to the assembly.
2083
2084							$r->add_file( 'file.1', 'file.2' );
2085
2086							If a file cannot be opened, the method will croak. If you cannot
2087							afford to let this happen then you should wrap the call in a C
2088							block.
2089
2090							Chomping happens automatically unless you the C method
2091							to disable it. By default, input lines are read according to the
2092							value of the C attribute (if defined), and
2093							will otherwise fall back to the current setting of the system C<$/>
2094							variable. The record separator may also be specified on each
2095							call to C. Internally, the routine Cises the
2096							value of C<$/> to whatever is required, for the duration of the
2097							call.
2098
2099							An alternate calling mechanism using a hash reference is
2100							available. The recognised keys are:
2101
2102							=over 4
2103
2104							=item file
2105
2106							Reference to a list of file names, or the name of a single
2107							file.
2108
2109							$r->add_file({file => ['file.1', 'file.2', 'file.3']});
2110							$r->add_file({file => 'file.n'});
2111
2112							=item input_record_separator
2113
2114							If present, indicates what constitutes a line
2115
2116							$r->add_file({file => 'data.txt', input_record_separator => ':' });
2117
2118							=item rs
2119
2120							An alias for input_record_separator (mnemonic: same as the
2121							English variable names).
2122
2123							=back
2124
2125							$r->add_file( {
2126							file => [ 'pattern.txt', 'more.txt' ],
2127							input_record_separator => "\r\n",
2128							});
2129
2130							=head2 clone()
2131
2132							Clones the contents of a Regexp::Assemble object and creates a new
2133							object (in other words it performs a deep copy).
2134
2135							If the Storable module is installed, its dclone method will be used,
2136							otherwise the cloning will be performed using a pure perl approach.
2137
2138							You can use this method to take a snapshot of the patterns that have
2139							been added so far to an object, and generate an assembly from the
2140							clone. Additional patterns may to be added to the original object
2141							afterwards.
2142
2143							my $re = $main->clone->re();
2144							$main->add( 'another-pattern-\\d+' );
2145
2146							=head2 insert(LIST)
2147
2148							Takes a list of tokens representing a regular expression and
2149							stores them in the object. Note: you should not pass it a bare
2150							regular expression, such as C. You must pass it as
2151							a list of tokens, I C<('a', 'b+', 'c?', 'd*', 'e')>.
2152
2153							This method is chainable, I:
2154
2155							my $ra = Regexp::Assemble->new
2156							->insert( qw[ a b+ c? d* e ] )
2157							->insert( qw[ a c+ d+ e* f ] );
2158
2159							Lexing complex patterns with metacharacters and so on can consume
2160							a significant proportion of the overall time to build an assembly.
2161							If you have the information available in a tokenised form, calling
2162							C directly can be a big win.
2163
2164							=head2 lexstr
2165
2166							Use the C method if you are curious to see how a pattern
2167							gets tokenised. It takes a scalar on input, representing a pattern,
2168							and returns a reference to an array, containing the tokenised
2169							pattern. You can recover the original pattern by performing a
2170							C:
2171
2172							my @token = $re->lexstr($pattern);
2173							my $new_pattern = join( '', @token );
2174
2175							If the original pattern contains unnecessary backslashes, or C<\x4b>
2176							escapes, or quotemeta escapes (C<\Q>...C<\E>) the resulting pattern
2177							may not be identical.
2178
2179							Call C does not add the pattern to the object, it is merely
2180							for exploratory purposes. It will, however, update various statistical
2181							counters.
2182
2183							=head2 pre_filter(CODE)
2184
2185							Allows you to install a callback to check that the pattern being
2186							loaded contains valid input. It receives the pattern as a whole to
2187							be added, before it been tokenised by the lexer. It may to return
2188							0 or C to indicate that the pattern should not be added, any
2189							true value indicates that the contents are fine.
2190
2191							A filter to strip out trailing comments (marked by #):
2192
2193							$re->pre_filter( sub { $_[0] =~ s/\s#.$//; 1 } );
2194
2195							A filter to ignore blank lines:
2196
2197							$re->pre_filter( sub { length(shift) } );
2198
2199							If you want to remove the filter, pass C as a parameter.
2200
2201							$ra->pre_filter(undef);
2202
2203							This method is chainable.
2204
2205							=head2 filter(CODE)
2206
2207							Allows you to install a callback to check that the pattern being
2208							loaded contains valid input. It receives a list on input, after it
2209							has been tokenised by the lexer. It may to return 0 or undef to
2210							indicate that the pattern should not be added, any true value
2211							indicates that the contents are fine.
2212
2213							If you know that all patterns you expect to assemble contain
2214							a restricted set of of tokens (e.g. no spaces), you could do
2215							the following:
2216
2217							$ra->filter(sub { not grep { / / } @_ });
2218
2219							or
2220
2221							sub only_spaces_and_digits {
2222							not grep { ![\d ] } @_
2223							}
2224							$ra->filter( \&only_spaces_and_digits );
2225
2226							These two examples will silently ignore faulty patterns, If you
2227							want the user to be made aware of the problem you should raise an
2228							error (via C or C), log an error message, whatever is
2229							best. If you want to remove a filter, pass C as a parameter.
2230
2231							$ra->filter(undef);
2232
2233							This method is chainable.
2234
2235							=head2 as_string
2236
2237							Assemble the expression and return it as a string. You may want to do
2238							this if you are writing the pattern to a file. The following arguments
2239							can be passed to control the aspect of the resulting pattern:
2240
2241							B, the number of spaces used to indent nested grouping of
2242							a pattern. Use this to produce a pretty-printed pattern (for some
2243							definition of "pretty"). The resulting output is rather verbose. The
2244							reason is to ensure that the metacharacters C<(?:> and C<)> always
2245							occur on otherwise empty lines. This allows you grep the result for an
2246							even more synthetic view of the pattern:
2247
2248							egrep -v '^ *[()]'
2249
2250							The result of the above is quite readable. Remember to backslash the
2251							spaces appearing in your own patterns if you wish to use an indented
2252							pattern in an C construct. Indenting is ignored if tracking
2253							is enabled.
2254
2255							The B argument takes precedence over the C
2256							method/attribute of the object.
2257
2258							Calling this
2259							method will drain the internal data structure. Large numbers of patterns
2260							can eat a significant amount of memory, and this lets perl recover the
2261							memory used for other purposes.
2262
2263							If you want to reduce the pattern I continue to add new patterns,
2264							clone the object and reduce the clone, leaving the original object intact.
2265
2266							=head2 re
2267
2268							Assembles the pattern and return it as a compiled RE, using the
2269							C operator.
2270
2271							As with C, calling this method will reset the internal data
2272							structures to free the memory used in assembling the RE.
2273
2274							The B attribute, documented in the C method, can be
2275							used here (it will be ignored if tracking is enabled).
2276
2277							With method chaining, it is possible to produce a RE without having
2278							a temporary C object lying around, I:
2279
2280							my $re = Regexp::Assemble->new
2281							->add( q[ab+cd+e] )
2282							->add( q[ac\\d+e] )
2283							->add( q[c\\d+e] )
2284							->re;
2285
2286							The C<$re> variable now contains a Regexp object that can be used
2287							directly:
2288
2289							while( <> ) {
2290							/$re/ and print "Something in [$_] matched\n";
2291							)
2292
2293							The C method is called when the object is used in string context
2294							(hence, within an C operator), so by and large you do not even
2295							need to save the RE in a separate variable. The following will work
2296							as expected:
2297
2298							my $re = Regexp::Assemble->new->add( qw[ fee fie foe fum ] );
2299							while( ) {
2300							if( /($re)/ ) {
2301							print "Here be giants: $1\n";
2302							}
2303							}
2304
2305							This approach does not work with tracked patterns. The
2306							C and C methods must be used instead, see below.
2307
2308							=head2 match(SCALAR)
2309
2310							The following information applies to Perl 5.8 and below. See
2311							the section that follows for information on Perl 5.10.
2312
2313							If pattern tracking is in use, you must C in order
2314							to make things work correctly. At a minimum, this will make your
2315							code look like this:
2316
2317							my $did_match = do { use re 'eval'; $target =~ /$ra/ }
2318							if( $did_match ) {
2319							print "matched ", $ra->matched, "\n";
2320							}
2321
2322							(The main reason is that the C<$^R> variable is currently broken
2323							and an ugly workaround that runs some Perl code during the match
2324							is required, in order to simulate what C<$^R> should be doing. See
2325							Perl bug #32840 for more information if you are curious. The README
2326							also contains more information). This bug has been fixed in 5.10.
2327
2328							The important thing to note is that with C, THERE
2329							ARE SECURITY IMPLICATIONS WHICH YOU IGNORE AT YOUR PERIL. The problem
2330							is this: if you do not have strict control over the patterns being
2331							fed to C when tracking is enabled, and someone
2332							slips you a pattern such as C and you
2333							attempt to match a string against the resulting pattern, you will
2334							know Fear and Loathing.
2335
2336							What is more, the C<$^R> workaround means that that tracking does
2337							not work if you perform a bare C pattern match as shown
2338							above. You have to instead call the C method, in order to
2339							supply the necessary context to take care of the tracking housekeeping
2340							details.
2341
2342							if( defined( my $match = $ra->match($_)) ) {
2343							print " $_ matched by $match\n";
2344							}
2345
2346							In the case of a successful match, the original matched pattern
2347							is returned directly. The matched pattern will also be available
2348							through the C method.
2349
2350							(Except that the above is not true for 5.6.0: the C method
2351							returns true or undef, and the C method always returns
2352							undef).
2353
2354							If you are capturing parts of the pattern I C
2355							you will want to get at the captures. See the C, C,
2356							C and C methods. If you are not using captures
2357							then you may safely ignore this section.
2358
2359							In 5.10, since the bug concerning C<$^R> has been resolved, there
2360							is no need to use C and the assembled pattern does
2361							not require any Perl code to be executed during the match.
2362
2363							=head2 new()
2364
2365							Creates a new C object. The following optional
2366							key/value parameters may be employed. All keys have a corresponding
2367							method that can be used to change the behaviour later on. As a
2368							general rule, especially if you're just starting out, you don't
2369							have to bother with any of these.
2370
2371							B, a family of optional attributes that allow anchors
2372							(C<^>, C<\b>, C<\Z>...) to be added to the resulting pattern.
2373
2374							B, sets the C flags to add to the assembled regular
2375							expression. Warning: no error checking is done, you should ensure
2376							that the flags you pass are understood by the version of Perl you
2377							are using. B exists as an alias, for users familiar
2378							with L.
2379
2380							B, controls whether the pattern should be chomped before being
2381							lexed. Handy if you are reading patterns from a file. By default,
2382							Cing is performed (this behaviour changed as of version 0.24,
2383							prior versions did not chomp automatically).
2384							See also the C attribute and the C method.
2385
2386							B, slurp the contents of the specified file and add them
2387							to the assembly. Multiple files may be processed by using a list.
2388
2389							my $r = Regexp::Assemble->new(file => 're.list');
2390
2391							my $r = Regexp::Assemble->new(file => ['re.1', 're.2']);
2392
2393							If you really don't want chomping to occur, you will have to set
2394							the C attribute to 0 (zero). You may also want to look at
2395							the C attribute, as well.
2396
2397							B, controls what constitutes a record
2398							separator when using the C attribute or the C
2399							method. May be abbreviated to B. See the C<$/> variable in
2400							L.
2401
2402							B, controls whether the pattern should contain zero-width
2403							lookahead assertions (For instance: (?=[abc])(?:bob\|alice\|charles).
2404							This is not activated by default, because in many circumstances the
2405							cost of processing the assertion itself outweighs the benefit of
2406							its faculty for short-circuiting a match that will fail. This is
2407							sensitive to the probability of a match succeeding, so if you're
2408							worried about performance you'll have to benchmark a sample population
2409							of targets to see which way the benefits lie.
2410
2411							B, controls whether you want know which of the initial
2412							patterns was the one that matched. See the C method for
2413							more details. Note for version 5.8 of Perl and below, in this mode
2414							of operation YOU SHOULD BE AWARE OF THE SECURITY IMPLICATIONS that
2415							this entails. Perl 5.10 does not suffer from any such restriction.
2416
2417							B, the number of spaces used to indent nested grouping of
2418							a pattern. Use this to produce a pretty-printed pattern. See the
2419							C method for a more detailed explanation.
2420
2421							B, allows you to add a callback to enable sanity checks
2422							on the pattern being loaded. This callback is triggered before the
2423							pattern is split apart by the lexer. In other words, it operates
2424							on the entire pattern. If you are loading patterns from a file,
2425							this would be an appropriate place to remove comments.
2426
2427							B, allows you to add a callback to enable sanity checks on
2428							the pattern being loaded. This callback is triggered after the
2429							pattern has been split apart by the lexer.
2430
2431							B, controls whether to unroll, for example, C into
2432							C, C, which may allow additional reductions in the
2433							resulting assembled pattern.
2434
2435							B, controls whether tail reduction occurs or not. If set,
2436							patterns like C will be reduced to C.
2437							That is, the end of the pattern in each part of the b... and d...
2438							alternations is identical, and hence is hoisted out of the alternation
2439							and placed after it. On by default. Turn it off if you're really
2440							pressed for short assembly times.
2441
2442							B, specifies the pattern used to lex the input lines into
2443							tokens. You could replace the default pattern by a more sophisticated
2444							version that matches arbitrarily nested parentheses, for example.
2445
2446							B, controls whether copious amounts of output is produced
2447							during the loading stage or the reducing stage of assembly.
2448
2449							my $ra = Regexp::Assemble->new;
2450							my $rb = Regexp::Assemble->new( chomp => 1, debug => 3 );
2451
2452							B, controls whether new patterns can be added to the object
2453							after the assembled pattern is generated. DEPRECATED.
2454
2455							This method/attribute will be removed in a future release. It doesn't
2456							really serve any purpose, and may be more effectively replaced by
2457							cloning an existing C object and spinning out a
2458							pattern from that instead.
2459
2460							=head2 source()
2461
2462							When using tracked mode, after a successful match is made, returns
2463							the original source pattern that caused the match. In Perl 5.10,
2464							the C<$^R> variable can be used to as an index to fetch the correct
2465							pattern from the object.
2466
2467							If no successful match has been performed, or the object is not in
2468							tracked mode, this method returns C.
2469
2470							my $r = Regexp::Assemble->new->track(1)->add(qw(foo? bar{2} [Rr]at));
2471
2472							for my $w (qw(this food is rather barren)) {
2473							if ($w =~ /$r/) {
2474							print "$w matched by ", $r->source($^R), $/;
2475							}
2476							else {
2477							print "$w no match\n";
2478							}
2479							}
2480
2481							=head2 mbegin()
2482
2483							This method returns a copy of C<@-> at the moment of the
2484							last match. You should ordinarily not need to bother with
2485							this, C should be able to supply all your needs.
2486
2487							=head2 mend()
2488
2489							This method returns a copy of C<@+> at the moment of the
2490							last match.
2491
2492							=head2 mvar(NUMBER)
2493
2494							The C method returns the captures of the last match.
2495							C corresponds to $1, C to $2, and so on.
2496							C happens to return the target string matched,
2497							as a byproduct of walking down the C<@-> and C<@+> arrays
2498							after the match.
2499
2500							If called without a parameter, C will return a
2501							reference to an array containing all captures.
2502
2503							=head2 capture
2504
2505							The C method returns the the captures of the last
2506							match as an array. Unlink C, this method does not
2507							include the matched string. It is equivalent to getting an
2508							array back that contains C<$1, $2, $3, ...>.
2509
2510							If no captures were found in the match, an empty array is
2511							returned, rather than C. You are therefore guaranteed
2512							to be able to use C<< for my $c ($re->capture) { ... >>
2513							without have to check whether anything was captured.
2514
2515							=head2 matched()
2516
2517							If pattern tracking has been set, via the C attribute,
2518							or through the C method, this method will return the
2519							original pattern of the last successful match. Returns undef
2520							match has yet been performed, or tracking has not been enabled.
2521
2522							See below in the NOTES section for additional subtleties of
2523							which you should be aware of when tracking patterns.
2524
2525							Note that this method is not available in 5.6.0, due to
2526							limitations in the implementation of C<(?{...})> at the time.
2527
2528							=head2 Statistics/Reporting routines
2529
2530							=head2 stats_add
2531
2532							Returns the number of patterns added to the assembly (whether
2533							by C or C). Duplicate patterns are not included
2534							in this total.
2535
2536							=head2 stats_dup
2537
2538							Returns the number of duplicate patterns added to the assembly.
2539							If non-zero, this may be a sign that something is wrong with
2540							your data (or at the least, some needless redundancy). This may
2541							occur when you have two patterns (for instance, C and
2542							C) which map to the same result.
2543
2544							=head2 stats_raw()
2545
2546							Returns the raw number of bytes in the patterns added to the
2547							assembly. This includes both original and duplicate patterns.
2548							For instance, adding the two patterns C and C will
2549							count as 4 bytes.
2550
2551							=head2 stats_cooked()
2552
2553							Return the true number of bytes added to the assembly. This
2554							will not include duplicate patterns. Furthermore, it may differ
2555							from the raw bytes due to quotemeta treatment. For instance,
2556							C will count as 7 (not 8) bytes, because C<\,> will
2557							be stored as C<,>. Also, C<\Qa.b\E> is 7 bytes long, however,
2558							after the quotemeta directives are processed, C will be
2559							stored, for a total of 4 bytes.
2560
2561							=head2 stats_length()
2562
2563							Returns the length of the resulting assembled expression.
2564							Until C or C have been called, the length
2565							will be 0 (since the assembly will have not yet been
2566							performed). The length includes only the pattern, not the
2567							additional (C<(?-xism...>) fluff added by the compilation.
2568
2569							=head2 dup_warn(NUMBER\|CODEREF)
2570
2571							Turns warnings about duplicate patterns on or off. By
2572							default, no warnings are emitted. If the method is
2573							called with no parameters, or a true parameter,
2574							the object will carp about patterns it has
2575							already seen. To turn off the warnings, use 0 as a
2576							parameter.
2577
2578							$r->dup_warn();
2579
2580							The method may also be passed a code block. In this case
2581							the code will be executed and it will receive a reference
2582							to the object in question, and the lexed pattern.
2583
2584							$r->dup_warn(
2585							sub {
2586							my $self = shift;
2587							print $self->stats_add, " patterns added at line $.\n",
2588							join( '', @_ ), " added previously\n";
2589							}
2590							)
2591
2592							=head2 Anchor routines
2593
2594							Suppose you wish to assemble a series of patterns that all begin
2595							with C<^> and end with C<$> (anchor pattern to the beginning and
2596							end of line). Rather than add the anchors to each and every pattern
2597							(and possibly forget to do so when a new entry is added), you may
2598							specify the anchors in the object, and they will appear in the
2599							resulting pattern, and you no longer need to (or should) put them
2600							in your source patterns. For example, the two following snippets
2601							will produce identical patterns:
2602
2603							$r->add(qw(^this ^that ^them))->as_string;
2604
2605							$r->add(qw(this that them))->anchor_line_begin->as_string;
2606
2607							# both techniques will produce ^th(?:at\|em\|is)
2608
2609							All anchors are possible word (C<\b>) boundaries, line
2610							boundaries (C<^> and C<$>) and string boundaries (C<\A>
2611							and C<\Z> (or C<\z> if you absolutely need it)).
2612
2613							The shortcut C> implies both
2614							C_begin> C_end>
2615							is also available. If different anchors are specified
2616							the most specific anchor wins. For instance, if both
2617							C and C are
2618							specified, C takes precedence.
2619
2620							All the anchor methods are chainable.
2621
2622							=head2 anchor_word_begin
2623
2624							The resulting pattern will be prefixed with a C<\b>
2625							word boundary assertion when the value is true. Set
2626							to 0 to disable.
2627
2628							$r->add('pre')->anchor_word_begin->as_string;
2629							# produces '\bpre'
2630
2631							=head2 anchor_word_end
2632
2633							The resulting pattern will be suffixed with a C<\b>
2634							word boundary assertion when the value is true. Set
2635							to 0 to disable.
2636
2637							$r->add(qw(ing tion))
2638							->anchor_word_end
2639							->as_string; # produces '(?:tion\|ing)\b'
2640
2641							=head2 anchor_word
2642
2643							The resulting pattern will be have C<\b>
2644							word boundary assertions at the beginning and end
2645							of the pattern when the value is true. Set
2646							to 0 to disable.
2647
2648							$r->add(qw(cat carrot)
2649							->anchor_word(1)
2650							->as_string; # produces '\bca(?:rro)t\b'
2651
2652							=head2 anchor_line_begin
2653
2654							The resulting pattern will be prefixed with a C<^>
2655							line boundary assertion when the value is true. Set
2656							to 0 to disable.
2657
2658							$r->anchor_line_begin;
2659							# or
2660							$r->anchor_line_begin(1);
2661
2662							=head2 anchor_line_end
2663
2664							The resulting pattern will be suffixed with a C<$>
2665							line boundary assertion when the value is true. Set
2666							to 0 to disable.
2667
2668							# turn it off
2669							$r->anchor_line_end(0);
2670
2671							=head2 anchor_line
2672
2673							The resulting pattern will be have the C<^> and C<$>
2674							line boundary assertions at the beginning and end
2675							of the pattern, respectively, when the value is true. Set
2676							to 0 to disable.
2677
2678							$r->add(qw(cat carrot)
2679							->anchor_line
2680							->as_string; # produces '^ca(?:rro)t$'
2681
2682							=head2 anchor_string_begin
2683
2684							The resulting pattern will be prefixed with a C<\A>
2685							string boundary assertion when the value is true. Set
2686							to 0 to disable.
2687
2688							$r->anchor_string_begin(1);
2689
2690							=head2 anchor_string_end
2691
2692							The resulting pattern will be suffixed with a C<\Z>
2693							string boundary assertion when the value is true. Set
2694							to 0 to disable.
2695
2696							# disable the string boundary end anchor
2697							$r->anchor_string_end(0);
2698
2699							=head2 anchor_string_end_absolute
2700
2701							The resulting pattern will be suffixed with a C<\z>
2702							string boundary assertion when the value is true. Set
2703							to 0 to disable.
2704
2705							# disable the string boundary absolute end anchor
2706							$r->anchor_string_end_absolute(0);
2707
2708							If you don't understand the difference between
2709							C<\Z> and C<\z>, the former will probably do what
2710							you want.
2711
2712							=head2 anchor_string
2713
2714							The resulting pattern will be have the C<\A> and C<\Z>
2715							string boundary assertions at the beginning and end
2716							of the pattern, respectively, when the value is true. Set
2717							to 0 to disable.
2718
2719							$r->add(qw(cat carrot)
2720							->anchor_string
2721							->as_string; # produces '\Aca(?:rro)t\Z'
2722
2723							=head2 anchor_string_absolute
2724
2725							The resulting pattern will be have the C<\A> and C<\z>
2726							string boundary assertions at the beginning and end
2727							of the pattern, respectively, when the value is true. Set
2728							to 0 to disable.
2729
2730							$r->add(qw(cat carrot)
2731							->anchor_string_absolute
2732							->as_string; # produces '\Aca(?:rro)t\z'
2733
2734							=head2 debug(NUMBER)
2735
2736							Turns debugging on or off. Statements are printed
2737							to the currently selected file handle (STDOUT by default).
2738							If you are already using this handle, you will have to
2739							arrange to select an output handle to a file of your own
2740							choosing, before call the C, C or C)
2741							functions, otherwise it will scribble all over your
2742							carefully formatted output.
2743
2744							=over 4
2745
2746							=item * 0
2747
2748							Off. Turns off all debugging output.
2749
2750							=item * 1
2751
2752							Add. Trace the addition of patterns.
2753
2754							=item * 2
2755
2756							Reduce. Trace the process of reduction and assembly.
2757
2758							=item * 4
2759
2760							Lex. Trace the lexing of the input patterns into its constituent
2761							tokens.
2762
2763							=item * 8
2764
2765							Time. Print to STDOUT the time taken to load all the patterns. This is
2766							nothing more than the difference between the time the object was
2767							instantiated and the time reduction was initiated.
2768
2769							# load=
2770
2771							Any lengthy computation performed in the client code will be reflected
2772							in this value. Another line will be printed after reduction is
2773							complete.
2774
2775							# reduce=
2776
2777							The above output lines will be changed to C and
2778							C if the internal state of the object is corrupted
2779							and the initial timestamp is lost.
2780
2781							The code attempts to load L in order to report fractional
2782							seconds. If this is not successful, the elapsed time is displayed
2783							in whole seconds.
2784
2785							=back
2786
2787							Values can be added (or or'ed together) to trace everything
2788
2789							$r->debug(7)->add( '\\d+abc' );
2790
2791							Calling C with no arguments turns debugging off.
2792
2793							=head2 dump()
2794
2795							Produces a synthetic view of the internal data structure. How
2796							to interpret the results is left as an exercise to the reader.
2797
2798							print $r->dump;
2799
2800							=head2 chomp(0\|1)
2801
2802							Turns chomping on or off.
2803
2804							IMPORTANT: As of version 0.24, chomping is now on by default as it
2805							makes C Just Work. The only time you may run into trouble
2806							is with C. So don't do that, or else explicitly turn
2807							off chomping.
2808
2809							To avoid incorporating (spurious)
2810							record separators (such as "\n" on Unix) when reading from a file,
2811							C Cs its input. If you don't want this to happen,
2812							call C with a false value.
2813
2814							$re->chomp(0); # really want the record separators
2815							$re->add();
2816
2817							=head2 fold_meta_pairs(NUMBER)
2818
2819							Determines whether C<\s>, C<\S> and C<\w>, C<\W> and C<\d>, C<\D>
2820							are folded into a C<.> (dot). Folding happens by default (for
2821							reasons of backwards compatibility, even though it is wrong when
2822							the C expression modifier is active).
2823
2824							Call this method with a false value to prevent this behaviour (which
2825							is only a problem when dealing with C<\n> if the C expression
2826							modifier is also set).
2827
2828							$re->add( '\\w', '\\W' );
2829							my $clone = $re->clone;
2830
2831							$clone->fold_meta_pairs(0);
2832							print $clone->as_string; # prints '.'
2833							print $re->as_string; # print '[\W\w]'
2834
2835							=head2 indent(NUMBER)
2836
2837							Sets the level of indent for pretty-printing nested groups
2838							within a pattern. See the C method for more details.
2839							When called without a parameter, no indenting is performed.
2840
2841							$re->indent( 4 );
2842							print $re->as_string;
2843
2844							=head2 lookahead(0\|1)
2845
2846							Turns on zero-width lookahead assertions. This is usually
2847							beneficial when you expect that the pattern will usually fail.
2848							If you expect that the pattern will usually match you will
2849							probably be worse off.
2850
2851							=head2 flags(STRING)
2852
2853							Sets the flags that govern how the pattern behaves (for
2854							versions of Perl up to 5.9 or so, these are C). By
2855							default no flags are enabled.
2856
2857							=head2 modifiers(STRING)
2858
2859							An alias of the C method, for users familiar with
2860							C.
2861
2862							=head2 track(0\|1)
2863
2864							Turns tracking on or off. When this attribute is enabled,
2865							additional housekeeping information is inserted into the
2866							assembled expression using C<({...}> embedded code
2867							constructs. This provides the necessary information to
2868							determine which, of the original patterns added, was the
2869							one that caused the match.
2870
2871							$re->track( 1 );
2872							if( $target =~ /$re/ ) {
2873							print "$target matched by ", $re->matched, "\n";
2874							}
2875
2876							Note that when this functionality is enabled, no
2877							reduction is performed and no character classes are
2878							generated. In other words, C is not
2879							reduced down to C<(?:br\|t)ag> and C is not
2880							reduced to C.
2881
2882							=head2 unroll_plus(0\|1)
2883
2884							Turns the unrolling of plus metacharacters on or off. When
2885							a pattern is broken up, C becomes C, C (and
2886							C becomes C, C. This may allow the freed C
2887							to assemble with other patterns. Not enabled by default.
2888
2889							=head2 lex(SCALAR)
2890
2891							Change the pattern used to break a string apart into tokens.
2892							You can examine the C script as a starting point.
2893
2894							=head2 reduce(0\|1)
2895
2896							Turns pattern reduction on or off. A reduced pattern may
2897							be considerably shorter than an unreduced pattern. Consider
2898							C I C. An unreduced
2899							pattern will be very similar to those produced by
2900							C. Reduction is on by default. Turning
2901							it off speeds assembly (but assembly is pretty fast -- it's
2902							the breaking up of the initial patterns in the lexing stage
2903							that can consume a non-negligible amount of time).
2904
2905							=head2 mutable(0\|1)
2906
2907							This method has been marked as DEPRECATED. It will be removed
2908							in a future release. See the C method for a technique
2909							to replace its functionality.
2910
2911							=head2 reset()
2912
2913							Empties out the patterns that have been Ced or C-ed
2914							into the object. Does not modify the state of controller attributes
2915							such as C, C, C and the like.
2916
2917							=head2 Default_Lexer
2918
2919							B the C function is a class method, not
2920							an object method. It is a fatal error to call it as an object
2921							method.
2922
2923							The C method lets you replace the default pattern
2924							used for all subsequently created C objects. It
2925							will not have any effect on existing objects. (It is also possible
2926							to override the lexer pattern used on a per-object basis).
2927
2928							The parameter should be an ordinary scalar, not a compiled
2929							pattern. If the pattern fails to match all parts of the string,
2930							the missing parts will be returned as single chunks. Therefore
2931							the following pattern is legal (albeit rather cork-brained):
2932
2933							Regexp::Assemble::Default_Lexer( '\\d' );
2934
2935							The above pattern will split up input strings digit by digit, and
2936							all non-digit characters as single chunks.
2937
2938							=head1 DIAGNOSTICS
2939
2940							"Cannot pass a C to Default_Lexer"
2941
2942							You tried to replace the default lexer pattern with an object
2943							instead of a scalar. Solution: You probably tried to call
2944							C<< $obj->Default_Lexer >>. Call the qualified class method instead
2945							C.
2946
2947							"filter method not passed a coderef"
2948
2949							"pre_filter method not passed a coderef"
2950
2951							A reference to a subroutine (anonymous or otherwise) was expected.
2952							Solution: read the documentation for the C method.
2953
2954							"duplicate pattern added: /.../"
2955
2956							The C attribute is active, and a duplicate pattern was
2957							added (well duh!). Solution: clean your data.
2958
2959							"cannot open [file] for input: [reason]"
2960
2961							The C method was unable to open the specified file for
2962							whatever reason. Solution: make sure the file exists and the script
2963							has the required privileges to read it.
2964
2965							=head1 NOTES
2966
2967							This module has been tested successfully with a range of versions
2968							of perl, from 5.005_03 to 5.9.3. Use of 5.6.0 is not recommended.
2969
2970							The expressions produced by this module can be used with the PCRE
2971							library.
2972
2973							Remember to "double up" your backslashes if the patterns are
2974							hard-coded as constants in your program. That is, you should
2975							literally C rather than C. It
2976							usually will work either way, but it's good practice to do so.
2977
2978							Where possible, supply the simplest tokens possible. Don't add
2979							C when C will do. The reason is that
2980							if you also add C the resulting assembly changes
2981							dramatically: C I
2982							C. Since R::A doesn't perform enough analysis,
2983							it won't "unroll" the C<{2}> quantifier, and will fail to notice
2984							the divergence after the first C<-d\d+>.
2985
2986							Furthermore, when the string 'X-123000P' is matched against the
2987							first assembly, the regexp engine will have to backtrack over each
2988							alternation (the one that ends in Y B the one that ends in Z)
2989							before determining that there is no match. No such backtracking
2990							occurs in the second pattern: as soon as the engine encounters the
2991							'P' in the target string, neither of the alternations at that point
2992							(C<-\d+Y> or C) could succeed and so the match fails.
2993
2994							C does, however, know how to build character
2995							classes. Given C, C and C, it will assemble these
2996							into C. When C<-> (dash) appears as a candidate for a
2997							character class it will be the first character in the class. When
2998							C<^> (circumflex) appears as a candidate for a character class it
2999							will be the last character in the class.
3000
3001							It also knows about meta-characters than can "absorb" regular
3002							characters. For instance, given C and C, it knows that
3003							C<5> can be represented by C<\d> and so the assembly is just C.
3004							The "absorbent" meta-characters it deals with are C<.>, C<\d>, C<\s>
3005							and C<\W> and their complements. It will replace C<\d>/C<\D>,
3006							C<\s>/C<\S> and C<\w>/C<\W> by C<.> (dot), and it will drop C<\d>
3007							if C<\w> is also present (as will C<\D> in the presence of C<\W>).
3008
3009							C deals correctly with C's propensity
3010							to backslash many characters that have no need to be. Backslashes on
3011							non-metacharacters will be removed. Similarly, in character classes,
3012							a number of characters lose their magic and so no longer need to be
3013							backslashed within a character class. Two common examples are C<.>
3014							(dot) and C<$>. Such characters will lose their backslash.
3015
3016							At the same time, it will also process C<\Q...\E> sequences. When
3017							such a sequence is encountered, the inner section is extracted and
3018							C is applied to the section. The resulting quoted text
3019							is then used in place of the original unquoted text, and the C<\Q>
3020							and C<\E> metacharacters are thrown away. Similar processing occurs
3021							with the C<\U...\E> and C<\L...\E> sequences. This may have surprising
3022							effects when using a dispatch table. In this case, you will need
3023							to know exactly what the module makes of your input. Use the C
3024							method to find out what's going on:
3025
3026							$pattern = join( '', @{$re->lexstr($pattern)} );
3027
3028							If all the digits 0..9 appear in a character class, C
3029							will replace them by C<\d>. I'd do it for letters as well, but
3030							thinking about accented characters and other glyphs hurts my head.
3031
3032							In an alternation, the longest paths are chosen first (for example,
3033							C). When two paths have the same length, the path
3034							with the most subpaths will appear first. This aims to put the
3035							"busiest" paths to the front of the alternation. For example, the
3036							list C, C, C, C and C will produce the
3037							pattern C<(?:f(?:ew\|ig\|un)\|b(?:ad\|it))>. See F for a
3038							real-world example of how alternations are sorted. Once you have
3039							looked at that, everything should be crystal clear.
3040
3041							When tracking is in use, no reduction is performed. nor are
3042							character classes formed. The reason is that it is
3043							too difficult to determine the original pattern afterwards. Consider the
3044							two patterns C and C. These should be reduced to
3045							C. The final character matches one of two possibilities.
3046							To resolve whether it matched an C<'e'> or C<'m'> would require
3047							keeping track of the fact that the pattern finished up in a character
3048							class, which would the require a whole lot more work to figure out
3049							which character of the class matched. Without character classes
3050							it becomes much easier. Instead, C is produced, which
3051							lets us find out more simply where we ended up.
3052
3053							Similarly, C and C should form C<(?:dog\|sea)food>.
3054							When the pattern is being assembled, the tracking decision needs
3055							to be made at the end of the grouping, but the tail of the pattern
3056							has not yet been visited. Deferring things to make this work correctly
3057							is a vast hassle. In this case, the pattern becomes merely
3058							C<(?:dogfood\|seafood>. Tracked patterns will therefore be bulkier than
3059							simple patterns.
3060
3061							There is an open bug on this issue:
3062
3063							L
3064
3065							If this bug is ever resolved, tracking would become much easier to
3066							deal with (none of the C hassle would be required - you could
3067							just match like a regular RE and it would Just Work).
3068
3069							=head1 SEE ALSO
3070
3071							=over 4
3072
3073							=item L
3074
3075							General information about Perl's regular expressions.
3076
3077							=item L
3078
3079							Specific information about C.
3080
3081							=item Regex::PreSuf
3082
3083							C takes a string and chops it itself into tokens of
3084							length 1. Since it can't deal with tokens of more than one character,
3085							it can't deal with meta-characters and thus no regular expressions.
3086							Which is the main reason why I wrote this module.
3087
3088							=item Regexp::Optimizer
3089
3090							C produces regular expressions that are similar to
3091							those produced by R::A with reductions switched off. It's biggest
3092							drawback is that it is exponentially slower than Regexp::Assemble on
3093							very large sets of patterns.
3094
3095							=item Regexp::Parser
3096
3097							Fine grained analysis of regular expressions.
3098
3099							=item Regexp::Trie
3100
3101							Funnily enough, this was my working name for C
3102							during its development. I changed the name because I thought it
3103							was too obscure. Anyway, C does much the same as
3104							C and C except that it runs
3105							much faster (according to the author). It does not recognise
3106							meta characters (that is, 'a+b' is interpreted as 'a\+b').
3107
3108							=item Text::Trie
3109
3110							C is well worth investigating. Tries can outperform very
3111							bushy (read: many alternations) patterns.
3112
3113							=item Tree::Trie
3114
3115							C is another module that builds tries. The algorithm that
3116							C uses appears to be quite similar to the
3117							algorithm described therein, except that C solves its
3118							end-marker problem without having to rewrite the leaves.
3119
3120							=back
3121
3122							=head1 See Also
3123
3124							For alternatives to this module, consider one of:
3125
3126							=over 4
3127
3128							=item o L
3129
3130							=item o L
3131
3132							=item o L
3133
3134							=back
3135
3136							=head1 LIMITATIONS
3137
3138							Some mildly complex cases are not handled well. See examples/failure.01.pl
3139							and L.
3140
3141							See also L for a discussion
3142							of some of the issues arising with the use of a huge number of alterations. Thanx to
3143							Slaven Rezic for the details of trie 'v' non-trie operations within Perl which influence
3144							regexp handling of alternations.
3145
3146							does not attempt to find common substrings. For
3147							instance, it will not collapse C down to C.
3148							If there's a module out there that performs this sort of string
3149							analysis I'd like to know about it. But keep in mind that the
3150							algorithms that do this are very expensive: quadratic or worse.
3151
3152							C does not interpret meta-character modifiers.
3153							For instance, if the following two patterns are
3154							given: C and C, it will not determine that C<\d> can be
3155							matched by C<\d+>. Instead, it will produce C. Along
3156							a similar line of reasoning, it will not determine that C and
3157							C is equivalent to C (It will produce C
3158							instead).
3159
3160							You cannot remove a pattern that has been added to an object. You'll
3161							just have to start over again. Adding a pattern is difficult enough,
3162							I'd need a solid argument to convince me to add a C method.
3163							If you need to do this you should read the documentation for the
3164							C method.
3165
3166							C does not (yet)? employ the C<(?E...)>
3167							construct.
3168
3169							The module does not produce POSIX-style regular expressions. This
3170							would be quite easy to add, if there was a demand for it.
3171
3172							=head1 BUGS
3173
3174							Patterns that generate look-ahead assertions sometimes produce
3175							incorrect patterns in certain obscure corner cases. If you
3176							suspect that this is occurring in your pattern, disable
3177							lookaheads.
3178
3179							Tracking doesn't really work at all with 5.6.0. It works better
3180							in subsequent 5.6 releases. For maximum reliability, the use of
3181							a 5.8 release is strongly recommended. Tracking barely works with
3182							5.005_04. Of note, using C<\d>-style meta-characters invariably
3183							causes panics. Tracking really comes into its own in Perl 5.10.
3184
3185							If you feed C patterns with nested parentheses,
3186							there is a chance that the resulting pattern will be uncompilable
3187							due to mismatched parentheses (not enough closing parentheses). This
3188							is normal, so long as the default lexer pattern is used. If you want
3189							to find out which pattern among a list of 3000 patterns are to blame
3190							(speaking from experience here), the F script offers
3191							a strategy for pinpointing the pattern at fault. While you may not
3192							be able to use the script directly, the general approach is easy to
3193							implement.
3194
3195							The algorithm used to assemble the regular expressions makes extensive
3196							use of mutually-recursive functions (that is, A calls B, B calls
3197							A, ...) For deeply similar expressions, it may be possible to provoke
3198							"Deep recursion" warnings.
3199
3200							The module has been tested extensively, and has an extensive test
3201							suite (that achieves close to 100% statement coverage), but you
3202							never know... A bug may manifest itself in two ways: creating a
3203							pattern that cannot be compiled, such as C, or a pattern
3204							that compiles correctly but that either matches things it shouldn't,
3205							or doesn't match things it should. It is assumed that Such problems
3206							will occur when the reduction algorithm encounters some sort of
3207							edge case. A temporary work-around is to disable reductions:
3208
3209							my $pattern = $assembler->reduce(0)->re;
3210
3211							A discussion about implementation details and where bugs might lurk
3212							appears in the README file. If this file is not available locally,
3213							you should be able to find a copy on the Web at your nearest CPAN
3214							mirror.
3215
3216							Seriously, though, a number of people have been using this module to
3217							create expressions anywhere from 140Kb to 600Kb in size, and it seems to
3218							be working according to spec. Thus, I don't think there are any serious
3219							bugs remaining.
3220
3221							If you are feeling brave, extensive debugging traces are available to
3222							figure out where assembly goes wrong.
3223
3224							Please report all bugs at
3225							L
3226
3227							Make sure you include the output from the following two commands:
3228
3229							perl -MRegexp::Assemble -le 'print $Regexp::Assemble::VERSION'
3230							perl -V
3231
3232							There is a mailing list for the discussion of C.
3233							Subscription details are available at
3234							L.
3235
3236							=head1 ACKNOWLEDGEMENTS
3237
3238							This module grew out of work I did building access maps for Postfix,
3239							a modern SMTP mail transfer agent. See L
3240							for more information. I used Perl to build large regular expressions
3241							for blocking dynamic/residential IP addresses to cut down on spam
3242							and viruses. Once I had the code running for this, it was easy to
3243							start adding stuff to block really blatant spam subject lines, bogus
3244							HELO strings, spammer mailer-ids and more...
3245
3246							I presented the work at the French Perl Workshop in 2004, and the
3247							thing most people asked was whether the underlying mechanism for
3248							assembling the REs was available as a module. At that time it was
3249							nothing more that a twisty maze of scripts, all different. The
3250							interest shown indicated that a module was called for. I'd like to
3251							thank the people who showed interest. Hey, it's going to make I
3252							messy scripts smaller, in any case.
3253
3254							Thomas Drugeon was a valuable sounding board for trying out
3255							early ideas. Jean Forget and Philippe Blayo looked over an early
3256							version. H.Merijn Brandt stopped over in Paris one evening, and
3257							discussed things over a few beers.
3258
3259							Nicholas Clark pointed out that while what this module does
3260							(?:c\|sh)ould be done in perl's core, as per the 2004 TODO, he
3261							encouraged me to continue with the development of this module. In
3262							any event, this module allows one to gauge the difficulty of
3263							undertaking the endeavour in C. I'd rather gouge my eyes out with
3264							a blunt pencil.
3265
3266							Paul Johnson settled the question as to whether this module should
3267							live in the Regex:: namespace, or Regexp:: namespace. If you're
3268							not convinced, try running the following one-liner:
3269
3270							perl -le 'print ref qr//'
3271
3272							Philippe Bruhat found a couple of corner cases where this module
3273							could produce incorrect results. Such feedback is invaluable,
3274							and only improves the module's quality.
3275
3276							=head1 Machine-Readable Change Log
3277
3278							The file Changes was converted into Changelog.ini by L.
3279
3280							=head1 AUTHOR
3281
3282							David Landgren
3283
3284							Copyright (C) 2004-2011. All rights reserved.
3285
3286							http://www.landgren.net/perl/
3287
3288							If you use this module, I'd love to hear about what you're using
3289							it for. If you want to be informed of updates, send me a note.
3290
3291							Ron Savage is co-maint of the module, starting with V 0.36.
3292
3293							=head1 Repository
3294
3295							L
3296
3297							=head1 TODO
3298
3299							1. Tree equivalencies. Currently, /contend/ /content/ /resend/ /resent/
3300							produces (?:conten[dt]\|resend[dt]) but it is possible to produce
3301							(?:cont\|res)en[dt] if one can spot the common tail nodes (and walk back
3302							the equivalent paths). Or be by me my => /[bm][ey]/ in the simplest case.
3303
3304							To do this requires a certain amount of restructuring of the code.
3305							Currently, the algorithm uses a two-phase approach. In the first
3306							phase, the trie is traversed and reductions are performed. In the
3307							second phase, the reduced trie is traversed and the pattern is
3308							emitted.
3309
3310							What has to occur is that the reduction and emission have to occur
3311							together. As a node is completed, it is replaced by its string
3312							representation. This then allows child nodes to be compared for
3313							equality with a simple 'eq'. Since there is only a single traversal,
3314							the overall generation time might drop, even though the context
3315							baggage required to delve through the tree will be more expensive
3316							to carry along (a hash rather than a couple of scalars).
3317
3318							Actually, a simpler approach is to take on a secret sentinel
3319							atom at the end of every pattern, which gives the reduction
3320							algorithm sufficient traction to create a perfect trie.
3321
3322							I'm rewriting the reduction code using this technique.
3323
3324							2. Investigate how (?>foo) works. Can it be applied?
3325
3326							5. How can a tracked pattern be serialised? (Add freeze and thaw methods).
3327
3328							6. Store callbacks per tracked pattern.
3329
3330							12. utf-8... hmmmm...
3331
3332							14. Adding qr//'ed patterns. For example, consider
3333							$r->add ( qr/^abc/i )
3334							->add( qr/^abd/ )
3335							->add( qr/^ab e/x );
3336							this should admit abc abC aBc aBC abd abe as matches
3337
3338							16. Allow a fast, unsafe tracking mode, that can be used if a(?bc)?
3339							can't happen. (Possibly carp if it does appear during traversal)?
3340
3341							17. given a-\d+-\d+-\d+-\d+-b, produce a(?:-\d+){4}-b. Something
3342							along the lines of (.{4))(\1+) would let the regexp engine
3343							itself be brought to bear on the matter, which is a rather
3344							appealing idea. Consider
3345
3346							while(/(?!\+)(\S{2,}?)(\1+)/g) { ... $1, $2 ... }
3347
3348							as a starting point.
3349
3350							19. The reduction code has become unbelievably baroque. Adding code
3351							to handle (sting,singing,sing) => s(?:(?:ing)?\|t)ing was far
3352							too difficult. Adding more stuff just breaks existing behaviour.
3353							And fixing the ^abcd$ ... bug broke stuff all over again.
3354							Now that the corner cases are more clearly identified, a full
3355							rewrite of the reduction code is needed. And would admit the
3356							possibility of implementing items 1 and 17.
3357
3358							20. Handle debug unrev with a separate bit
3359
3360							23. Japhy's http://www.perlmonks.org/index.pl?node_id=90876 list2range
3361							regexp
3362
3363							24. Lookahead assertions contain serious bugs (as shown by
3364							assembling powersets. Need to save more context during reduction,
3365							which in turn will simplify the preparation of the lookahead
3366							classes. See also 19.
3367
3368							26. _lex() swamps the overall run-time. It stems from the decision
3369							to use a single regexp to pull apart any pattern. A suite of
3370							simpler regexp to pick of parens, char classes, quantifiers
3371							and bare tokens may be faster. (This has been implemented as
3372							_fastlex(), but it's only marginally faster. Perhaps split-by-
3373							char and lex a la C?
3374
3375							27. We don't, as yet, unroll_plus a paren e.g. (abc)+?
3376
3377							28. We don't reroll unrolled a a* to a+ in indented or tracked
3378							output
3379
3380							29. Use (MARK n) in blead for tracked patterns, and use (FAIL) for
3381							the unmatchable pattern.
3382
3383							=head1 LICENSE
3384
3385							This library is free software; you can redistribute it and/or modify
3386							it under the same terms as Perl itself.
3387
3388							=cut
3389
3390							# Return a +ve value to tell Perl the module is ready to go.
3391
3392							'The Lusty Decadent Delights of Imperial Pompeii';