File Coverage

blib/lib/Regexp/Assemble.pm
Criterion Covered Total %
statement 1141 1155 98.7
branch 777 796 97.6
condition 174 177 98.3
subroutine 97 98 98.9
pod 49 49 100.0
total 2238 2275 98.3


line stmt bran cond sub pod time code
1             # Regexp::Assemple.pm
2             #
3             # Copyright (c) 2004-2011 David Landgren
4             # All rights reserved
5              
6             package Regexp::Assemble;
7              
8 11     11   35637 use strict;
  11         25  
  11         266  
9 11     11   49 use warnings;
  11         21  
  11         293  
10              
11 11     11   57 use constant DEBUG_ADD => 1;
  11         29  
  11         1054  
12 11     11   64 use constant DEBUG_TAIL => 2;
  11         24  
  11         425  
13 11     11   56 use constant DEBUG_LEX => 4;
  11         24  
  11         406  
14 11     11   70 use constant DEBUG_TIME => 8;
  11         30  
  11         495  
15              
16 11     11   69 use vars qw/$have_Storable $Current_Lexer $Default_Lexer $Single_Char $Always_Fail/;
  11         23  
  11         38857  
17              
18             # The following patterns were generated with examples/naive.
19              
20             $Default_Lexer = qr/(?![[(\\]).(?:[*+?]\??|\{\d+(?:,\d*)?\}\??)?|\\(?:[bABCEGLQUXZ]|[lu].|(?:[^\w]|[aefnrtdDwWsS]|c.|0\d{2}|x(?:[\da-fA-F]{2}|{[\da-fA-F]{4}})|N\{\w+\}|[Pp](?:\{\w+\}|.))(?:[*+?]\??|\{\d+(?:,\d*)?\}\??)?)|\[.*?(?
21              
22             $Single_Char = qr/^(?:\\(?:[aefnrtdDwWsS]|c.|[^\w\/{|}-]|0\d{2}|x(?:[\da-fA-F]{2}|{[\da-fA-F]{4}}))|[^\$^])$/;
23              
24             # The pattern to return when nothing has been added (and thus not match anything)
25              
26             $Always_Fail = "^\\b\0";
27              
28             our $VERSION = '0.38';
29              
30             # ------------------------------------------------
31              
32             sub new {
33 2194     2194 1 1890739 my $class = shift;
34 2194         5767 my %args = @_;
35              
36 2194         3536 my $anc;
37 2194         4382 for $anc (qw(word line string)) {
38 6582 100       16900 if (exists $args{"anchor_$anc"}) {
39 135         329 my $val = delete $args{"anchor_$anc"};
40 135         398 for my $anchor ("anchor_${anc}_begin", "anchor_${anc}_end") {
41 270 100       907 $args{$anchor} = $val unless exists $args{$anchor};
42             }
43             }
44             }
45              
46             # anchor_string_absolute sets anchor_string_begin and anchor_string_end_absolute
47 2194 100       5316 if (exists $args{anchor_string_absolute}) {
48 3         6 my $val = delete $args{anchor_string_absolute};
49 3         6 for my $anchor (qw(anchor_string_begin anchor_string_end_absolute)) {
50 6 100       17 $args{$anchor} = $val unless exists $args{$anchor};
51             }
52             }
53              
54 2194   100     37419 exists $args{$_} or $args{$_} = 0 for qw(
55             anchor_word_begin
56             anchor_word_end
57             anchor_line_begin
58             anchor_line_end
59             anchor_string_begin
60             anchor_string_end
61             anchor_string_end_absolute
62             debug
63             dup_warn
64             indent
65             lookahead
66             mutable
67             track
68             unroll_plus
69             );
70              
71 2194   100     12637 exists $args{$_} or $args{$_} = 1 for qw(
72             fold_meta_pairs
73             reduce
74             chomp
75             );
76              
77 2194         6540 @args{qw(re str path)} = (undef, undef, []);
78              
79 2194   100     15086 $args{flags} ||= delete $args{modifiers} || '';
      100        
80 2194 100       4959 $args{lex} = $Current_Lexer if defined $Current_Lexer;
81              
82 2194         4623 my $self = bless \%args, $class;
83              
84 2194 100       5633 if ($self->_debug(DEBUG_TIME)) {
85 1         5 $self->_init_time_func();
86 1         5 $self->{_begin_time} = $self->{_time_func}->();
87             }
88             $self->{input_record_separator} = delete $self->{rs}
89 2194 100       5203 if exists $self->{rs};
90 2194 100       4929 exists $self->{file} and $self->add_file($self->{file});
91              
92 2193         5859 return $self;
93             }
94              
95             sub _init_time_func {
96 9     9   16 my $self = shift;
97 9 100       31 return if exists $self->{_time_func};
98              
99             # attempt to improve accuracy
100 6 100       18 if (!defined($self->{_use_time_hires})) {
101 5         25 eval {require Time::HiRes};
  5         960  
102 5         2158 $self->{_use_time_hires} = $@;
103             }
104             $self->{_time_func} = length($self->{_use_time_hires}) > 0
105 4     4   12 ? sub { time }
106 6 100       30 : \&Time::HiRes::time
107             ;
108             }
109              
110             sub clone {
111 55     55 1 418 my $self = shift;
112 55         91 my $clone;
113 55         287 my @attr = grep {$_ ne 'path'} keys %$self;
  1314         2162  
114 55         150 @{$clone}{@attr} = @{$self}{@attr};
  55         422  
  55         136  
115 55         149 $clone->{path} = _path_clone($self->_path);
116 55         328 bless $clone, ref($self);
117             }
118              
119             sub _fastlex {
120 884     884   1480 my $self = shift;
121 884         1401 my $record = shift;
122 884         1283 my $len = 0;
123 884         1631 my @path = ();
124 884         1340 my $case = '';
125 884         1287 my $qm = '';
126              
127 884         1499 my $debug = $self->{debug} & DEBUG_LEX;
128 884         1333 my $unroll_plus = $self->{unroll_plus};
129              
130 884         1450 my $token;
131             my $qualifier;
132 884 100       3364 $debug and print "# _lex <$record>\n";
133 884         1685 my $modifier = q{(?:[*+?]\\??|\\{(?:\\d+(?:,\d*)?|,\d+)\\}\\??)?};
134 884         2519 my $class_matcher = qr/\[(?:\[:[a-z]+:\]|\\?.)*?\]/;
135 884         3385 my $paren_matcher = qr/\(.*?(?
136 884         3310 my $misc_matcher = qr/(?:(c)(.)|(0)(\d{2}))($modifier)/;
137 884         2779 my $regular_matcher = qr/([^\\[(])($modifier)/;
138 884         1926 my $qm_matcher = qr/(\\?.)/;
139              
140 884         1441 my $matcher = $regular_matcher;
141             {
142 884 100       1259 if ($record =~ /\G$matcher/gc) {
  5960 100       25785  
    100          
    100          
143             # neither a \\ nor [ nor ( followed by a modifer
144 3344 100 100     11141 if ($1 eq '\\E') {
    100 66        
145 12 100       106 $debug and print "# E\n";
146 12         28 $case = $qm = '';
147 12         20 $matcher = $regular_matcher;
148 12         22 redo;
149             }
150             elsif ($qm and ($1 eq '\\L' or $1 eq '\\U')) {
151 5 100       27 $debug and print "# ignore \\L, \\U\n";
152 5         13 redo;
153             }
154 3327         5253 $token = $1;
155 3327 100       6090 $qualifier = defined $2 ? $2 : '';
156 3327 100       8584 $debug and print "# token <$token> <$qualifier>\n";
157 3327 100       5351 if ($qm) {
158 90         141 $token = quotemeta($token);
159 90         173 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{}\/])$/$1/;
160             }
161             else {
162 3237         5539 $token =~ s{\A([][{}*+?@\\/])\Z}{\\$1};
163             }
164 3327 100 100     7963 if ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/) {
165 22 100       71 $1 and $qualifier .= $1;
166 22 100       692 $debug and print " unroll <$token><$token><$qualifier>\n";
167 22 100       78 $case and $token = $case eq 'L' ? lc($token) : uc($token);
    100          
168 22         60 push @path, $token, "$token$qualifier";
169             }
170             else {
171 3305 100       7572 $debug and print " clean <$token>\n";
172 3305 100       8433 push @path,
    100          
173             $case eq 'L' ? lc($token).$qualifier
174             : $case eq 'U' ? uc($token).$qualifier
175             : $token.$qualifier
176             ;
177             }
178 3327         5227 redo;
179             }
180              
181             elsif ($record =~ /\G\\/gc) {
182 1680 100       4494 $debug and print "# backslash\n";
183             # backslash
184 1680 100       12887 if ($record =~ /\G([sdwSDW])($modifier)/gc) {
    100          
    100          
    100          
    100          
    100          
    100          
    100          
185 443         989 ($token, $qualifier) = ($1, $2);
186 443 100       1049 $debug and print "# meta <$token> <$qualifier>\n";
187 443 100 100     1369 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
188             ? ("\\$token", "\\$token$qualifier" . (defined $1 ? $1 : ''))
189             : "\\$token$qualifier";
190             }
191             elsif ($record =~ /\Gx([\da-fA-F]{2})($modifier)/gc) {
192 9 100       249 $debug and print "# x $1\n";
193 9         52 $token = quotemeta(chr(hex($1)));
194 9         23 $qualifier = $2;
195 9 100       239 $debug and print "# cooked <$token>\n";
196 9         41 $token =~ s/^\\([^\w$()*+.?\[\\\]^|{\/])$/$1/; # } balance
197 9 100       242 $debug and print "# giving <$token>\n";
198 9 100 100     72 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
199             ? ($token, "$token$qualifier" . (defined $1 ? $1 : ''))
200             : "$token$qualifier";
201             }
202             elsif ($record =~ /\GQ/gc) {
203 26 100       209 $debug and print "# Q\n";
204 26         52 $qm = 1;
205 26         41 $matcher = $qm_matcher;
206             }
207             elsif ($record =~ /\G([LU])/gc) {
208 15 100       286 $debug and print "# case $1\n";
209 15         41 $case = $1;
210             }
211             elsif ($record =~ /\GE/gc) {
212 6 100       18 $debug and print "# E\n";
213 6         12 $case = $qm = '';
214 6         9 $matcher = $regular_matcher;
215             }
216             elsif ($record =~ /\G([lu])(.)/gc) {
217 6 100       22 $debug and print "# case $1 to <$2>\n";
218 6 100       23 push @path, $1 eq 'l' ? lc($2) : uc($2);
219             }
220 30         63 elsif (my @arg = grep {defined} $record =~ /\G$misc_matcher/gc) {
221 6 50       17 if ($] < 5.007) {
222 0         0 my $len = 0;
223 0         0 $len += length($_) for @arg;
224 0 0       0 $debug and print "# pos ", pos($record), " fixup add $len\n";
225 0         0 pos($record) = pos($record) + $len;
226             }
227 6         12 my $directive = shift @arg;
228 6 100       13 if ($directive eq 'c') {
229 3 100       12 $debug and print "# ctrl <@arg>\n";
230 3         11 push @path, "\\c" . uc(shift @arg);
231             }
232             else { # elsif ($directive eq '0') {
233 3 100       11 $debug and print "# octal <@arg>\n";
234 3         8 my $ascii = oct(shift @arg);
235 3 100       26 push @path, ($ascii < 32)
236             ? "\\c" . chr($ascii+64)
237             : chr($ascii)
238             ;
239             }
240 6         12 $path[-1] .= join( '', @arg ); # if @arg;
241 6         15 redo;
242             }
243             elsif ($record =~ /\G(.)/gc) {
244 1165         2673 $token = $1;
245 1165         3037 $token =~ s{[AZabefnrtz\[\]{}()\\\$*+.?@|/^]}{\\$token};
246 1165 100       2795 $debug and print "# meta <$token>\n";
247 1165         2339 push @path, $token;
248             }
249             else {
250 4 100       23 $debug and print "# ignore char at ", pos($record), " of <$record>\n";
251             }
252 1674         3274 redo;
253             }
254              
255             elsif ($record =~ /\G($class_matcher)($modifier)/gc) {
256             # [class] followed by a modifer
257 39         103 my $class = $1;
258 39 50       100 my $qualifier = defined $2 ? $2 : '';
259 39 100       341 $debug and print "# class begin <$class> <$qualifier>\n";
260 39 100       147 if ($class =~ /\A\[\\?(.)]\Z/) {
261 12         28 $class = quotemeta $1;
262 12         35 $class =~ s{\A\\([!@%])\Z}{$1};
263 12 100       110 $debug and print "# class unwrap $class\n";
264             }
265 39 100       337 $debug and print "# class end <$class> <$qualifier>\n";
266 39 100 100     183 push @path, ($unroll_plus and $qualifier =~ s/\A\+(\?)?\Z/*/)
    100          
267             ? ($class, "$class$qualifier" . (defined $1 ? $1 : ''))
268             : "$class$qualifier";
269 39         84 redo;
270             }
271              
272             elsif ($record =~ /\G($paren_matcher)/gc) {
273 13 100       37 $debug and print "# paren <$1>\n";
274             # (paren) followed by a modifer
275 13         29 push @path, $1;
276 13         21 redo;
277             }
278              
279             }
280 884         3823 return \@path;
281             }
282              
283             sub _lex {
284 211     211   415 my $self = shift;
285 211         335 my $record = shift;
286 211         313 my $len = 0;
287 211         364 my @path = ();
288 211         326 my $case = '';
289 211         310 my $qm = '';
290             my $re = defined $self->{lex} ? $self->{lex}
291 211 50       541 : defined $Current_Lexer ? $Current_Lexer
    100          
292             : $Default_Lexer;
293 211         340 my $debug = $self->{debug} & DEBUG_LEX;
294 211 100       1949 $debug and print "# _lex <$record>\n";
295 211         370 my ($token, $next_token, $diff, $token_len);
296 211         2729 while( $record =~ /($re)/g ) {
297 480         1132 $token = $1;
298 480         731 $token_len = length($token);
299 480 100       8049 $debug and print "# lexed <$token> len=$token_len\n";
300 480 100       1273 if( pos($record) - $len > $token_len ) {
301 15         30 $next_token = $token;
302 15         44 $token = substr( $record, $len, $diff = pos($record) - $len - $token_len );
303 15 100       798 $debug and print "# recover <", substr( $record, $len, $diff ), "> as <$token>, save <$next_token>\n";
304 15         43 $len += $diff;
305             }
306 480         698 $len += $token_len;
307             TOKEN: {
308 480 100       685 if( substr( $token, 0, 1 ) eq '\\' ) {
  495         1153  
309 226 100       843 if( $token =~ /^\\([ELQU])$/ ) {
    100          
    100          
310 51 100       160 if( $1 eq 'E' ) {
    100          
311             $qm and $re = defined $self->{lex} ? $self->{lex}
312 12 50       80 : defined $Current_Lexer ? $Current_Lexer
    100          
    100          
313             : $Default_Lexer;
314 12         29 $case = $qm = '';
315             }
316             elsif( $1 eq 'Q' ) {
317 19         59 $qm = $1;
318             # switch to a more precise lexer to quotemeta individual characters
319 19         80 $re = qr/\\?./;
320             }
321             else {
322 20         36 $case = $1;
323             }
324 51 100       1531 $debug and print "# state change qm=<$qm> case=<$case>\n";
325 51         430 goto NEXT_TOKEN;
326             }
327             elsif( $token =~ /^\\([lu])(.)$/ ) {
328 3 100       85 $debug and print "# apply case=<$1> to <$2>\n";
329 3 100       22 push @path, $1 eq 'l' ? lc($2) : uc($2);
330 3         28 goto NEXT_TOKEN;
331             }
332             elsif( $token =~ /^\\x([\da-fA-F]{2})$/ ) {
333 41         124 $token = quotemeta(chr(hex($1)));
334 41 100       164 $debug and print "# cooked <$token>\n";
335 41         105 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{\/])$/$1/; # } balance
336 41 100       167 $debug and print "# giving <$token>\n";
337             }
338             else {
339 131         255 $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{\/])$/$1/; # } balance
340 131 100       597 $debug and print "# backslashed <$token>\n";
341             }
342             }
343             else {
344 269 100       604 $case and $token = $case eq 'U' ? uc($token) : lc($token);
    100          
345 269 100       570 $qm and $token = quotemeta($token);
346 269 100       593 $token = '\\/' if $token eq '/';
347             }
348             # undo quotemeta's brute-force escapades
349 441 100       907 $qm and $token =~ s/^\\([^\w$()*+.?@\[\\\]^|{}\/])$/$1/;
350 441 100       7310 $debug and print "# <$token> case=<$case> qm=<$qm>\n";
351 441         1034 push @path, $token;
352              
353             NEXT_TOKEN:
354 495 100       3408 if( defined $next_token ) {
355 15 100       831 $debug and print "# redo <$next_token>\n";
356 15         59 $token = $next_token;
357 15         32 $next_token = undef;
358 15         61 redo TOKEN;
359             }
360             }
361             }
362 211 100       487 if( $len < length($record) ) {
363             # NB: the remainder only arises in the case of degenerate lexer,
364             # and if \Q is operative, the lexer will have been switched to
365             # /\\?./, which means there can never be a remainder, so we
366             # don't have to bother about quotemeta. In other words:
367             # $qm will never be true in this block.
368 7         20 my $remain = substr($record,$len);
369 7 100       23 $case and $remain = $case eq 'U' ? uc($remain) : lc($remain);
    100          
370 7 100       554 $debug and print "# add remaining <$remain> case=<$case> qm=<$qm>\n";
371 7         36 push @path, $remain;
372             }
373 211 100       1821 $debug and print "# _lex out <@path>\n";
374 211         1200 return \@path;
375             }
376              
377             sub add {
378 1059     1059 1 14157 my $self = shift;
379 1059         1523 my $record;
380 1059         1970 my $debug = $self->{debug} & DEBUG_LEX;
381 1059         2804 while( defined( $record = shift @_ )) {
382 2263 100       5532 CORE::chomp($record) if $self->{chomp};
383 2263 100 100     5770 next if $self->{pre_filter} and not $self->{pre_filter}->($record);
384 2262 100       7581 $debug and print "# add <$record>\n";
385 2262         4095 $self->{stats_raw} += length $record;
386             my $list = $record =~ /[+*?(\\\[{]/ # }]) restore equilibrium
387 2262 100       10559 ? $self->{lex} ? $self->_lex($record) : $self->_fastlex($record)
    100          
388             : [split //, $record]
389             ;
390 2262 100 100     6691 next if $self->{filter} and not $self->{filter}->(@$list);
391 2261         5028 $self->_insertr( $list );
392             }
393 1059         2777 return $self;
394             }
395              
396             sub add_file {
397 13     13 1 25 my $self = shift;
398 13         27 my $rs;
399             my @file;
400 13 100       31 if (ref($_[0]) eq 'HASH') {
401 6         11 my $arg = shift;
402             $rs = $arg->{rs}
403             || $arg->{input_record_separator}
404             || $self->{input_record_separator}
405 6   100     36 || $/;
406             @file = ref($arg->{file}) eq 'ARRAY'
407 4         12 ? @{$arg->{file}}
408 6 100       16 : $arg->{file};
409             }
410             else {
411 7   100     31 $rs = $self->{input_record_separator} || $/;
412 7         17 @file = @_;
413             }
414 13         41 local $/ = $rs;
415 13         18 my $file;
416 13         24 for $file (@file) {
417 15 100       500 open my $fh, '<', $file or do {
418 1         8 require Carp;
419 1         152 Carp::croak("cannot open $file for input: $!");
420             };
421 14         126 while (defined (my $rec = <$fh>)) {
422 55         114 $self->add($rec);
423             }
424 14         109 close $fh;
425             }
426 12         58 return $self;
427             }
428              
429             sub insert {
430 3398     3398 1 12089 my $self = shift;
431 3398 100 100     8684 return if $self->{filter} and not $self->{filter}->(@_);
432 3397         11218 $self->_insertr( [@_] );
433 3397         7593 return $self;
434             }
435              
436             sub _insertr {
437 7068     7068   10664 my $self = shift;
438 7068   100     23280 my $dup = $self->{stats_dup} || 0;
439 7068         14432 $self->{path} = $self->_insert_path( $self->_path, $self->_debug(DEBUG_ADD), $_[0] );
440 7068 100 100     21051 if( not defined $self->{stats_dup} or $dup == $self->{stats_dup} ) {
    50          
441 7063         12044 ++$self->{stats_add};
442 7063 100       9899 $self->{stats_cooked} += defined($_) ? length($_) : 0 for @{$_[0]};
  7063         32387  
443             }
444             elsif( $self->{dup_warn} ) {
445 0 0       0 if( ref $self->{dup_warn} eq 'CODE' ) {
446 0         0 $self->{dup_warn}->($self, $_[0]);
447             }
448             else {
449 0         0 my $pattern = join( '', @{$_[0]} );
  0         0  
450 0         0 require Carp;
451 0         0 Carp::carp("duplicate pattern added: /$pattern/");
452             }
453             }
454 7068         18302 $self->{str} = $self->{re} = undef;
455             }
456              
457             sub lexstr {
458 2     2 1 10 return shift->_lex(shift);
459             }
460              
461             sub pre_filter {
462 3     3 1 843 my $self = shift;
463 3         6 my $pre_filter = shift;
464 3 100 100     26 if( defined $pre_filter and ref($pre_filter) ne 'CODE' ) {
465 1         6 require Carp;
466 1         60 Carp::croak("pre_filter method not passed a coderef");
467             }
468 2         5 $self->{pre_filter} = $pre_filter;
469 2         6 return $self;
470             }
471              
472              
473             sub filter {
474 4     4 1 382 my $self = shift;
475 4         8 my $filter = shift;
476 4 100 100     24 if( defined $filter and ref($filter) ne 'CODE' ) {
477 1         9 require Carp;
478 1         157 Carp::croak("filter method not passed a coderef");
479             }
480 3         7 $self->{filter} = $filter;
481 3         10 return $self;
482             }
483              
484             sub as_string {
485 800     800 1 1889 my $self = shift;
486 800 100       2081 if( not defined $self->{str} ) {
487 798 100       1754 if( $self->{track} ) {
488 8         13 $self->{m} = undef;
489 8         16 $self->{mcount} = 0;
490 8         17 $self->{mlist} = [];
491 8         18 $self->{str} = _re_path_track($self, $self->_path, '', '');
492             }
493             else {
494 790 100 100     4636 $self->_reduce unless ($self->{mutable} or not $self->{reduce});
495 790         1614 my $arg = {@_};
496             $arg->{indent} = $self->{indent}
497 790 100 100     3647 if not exists $arg->{indent} and $self->{indent} > 0;
498 790 100 100     2855 if( exists $arg->{indent} and $arg->{indent} > 0 ) {
    100          
499 42         95 $arg->{depth} = 0;
500 42         112 $self->{str} = _re_path_pretty($self, $self->_path, $arg);
501             }
502             elsif( $self->{lookahead} ) {
503 35         100 $self->{str} = _re_path_lookahead($self, $self->_path);
504             }
505             else {
506 713         1540 $self->{str} = _re_path($self, $self->_path);
507             }
508             }
509 798 100       2162 if (not length $self->{str}) {
510             # explicitly fail to match anything if no pattern was generated
511 9         35 $self->{str} = $Always_Fail;
512             }
513             else {
514             my $begin =
515             $self->{anchor_word_begin} ? '\\b'
516             : $self->{anchor_line_begin} ? '^'
517 789 100       2650 : $self->{anchor_string_begin} ? '\A'
    100          
    100          
518             : ''
519             ;
520             my $end =
521             $self->{anchor_word_end} ? '\\b'
522             : $self->{anchor_line_end} ? '$'
523             : $self->{anchor_string_end} ? '\Z'
524 789 100       2658 : $self->{anchor_string_end_absolute} ? '\z'
    100          
    100          
    100          
525             : ''
526             ;
527 789         1869 $self->{str} = "$begin$self->{str}$end";
528             }
529 798 100       2328 $self->{path} = [] unless $self->{mutable};
530             }
531 800         4265 return $self->{str};
532             }
533              
534             sub re {
535 122     122 1 1381 my $self = shift;
536 122 100       468 $self->_build_re($self->as_string(@_)) unless defined $self->{re};
537 122         591 return $self->{re};
538             }
539              
540             use overload '""' => sub {
541 2131     2131   537440 my $self = shift;
542 2131 100       14235 return $self->{re} if $self->{re};
543 489         1217 $self->_build_re($self->as_string());
544 489         5658 return $self->{re};
545 11     11   10252 };
  11         8917  
  11         99  
546              
547             sub _build_re {
548 618     618   1012 my $self = shift;
549 618         966 my $str = shift;
550 618 100       1395 if( $self->{track} ) {
551 11     11   843 use re 'eval';
  11         25  
  11         86485  
552             $self->{re} = length $self->{flags}
553 8 100       856 ? qr/(?$self->{flags}:$str)/
554             : qr/$str/
555             ;
556             }
557             else {
558             # how could I not repeat myself?
559             $self->{re} = length $self->{flags}
560 610 100       12142 ? qr/(?$self->{flags}:$str)/
561             : qr/$str/
562             ;
563             }
564             }
565              
566             sub match {
567 29     29 1 5318 my $self = shift;
568 29         49 my $target = shift;
569 29 100       108 $self->_build_re($self->as_string(@_)) unless defined $self->{re};
570 29         71 $self->{m} = undef;
571 29         56 $self->{mvar} = [];
572 29 100       599 if( not $target =~ /$self->{re}/ ) {
573 8         20 $self->{mbegin} = [];
574 8         13 $self->{mend} = [];
575 8         39 return undef;
576             }
577 21 50       65 $self->{m} = $^R if $] >= 5.009005;
578 21         92 $self->{mbegin} = _path_copy([@-]);
579 21         85 $self->{mend} = _path_copy([@+]);
580 21         40 my $n = 0;
581 21         55 for( my $n = 0; $n < @-; ++$n ) {
582 43 100 66     199 push @{$self->{mvar}}, substr($target, $-[$n], $+[$n] - $-[$n])
  33         168  
583             if defined $-[$n] and defined $+[$n];
584             }
585 21 100       49 if( $self->{track} ) {
586 20 50       115 return defined $self->{m} ? $self->{mlist}[$self->{m}] : 1;
587             }
588             else {
589 1         6 return 1;
590             }
591             }
592              
593             sub source {
594 4     4 1 400 my $self = shift;
595 4 100       17 return unless $self->{track};
596 3 100       11 defined($_[0]) and return $self->{mlist}[$_[0]];
597 2 100       9 return unless defined $self->{m};
598 1         5 return $self->{mlist}[$self->{m}];
599             }
600              
601             sub mbegin {
602 3     3 1 10 my $self = shift;
603 3 100       19 return exists $self->{mbegin} ? $self->{mbegin} : [];
604             }
605              
606             sub mend {
607 3     3 1 7 my $self = shift;
608 3 100       18 return exists $self->{mend} ? $self->{mend} : [];
609             }
610              
611             sub mvar {
612 19     19 1 35 my $self = shift;
613 19 100       54 return undef unless exists $self->{mvar};
614 18 100       99 return defined($_[0]) ? $self->{mvar}[$_[0]] : $self->{mvar};
615             }
616              
617             sub capture {
618 5     5 1 14 my $self = shift;
619 5 100       18 if( $self->{mvar} ) {
620 4         6 my @capture = @{$self->{mvar}};
  4         12  
621 4         7 shift @capture;
622 4         15 return @capture;
623             }
624 1         4 return ();
625             }
626              
627             sub matched {
628 9     9 1 412 my $self = shift;
629 9 100       55 return defined $self->{m} ? $self->{mlist}[$self->{m}] : undef;
630             }
631              
632             sub stats_add {
633 2     2 1 9 my $self = shift;
634 2   100     12 return $self->{stats_add} || 0;
635             }
636              
637             sub stats_dup {
638 2     2 1 5 my $self = shift;
639 2   100     13 return $self->{stats_dup} || 0;
640             }
641              
642             sub stats_raw {
643 2     2 1 3 my $self = shift;
644 2   100     13 return $self->{stats_raw} || 0;
645             }
646              
647             sub stats_cooked {
648 2     2 1 4 my $self = shift;
649 2   100     14 return $self->{stats_cooked} || 0;
650             }
651              
652             sub stats_length {
653 6     6 1 2669 my $self = shift;
654 6 100 100     44 return (defined $self->{str} and $self->{str} ne $Always_Fail) ? length $self->{str} : 0;
655             }
656              
657             sub dup_warn {
658 0     0 1 0 my $self = shift;
659 0 0       0 $self->{dup_warn} = defined($_[0]) ? $_[0] : 1;
660 0         0 return $self;
661             }
662              
663             sub anchor_word_begin {
664 5     5 1 9 my $self = shift;
665 5 100       13 $self->{anchor_word_begin} = defined($_[0]) ? $_[0] : 1;
666 5         16 return $self;
667             }
668              
669             sub anchor_word_end {
670 4     4 1 4 my $self = shift;
671 4 100       11 $self->{anchor_word_end} = defined($_[0]) ? $_[0] : 1;
672 4         10 return $self;
673             }
674              
675             sub anchor_word {
676 2     2 1 4 my $self = shift;
677 2         3 my $state = shift;
678 2         6 $self->anchor_word_begin($state)->anchor_word_end($state);
679 2         6 return $self;
680             }
681              
682             sub anchor_line_begin {
683 4     4 1 7 my $self = shift;
684 4 100       12 $self->{anchor_line_begin} = defined($_[0]) ? $_[0] : 1;
685 4         11 return $self;
686             }
687              
688             sub anchor_line_end {
689 2     2 1 4 my $self = shift;
690 2 100       4 $self->{anchor_line_end} = defined($_[0]) ? $_[0] : 1;
691 2         4 return $self;
692             }
693              
694             sub anchor_line {
695 2     2 1 4 my $self = shift;
696 2         4 my $state = shift;
697 2         4 $self->anchor_line_begin($state)->anchor_line_end($state);
698 2         6 return $self;
699             }
700              
701             sub anchor_string_begin {
702 277     277 1 441 my $self = shift;
703 277 100       680 $self->{anchor_string_begin} = defined($_[0]) ? $_[0] : 1;
704 277         750 return $self;
705             }
706              
707             sub anchor_string_end {
708 276     276 1 518 my $self = shift;
709 276 100       623 $self->{anchor_string_end} = defined($_[0]) ? $_[0] : 1;
710 276         481 return $self;
711             }
712              
713             sub anchor_string_end_absolute {
714 3     3 1 4 my $self = shift;
715 3 100       7 $self->{anchor_string_end_absolute} = defined($_[0]) ? $_[0] : 1;
716 3         6 return $self;
717             }
718              
719             sub anchor_string {
720 274     274 1 493 my $self = shift;
721 274 100       638 my $state = defined($_[0]) ? $_[0] : 1;
722 274         706 $self->anchor_string_begin($state)->anchor_string_end($state);
723 274         997 return $self;
724             }
725              
726             sub anchor_string_absolute {
727 2     2 1 3 my $self = shift;
728 2 100       7 my $state = defined($_[0]) ? $_[0] : 1;
729 2         6 $self->anchor_string_begin($state)->anchor_string_end_absolute($state);
730 2         7 return $self;
731             }
732              
733             sub debug {
734 605     605 1 2687 my $self = shift;
735 605 100       1398 $self->{debug} = defined($_[0]) ? $_[0] : 0;
736 605 100       1183 if ($self->_debug(DEBUG_TIME)) {
737             # hmm, debugging time was switched on after instantiation
738 4         14 $self->_init_time_func;
739 4         18 $self->{_begin_time} = $self->{_time_func}->();
740             }
741 605         1048 return $self;
742             }
743              
744             sub dump {
745 9     9 1 1000 return _dump($_[0]->_path);
746             }
747              
748             sub chomp {
749 22     22 1 841 my $self = shift;
750 22 100       101 $self->{chomp} = defined($_[0]) ? $_[0] : 1;
751 22         61 return $self;
752             }
753              
754             sub fold_meta_pairs {
755 5     5 1 9 my $self = shift;
756 5 100       16 $self->{fold_meta_pairs} = defined($_[0]) ? $_[0] : 1;
757 5         13 return $self;
758             }
759              
760             sub indent {
761 4     4 1 755 my $self = shift;
762 4 100       13 $self->{indent} = defined($_[0]) ? $_[0] : 0;
763 4         10 return $self;
764             }
765              
766             sub lookahead {
767 22     22 1 47 my $self = shift;
768 22 100       80 $self->{lookahead} = defined($_[0]) ? $_[0] : 1;
769 22         59 return $self;
770             }
771              
772             sub flags {
773 24     24 1 1780 my $self = shift;
774 24 100       126 $self->{flags} = defined($_[0]) ? $_[0] : '';
775 24         93 return $self;
776             }
777              
778             sub modifiers {
779 4     4 1 1507 my $self = shift;
780 4         10 return $self->flags(@_);
781             }
782              
783             sub track {
784 5     5 1 1537 my $self = shift;
785 5 100       15 $self->{track} = defined($_[0]) ? $_[0] : 1;
786 5         12 return $self;
787             }
788              
789             sub unroll_plus {
790 2     2 1 859 my $self = shift;
791 2 100       7 $self->{unroll_plus} = defined($_[0]) ? $_[0] : 1;
792 2         8 return $self;
793             }
794              
795             sub lex {
796 1     1 1 2 my $self = shift;
797 1         49 $self->{lex} = qr($_[0]);
798 1         5 return $self;
799             }
800              
801             sub reduce {
802 19     19 1 1182 my $self = shift;
803 19 100       73 $self->{reduce} = defined($_[0]) ? $_[0] : 1;
804 19         53 return $self;
805             }
806              
807             sub mutable {
808 5     5 1 1158 my $self = shift;
809 5 100       26 $self->{mutable} = defined($_[0]) ? $_[0] : 1;
810 5         17 return $self;
811             }
812              
813             sub reset {
814             # reinitialise the internal state of the object
815 19     19 1 2371 my $self = shift;
816 19         46 $self->{path} = [];
817 19         58 $self->{re} = undef;
818 19         39 $self->{str} = undef;
819 19         75 return $self;
820             }
821              
822             sub Default_Lexer {
823 4 100   4 1 3916 if( $_[0] ) {
824 3 100       13 if( my $refname = ref($_[0]) ) {
825 1         9 require Carp;
826 1         132 Carp::croak("Cannot pass a $refname to Default_Lexer");
827             }
828 2         6 $Current_Lexer = $_[0];
829             }
830 3 100       16 return defined $Current_Lexer ? $Current_Lexer : $Default_Lexer;
831             }
832              
833             # --- no user serviceable parts below ---
834              
835             # -- debug helpers
836              
837             sub _debug {
838 12297     12297   18291 my $self = shift;
839 12297 100       37581 return $self->{debug} & shift() ? 1 : 0;
840             }
841              
842             # -- helpers
843              
844             sub _path {
845             # access the path
846 10011     10011   25316 return $_[0]->{path};
847             }
848              
849             # -- the heart of the matter
850              
851             $have_Storable = do {
852             eval {
853             require Storable;
854             import Storable 'dclone';
855             };
856             $@ ? 0 : 1;
857             };
858              
859             sub _path_clone {
860 55 100   55   2951 $have_Storable ? dclone($_[0]) : _path_copy($_[0]);
861             }
862              
863             sub _path_copy {
864 80     80   695 my $path = shift;
865 80         124 my $new = [];
866 80         188 for( my $p = 0; $p < @$path; ++$p ) {
867 201 100       443 if( ref($path->[$p]) eq 'HASH' ) {
    100          
868 9         19 push @$new, _node_copy($path->[$p]);
869             }
870             elsif( ref($path->[$p]) eq 'ARRAY' ) {
871 3         6 push @$new, _path_copy($path->[$p]);
872             }
873             else {
874 189         426 push @$new, $path->[$p];
875             }
876             }
877 80         218 return $new;
878             }
879              
880             sub _node_copy {
881 11     11   19 my $node = shift;
882 11         17 my $new = {};
883 11         40 while( my( $k, $v ) = each %$node ) {
884 27 100       70 $new->{$k} = defined($v)
885             ? _path_copy($v)
886             : undef
887             ;
888             }
889 11         32 return $new;
890             }
891              
892             sub _insert_path {
893 7142     7142   10789 my $self = shift;
894 7142         10212 my $list = shift;
895 7142         10382 my $debug = shift;
896 7142         10150 my @in = @{shift()}; # create a new copy
  7142         18998  
897 7142 100       17376 if( @$list == 0 ) { # special case the first time
898 2030 100 100     11221 if( @in == 0 or (@in == 1 and (not defined $in[0] or $in[0] eq ''))) {
      100        
      100        
899 33         124 return [{'' => undef}];
900             }
901             else {
902 1997         5720 return \@in;
903             }
904             }
905 5112 100       10759 $debug and print "# _insert_path @{[_dump(\@in)]} into @{[_dump($list)]}\n";
  151         328  
  151         301  
906 5112         7655 my $path = $list;
907 5112         7298 my $offset = 0;
908 5112         6948 my $token;
909 5112 100       10877 if( not @in ) {
910 2 100       7 if( ref($list->[0]) ne 'HASH' ) {
911 1         5 return [ { '' => undef, $list->[0] => $list } ];
912             }
913             else {
914 1         2 $list->[0]{''} = undef;
915 1         3 return $list;
916             }
917             }
918 5110         12363 while( defined( $token = shift @in )) {
919 17713 100       36509 if( ref($token) eq 'HASH' ) {
920 282 100       643 $debug and print "# p0=", _dump($path), "\n";
921 282         796 $path = $self->_insert_node( $path, $offset, $token, $debug, @in );
922 282 100       663 $debug and print "# p1=", _dump($path), "\n";
923 282         487 last;
924             }
925 17431 100       37113 if( ref($path->[$offset]) eq 'HASH' ) {
926 3703 100       7474 $debug and print "# at (off=$offset len=@{[scalar @$path]}) ", _dump($path->[$offset]), "\n";
  54         202  
927 3703         5669 my $node = $path->[$offset];
928 3703 100       7414 if( exists( $node->{$token} )) {
929 2624 100       5114 if ($offset < $#$path) {
930             my $new = {
931             $token => [$token, @in],
932 1         5 _re_path($self, [$node]) => [@{$path}[$offset..$#$path]],
  1         4  
933             };
934 1         4 splice @$path, $offset, @$path-$offset, $new;
935 1         2 last;
936             }
937             else {
938 2623 100       5333 $debug and print "# descend key=$token @{[_dump($node->{$token})]}\n";
  31         71  
939 2623         4081 $path = $node->{$token};
940 2623         3732 $offset = 0;
941 2623         4327 redo;
942             }
943             }
944             else {
945 1079 100       2219 $debug and print "# add path ($token:@{[_dump(\@in)]}) into @{[_dump($path)]} at off=$offset to end=@{[scalar $#$path]}\n";
  23         51  
  23         48  
  23         515  
946 1079 100       2286 if( $offset == $#$path ) {
947 1072         3323 $node->{$token} = [ $token, @in ];
948             }
949             else {
950             my $new = {
951             _node_key($token) => [ $token, @in ],
952 7         19 _node_key($node) => [@{$path}[$offset..$#{$path}]],
  7         22  
  7         13  
953             };
954 7         23 splice( @$path, $offset, @$path - $offset, $new );
955 7 100       20 $debug and print "# fused node=@{[_dump($new)]} path=@{[_dump($path)]}\n";
  1         3  
  1         2  
956             }
957 1079         2022 last;
958             }
959             }
960              
961 13728 100       26428 if( $debug ) {
962 306         456 my $msg = '';
963 306         438 my $n;
964 306         667 for( $n = 0; $n < @$path; ++$n ) {
965 1093 100       2035 $msg .= ' ' if $n;
966             my $atom = ref($path->[$n]) eq 'HASH'
967 1093 100       2018 ? '{'.join( ' ', keys(%{$path->[$n]})).'}'
  81         229  
968             : $path->[$n]
969             ;
970 1093 100       2706 $msg .= $n == $offset ? "<$atom>" : $atom;
971             }
972 306         2857 print "# at path ($msg)\n";
973             }
974              
975 13728 100       37709 if( $offset >= @$path ) {
    100          
    100          
976 732         2553 push @$path, { $token => [ $token, @in ], '' => undef };
977 732 100       1833 $debug and print "# added remaining @{[_dump($path)]}\n";
  21         48  
978 732         1306 last;
979             }
980             elsif( $token ne $path->[$offset] ) {
981 2099 100       4634 $debug and print "# token $token not present\n";
982             splice @$path, $offset, @$path-$offset, {
983             length $token
984             ? ( _node_key($token) => [$token, @in])
985             : ( '' => undef )
986             ,
987 2099 100       6269 $path->[$offset] => [@{$path}[$offset..$#{$path}]],
  2099         8005  
  2099         3945  
988             };
989 2099 100       5716 $debug and print "# path=@{[_dump($path)]}\n";
  79         159  
990 2099         3468 last;
991             }
992             elsif( not @in ) {
993 917 100       2014 $debug and print "# last token to add\n";
994 917 100       1932 if( defined( $path->[$offset+1] )) {
995 912         1300 ++$offset;
996 912 100       1841 if( ref($path->[$offset]) eq 'HASH' ) {
997 118 100       262 $debug and print "# add sentinel to node\n";
998 118         248 $path->[$offset]{''} = undef;
999             }
1000             else {
1001 794 100       1835 $debug and print "# convert <$path->[$offset]> to node for sentinel\n";
1002             splice @$path, $offset, @$path-$offset, {
1003             '' => undef,
1004 794         1563 $path->[$offset] => [ @{$path}[$offset..$#{$path}] ],
  794         2736  
  794         1429  
1005             };
1006             }
1007             }
1008             else {
1009             # already seen this pattern
1010 5         11 ++$self->{stats_dup};
1011             }
1012 917         1786 last;
1013             }
1014             # if we get here then @_ still contains a token
1015 9980         22185 ++$offset;
1016             }
1017 5110         11620 $list;
1018             }
1019              
1020             sub _insert_node {
1021 282     282   484 my $self = shift;
1022 282         420 my $path = shift;
1023 282         442 my $offset = shift;
1024 282         437 my $token = shift;
1025 282         446 my $debug = shift;
1026 282         511 my $path_end = [@{$path}[$offset..$#{$path}]];
  282         656  
  282         524  
1027             # NB: $path->[$offset] and $[path_end->[0] are equivalent
1028 282         840 my $token_key = _re_path($self, [$token]);
1029 282 100       880 $debug and print "# insert node(@{[_dump($token)]}:@{[_dump(\@_)]}) (key=$token_key)",
  26         58  
  26         66  
1030 26         51 " at path=@{[_dump($path_end)]}\n";
1031 282 100       784 if( ref($path_end->[0]) eq 'HASH' ) {
1032 195 100       665 if( exists($path_end->[0]{$token_key}) ) {
    100          
1033 25 100       66 if( @$path_end > 1 ) {
1034 2         8 my $path_key = _re_path($self, [$path_end->[0]]);
1035 2         11 my $new = {
1036             $path_key => [ @$path_end ],
1037             $token_key => [ $token, @_ ],
1038             };
1039 2 100       8 $debug and print "# +bifurcate new=@{[_dump($new)]}\n";
  1         3  
1040 2         8 splice( @$path, $offset, @$path_end, $new );
1041             }
1042             else {
1043 23         44 my $old_path = $path_end->[0]{$token_key};
1044 23         43 my $new_path = [];
1045 23   100     105 while( @$old_path and _node_eq( $old_path->[0], $token )) {
1046 30 100       89 $debug and print "# identical nodes in sub_path ",
    100          
1047             ref($token) ? _dump($token) : $token, "\n";
1048 30         72 push @$new_path, shift(@$old_path);
1049 30         96 $token = shift @_;
1050             }
1051 23 50       59 if( @$new_path ) {
1052 23         38 my $new;
1053 23         36 my $token_key = $token;
1054 23 100       93 if( @_ ) {
1055 6         15 $new = {
1056             _re_path($self, $old_path) => $old_path,
1057             $token_key => [$token, @_],
1058             };
1059 6 100       19 $debug and print "# insert_node(bifurc) n=@{[_dump([$new])]}\n";
  1         3  
1060             }
1061             else {
1062 17 100       61 $debug and print "# insert $token into old path @{[_dump($old_path)]}\n";
  5         12  
1063 17 100       48 if( @$old_path ) {
1064 11         50 $new = ($self->_insert_path( $old_path, $debug, [$token] ))->[0];
1065             }
1066             else {
1067 6         19 $new = { '' => undef, $token => [$token] };
1068             }
1069             }
1070 23         51 push @$new_path, $new;
1071             }
1072 23         49 $path_end->[0]{$token_key} = $new_path;
1073 23 100       53 $debug and print "# +_insert_node result=@{[_dump($path_end)]}\n";
  6         15  
1074 23         70 splice( @$path, $offset, @$path_end, @$path_end );
1075             }
1076             }
1077             elsif( not _node_eq( $path_end->[0], $token )) {
1078 70 100       198 if( @$path_end > 1 ) {
1079 11         44 my $path_key = _re_path($self, [$path_end->[0]]);
1080 11         70 my $new = {
1081             $path_key => [ @$path_end ],
1082             $token_key => [ $token, @_ ],
1083             };
1084 11 100       40 $debug and print "# path->node1 at $path_key/$token_key @{[_dump($new)]}\n";
  1         4  
1085 11         39 splice( @$path, $offset, @$path_end, $new );
1086             }
1087             else {
1088 59 100       188 $debug and print "# next in path is node, trivial insert at $token_key\n";
1089 59         208 $path_end->[0]{$token_key} = [$token, @_];
1090 59         162 splice( @$path, $offset, @$path_end, @$path_end );
1091             }
1092             }
1093             else {
1094 100   100     403 while( @$path_end and _node_eq( $path_end->[0], $token )) {
1095 131 100       345 $debug and print "# identical nodes @{[_dump([$token])]}\n";
  9         24  
1096 131         233 shift @$path_end;
1097 131         292 $token = shift @_;
1098 131         396 ++$offset;
1099             }
1100 100 100       241 if( @$path_end ) {
1101 57 100       151 $debug and print "# insert at $offset $token:@{[_dump(\@_)]} into @{[_dump($path_end)]}\n";
  4         11  
  4         9  
1102 57         262 $path_end = $self->_insert_path( $path_end, $debug, [$token, @_] );
1103 57 100       166 $debug and print "# got off=$offset s=@{[scalar @_]} path_add=@{[_dump($path_end)]}\n";
  4         11  
  4         14  
1104 57         166 splice( @$path, $offset, @$path - $offset, @$path_end );
1105 57 100       156 $debug and print "# got final=@{[_dump($path)]}\n";
  4         8  
1106             }
1107             else {
1108 43         106 $token_key = _node_key($token);
1109 43         164 my $new = {
1110             '' => undef,
1111             $token_key => [ $token, @_ ],
1112             };
1113 43 100       117 $debug and print "# convert opt @{[_dump($new)]}\n";
  3         8  
1114 43         134 push @$path, $new;
1115             }
1116             }
1117             }
1118             else {
1119 87 100       210 if( @$path_end ) {
1120 74         338 my $new = {
1121             $path_end->[0] => [ @$path_end ],
1122             $token_key => [ $token, @_ ],
1123             };
1124 74 100       195 $debug and print "# atom->node @{[_dump($new)]}\n";
  5         12  
1125 74         222 splice( @$path, $offset, @$path_end, $new );
1126 74 100       200 $debug and print "# out=@{[_dump($path)]}\n";
  5         11  
1127             }
1128             else {
1129 13 100       34 $debug and print "# add opt @{[_dump([$token,@_])]} via $token_key\n";
  4         13  
1130 13         70 push @$path, {
1131             '' => undef,
1132             $token_key => [ $token, @_ ],
1133             };
1134             }
1135             }
1136 282         668 $path;
1137             }
1138              
1139             sub _reduce {
1140 810     810   1331 my $self = shift;
1141 810         1828 my $context = { debug => $self->_debug(DEBUG_TAIL), depth => 0 };
1142              
1143 810 100       1806 if ($self->_debug(DEBUG_TIME)) {
1144 4         11 $self->_init_time_func;
1145 4         12 my $now = $self->{_time_func}->();
1146 4 100       10 if (exists $self->{_begin_time}) {
1147 3         89 printf "# load=%0.6f\n", $now - $self->{_begin_time};
1148             }
1149             else {
1150 1         37 printf "# load-epoch=%0.6f\n", $now;
1151             }
1152 4         19 $self->{_begin_time} = $self->{_time_func}->();
1153             }
1154              
1155 810         1899 my ($head, $tail) = _reduce_path( $self->_path, $context );
1156 810 100       2063 $context->{debug} and print "# final head=", _dump($head), ' tail=', _dump($tail), "\n";
1157 810 100       1704 if( !@$head ) {
1158 651         1233 $self->{path} = $tail;
1159             }
1160             else {
1161             $self->{path} = [
1162 159         379 @{_unrev_path( $tail, $context )},
1163 159         244 @{_unrev_path( $head, $context )},
  159         320  
1164             ];
1165             }
1166              
1167 810 100       2116 if ($self->_debug(DEBUG_TIME)) {
1168 4         12 my $now = $self->{_time_func}->();
1169 4 50       15 if (exists $self->{_begin_time}) {
1170 4         95 printf "# reduce=%0.6f\n", $now - $self->{_begin_time};
1171             }
1172             else {
1173 0         0 printf "# reduce-epoch=%0.6f\n", $now;
1174             }
1175 4         20 $self->{_begin_time} = $self->{_time_func}->();
1176             }
1177              
1178 810 100       1913 $context->{debug} and print "# final path=", _dump($self->{path}), "\n";
1179 810         1865 return $self;
1180             }
1181              
1182             sub _remove_optional {
1183 1710 100   1710   4133 if( exists $_[0]->{''} ) {
1184 350         668 delete $_[0]->{''};
1185 350         672 return 1;
1186             }
1187 1360         2283 return 0;
1188             }
1189              
1190             sub _reduce_path {
1191 810     810   1730 my ($path, $ctx) = @_;
1192 810         2029 my $indent = ' ' x $ctx->{depth};
1193 810         1492 my $debug = $ctx->{debug};
1194 810 100       1880 $debug and print "#$indent _reduce_path $ctx->{depth} ", _dump($path), "\n";
1195 810         1208 my $new;
1196 810         1381 my $head = [];
1197 810         1326 my $tail = [];
1198 810         2418 while( defined( my $p = pop @$path )) {
1199 1159 100       2628 if( ref($p) eq 'HASH' ) {
1200 514         1189 my ($node_head, $node_tail) = _reduce_node($p, _descend($ctx) );
1201 514 100       1613 $debug and print "#$indent| head=", _dump($node_head), " tail=", _dump($node_tail), "\n";
1202 514 100       1429 push @$head, @$node_head if scalar @$node_head;
1203 514 100       3526 push @$tail, ref($node_tail) eq 'HASH' ? $node_tail : @$node_tail;
1204             }
1205             else {
1206 645 100       1332 if( @$head ) {
1207 125 100       305 $debug and print "#$indent| push $p leaves @{[_dump($path)]}\n";
  7         15  
1208 125         412 push @$tail, $p;
1209             }
1210             else {
1211 520 100       1102 $debug and print "#$indent| unshift $p\n";
1212 520         1769 unshift @$tail, $p;
1213             }
1214             }
1215             }
1216 20         86 $debug and print "#$indent| tail nr=@{[scalar @$tail]} t0=", ref($tail->[0]),
1217 810 100       1877 (ref($tail->[0]) eq 'HASH' ? " n=" . scalar(keys %{$tail->[0]}) : '' ),
  18 100       77  
1218             "\n";
1219 810 100 100     2652 if( @$tail > 1
      100        
1220             and ref($tail->[0]) eq 'HASH'
1221 97         435 and keys %{$tail->[0]} == 2
1222             ) {
1223 72         149 my $opt;
1224             my $fixed;
1225 72         126 while( my ($key, $path) = each %{$tail->[0]} ) {
  216         660  
1226 144 100       371 $debug and print "#$indent| scan k=$key p=@{[_dump($path)]}\n";
  14         28  
1227 144 100       361 next unless $path;
1228 123 100 100     437 if (@$path == 1 and ref($path->[0]) eq 'HASH') {
1229 8         22 $opt = $path->[0];
1230             }
1231             else {
1232 115         268 $fixed = $path;
1233             }
1234             }
1235 72 100       223 if( exists $tail->[0]{''} ) {
1236 21         46 my $path = [@{$tail}[1..$#{$tail}]];
  21         61  
  21         46  
1237 21         56 $tail = $tail->[0];
1238 21         73 ($head, $tail, $path) = _slide_tail( $head, $tail, $path, _descend($ctx) );
1239 21         75 $tail = [$tail, @$path];
1240             }
1241             }
1242 810 100       1863 $debug and print "#$indent _reduce_path $ctx->{depth} out head=", _dump($head), ' tail=', _dump($tail), "\n";
1243 810         1811 return ($head, $tail);
1244             }
1245              
1246             sub _reduce_node {
1247 1080     1080   2262 my ($node, $ctx) = @_;
1248 1080         2174 my $indent = ' ' x $ctx->{depth};
1249 1080         1710 my $debug = $ctx->{debug};
1250 1080         2057 my $optional = _remove_optional($node);
1251 1080 100       2499 $debug and print "#$indent _reduce_node $ctx->{depth} in @{[_dump($node)]} opt=$optional\n";
  57         120  
1252 1080 100 100     3034 if( $optional and scalar keys %$node == 1 ) {
1253 70         142 my $path = (values %$node)[0];
1254 70 100       142 if( not grep { ref($_) eq 'HASH' } @$path ) {
  117         344  
1255             # if we have removed an optional, and there is only one path
1256             # left then there is nothing left to compare. Because of the
1257             # optional it cannot participate in any further reductions.
1258             # (unless we test for equality among sub-trees).
1259 61         160 my $result = {
1260             '' => undef,
1261             $path->[0] => $path
1262             };
1263 61 100       159 $debug and print "#$indent| fast fail @{[_dump($result)]}\n";
  2         7  
1264 61         164 return [], $result;
1265             }
1266             }
1267              
1268 1019         1903 my( $fail, $reduce ) = _scan_node( $node, _descend($ctx) );
1269              
1270 1019 100       2767 $debug and print "#$indent|_scan_node done opt=$optional reduce=@{[_dump($reduce)]} fail=@{[_dump($fail)]}\n";
  55         112  
  55         113  
1271              
1272             # We now perform tail reduction on each of the nodes in the reduce
1273             # hash. If we have only one key, we know we will have a successful
1274             # reduction (since everything that was inserted into the node based
1275             # on the value of the last token of each path all mapped to the same
1276             # value).
1277              
1278 1019 100 100     5758 if( @$fail == 0 and keys %$reduce == 1 and not $optional) {
      100        
1279             # every path shares a common path
1280 506         1058 my $path = (values %$reduce)[0];
1281 506         1065 my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1282 506 100       1453 $debug and print "#$indent|_reduce_node $ctx->{depth} common=@{[_dump($common)]} tail=", _dump($tail), "\n";
  50         110  
1283 506         2281 return( $common, $tail );
1284             }
1285              
1286             # this node resulted in a list of paths, game over
1287 513         1115 $ctx->{indent} = $indent;
1288 513         1067 return _reduce_fail( $reduce, $fail, $optional, _descend($ctx) );
1289             }
1290              
1291             sub _reduce_fail {
1292 513     513   1066 my( $reduce, $fail, $optional, $ctx ) = @_;
1293 513         922 my( $debug, $depth, $indent ) = @{$ctx}{qw(debug depth indent)};
  513         1130  
1294 513         877 my %result;
1295 513 100       1218 $result{''} = undef if $optional;
1296 513         737 my $p;
1297 513         1259 for $p (keys %$reduce) {
1298 1181         2054 my $path = $reduce->{$p};
1299 1181 100       2294 if( scalar @$path == 1 ) {
1300 1114         1833 $path = $path->[0];
1301 1114 100       2464 $debug and print "#$indent| -simple opt=$optional unrev @{[_dump($path)]}\n";
  7         15  
1302 1114         2066 $path = _unrev_path($path, _descend($ctx) );
1303 1114         2684 $result{_node_key($path->[0])} = $path;
1304             }
1305             else {
1306 67 100       167 $debug and print "#$indent| _do_reduce(@{[_dump($path)]})\n";
  1         3  
1307 67         159 my ($common, $tail) = _do_reduce( $path, _descend($ctx) );
1308             $path = [
1309             (
1310             ref($tail) eq 'HASH'
1311             ? _unrev_node($tail, _descend($ctx) )
1312             : _unrev_path($tail, _descend($ctx) )
1313             ),
1314 67 100       310 @{_unrev_path($common, _descend($ctx) )}
  67         165  
1315             ];
1316 67 100       248 $debug and print "#$indent| +reduced @{[_dump($path)]}\n";
  1         4  
1317 67         156 $result{_node_key($path->[0])} = $path;
1318             }
1319             }
1320 513         878 my $f;
1321 513         983 for $f( @$fail ) {
1322 219 100       441 $debug and print "#$indent| +fail @{[_dump($f)]}\n";
  3         8  
1323 219         424 $result{$f->[0]} = $f;
1324             }
1325 513 100       1187 $debug and print "#$indent _reduce_fail $depth fail=@{[_dump(\%result)]}\n";
  5         32  
1326 513         2246 return ( [], \%result );
1327             }
1328              
1329             sub _scan_node {
1330 1019     1019   1991 my( $node, $ctx ) = @_;
1331 1019         2115 my $indent = ' ' x $ctx->{depth};
1332 1019         1681 my $debug = $ctx->{debug};
1333              
1334             # For all the paths in the node, reverse them. If the first token
1335             # of the path is a scalar, push it onto an array in a hash keyed by
1336             # the value of the scalar.
1337             #
1338             # If it is a node, call _reduce_node on this node beforehand. If we
1339             # get back a common head, all of the paths in the subnode shared a
1340             # common tail. We then store the common part and the remaining node
1341             # of paths (which is where the paths diverged from the end and install
1342             # this into the same hash. At this point both the common and the tail
1343             # are in reverse order, just as simple scalar paths are.
1344             #
1345             # On the other hand, if there were no common path returned then all
1346             # the paths of the sub-node diverge at the end character. In this
1347             # case the tail cannot participate in any further reductions and will
1348             # appear in forward order.
1349             #
1350             # certainly the hurgliest function in the whole file :(
1351              
1352             # $debug = 1 if $depth >= 8;
1353 1019         2433 my @fail;
1354             my %reduce;
1355              
1356 1019         0 my $n;
1357 1019         2586 for $n(
1358 2743         7088 map { substr($_, index($_, '#')+1) }
1359             sort
1360             map {
1361             join( '|' =>
1362 9029         17887 scalar(grep {ref($_) eq 'HASH'} @{$node->{$_}}),
  2743         4951  
1363             _node_offset($node->{$_}),
1364 2743         4145 scalar @{$node->{$_}},
  2743         10212  
1365             )
1366             . "#$_"
1367             }
1368             keys %$node ) {
1369 2743         4189 my( $end, @path ) = reverse @{$node->{$n}};
  2743         7030  
1370 2743 100       5715 if( ref($end) ne 'HASH' ) {
1371 2137 100       4408 $debug and print "# $indent|_scan_node push reduce ($end:@{[_dump(\@path)]})\n";
  87         182  
1372 2137         3079 push @{$reduce{$end}}, [ $end, @path ];
  2137         7679  
1373             }
1374             else {
1375 606 100       1385 $debug and print "# $indent|_scan_node head=", _dump(\@path), ' tail=', _dump($end), "\n";
1376 606         959 my $new_path;
1377             # deal with sing, singing => s(?:ing)?ing
1378 606 100 100     2633 if( keys %$end == 2 and exists $end->{''} ) {
1379 94         231 my ($key, $opt_path) = each %$end;
1380 94 100       303 ($key, $opt_path) = each %$end if $key eq '';
1381 94         151 $opt_path = [reverse @{$opt_path}];
  94         228  
1382 94 100       237 $debug and print "# $indent| check=", _dump($opt_path), "\n";
1383 94         289 my $end = { '' => undef, $opt_path->[0] => [@$opt_path] };
1384 94         178 my $head = [];
1385 94         205 my $path = [@path];
1386 94         250 ($head, my $slide, $path) = _slide_tail( $head, $end, $path, $ctx );
1387 94 100       334 if( @$head ) {
1388 40         171 $new_path = [ @$head, $slide, @$path ];
1389             }
1390             }
1391 606 100       1209 if( $new_path ) {
1392 40 100       95 $debug and print "# $indent|_scan_node slid=", _dump($new_path), "\n";
1393 40         63 push @{$reduce{$new_path->[0]}}, $new_path;
  40         128  
1394             }
1395             else {
1396 566         1168 my( $common, $tail ) = _reduce_node( $end, _descend($ctx) );
1397 566 100       1575 if( not @$common ) {
1398 219 100       552 $debug and print "# $indent| +failed $n\n";
1399 219         703 push @fail, [reverse(@path), $tail];
1400             }
1401             else {
1402 347         792 my $path = [@path];
1403 347 100       835 $debug and print "# $indent|_scan_node ++recovered common=@{[_dump($common)]} tail=",
  34         69  
1404 34         65 _dump($tail), " path=@{[_dump($path)]}\n";
1405 347 100 100     1720 if( ref($tail) eq 'HASH'
1406             and keys %$tail == 2
1407             ) {
1408 287 100       846 if( exists $tail->{''} ) {
1409 121         289 ($common, $tail, $path) = _slide_tail( $common, $tail, $path, $ctx );
1410             }
1411             }
1412 347 100       596 push @{$reduce{$common->[0]}}, [
  347         1916  
1413             @$common,
1414             (ref($tail) eq 'HASH' ? $tail : @$tail ),
1415             @$path
1416             ];
1417             }
1418             }
1419             }
1420             }
1421 1019 100       2825 $debug and print
1422 55         182 "# $indent|_scan_node counts: reduce=@{[scalar keys %reduce]} fail=@{[scalar @fail]}\n";
  55         209  
1423 1019         2640 return( \@fail, \%reduce );
1424             }
1425              
1426             sub _do_reduce {
1427 573     573   1110 my ($path, $ctx) = @_;
1428 573         1225 my $indent = ' ' x $ctx->{depth};
1429 573         896 my $debug = $ctx->{debug};
1430 573         1499 my $ra = Regexp::Assemble->new(chomp=>0);
1431 573         1603 $ra->debug($debug);
1432 573 100       1302 $debug and print "# $indent| do @{[_dump($path)]}\n";
  51         100  
1433 573         1821 $ra->_insertr( $_ ) for
1434             # When nodes come into the picture, we have to be careful
1435             # about how we insert the paths into the assembly.
1436             # Paths with nodes first, then closest node to front
1437             # then shortest path. Merely because if we can control
1438             # order in which paths containing nodes get inserted,
1439             # then we can make a couple of assumptions that simplify
1440             # the code in _insert_node.
1441             sort {
1442 6000         10572 scalar(grep {ref($_) eq 'HASH'} @$a)
1443 1113 50 100     2054 <=> scalar(grep {ref($_) eq 'HASH'} @$b)
  6578         12440  
1444             ||
1445             _node_offset($b) <=> _node_offset($a)
1446             ||
1447             scalar @$a <=> scalar @$b
1448             }
1449             @$path
1450             ;
1451 573         1328 $path = $ra->_path;
1452 573         1012 my $common = [];
1453 573         3127 push @$common, shift @$path while( ref($path->[0]) ne 'HASH' );
1454 573 100       1430 my $tail = scalar( @$path ) > 1 ? [@$path] : $path->[0];
1455 573 100       1263 $debug and print "# $indent| _do_reduce common=@{[_dump($common)]} tail=@{[_dump($tail)]}\n";
  51         104  
  51         101  
1456 573         2717 return ($common, $tail);
1457             }
1458              
1459             sub _node_offset {
1460             # return the offset that the first node is found, or -ve
1461             # optimised for speed
1462 4422     4422   6385 my $nr = @{$_[0]};
  4422         6891  
1463 4422         6401 my $atom = -1;
1464 4422   100     31167 ref($_[0]->[$atom]) eq 'HASH' and return $atom while ++$atom < $nr;
1465 3458         7646 return -1;
1466             }
1467              
1468             sub _slide_tail {
1469 240     240   20123 my $head = shift;
1470 240         354 my $tail = shift;
1471 240         361 my $path = shift;
1472 240         354 my $ctx = shift;
1473 240         515 my $indent = ' ' x $ctx->{depth};
1474 240         394 my $debug = $ctx->{debug};
1475 240 100       574 $debug and print "# $indent| slide in h=", _dump($head),
1476             ' t=', _dump($tail), ' p=', _dump($path), "\n";
1477 240         553 my $slide_path = (each %$tail)[-1];
1478 240 100       623 $slide_path = (each %$tail)[-1] unless defined $slide_path;
1479 240 100       560 $debug and print "# $indent| slide potential ", _dump($slide_path), " over ", _dump($path), "\n";
1480 240   100     1108 while( defined $path->[0] and $path->[0] eq $slide_path->[0] ) {
1481 154 100       429 $debug and print "# $indent| slide=tail=$slide_path->[0]\n";
1482 154         271 my $slide = shift @$path;
1483 154         238 shift @$slide_path;
1484 154         327 push @$slide_path, $slide;
1485 154         595 push @$head, $slide;
1486             }
1487 240 100       585 $debug and print "# $indent| slide path ", _dump($slide_path), "\n";
1488 240         590 my $slide_node = {
1489             '' => undef,
1490             _node_key($slide_path->[0]) => $slide_path,
1491             };
1492 240 100       624 $debug and print "# $indent| slide out h=", _dump($head),
1493             ' s=', _dump($slide_node), ' p=', _dump($path), "\n";
1494 240         652 return ($head, $slide_node, $path);
1495             }
1496              
1497             sub _unrev_path {
1498 2690     2690   4844 my ($path, $ctx) = @_;
1499 2690         5097 my $indent = ' ' x $ctx->{depth};
1500 2690         4107 my $debug = $ctx->{debug};
1501 2690         3622 my $new;
1502 2690 100       4347 if( not grep { ref($_) } @$path ) {
  6383         13257  
1503 2196 100       4657 $debug and print "# ${indent}_unrev path fast ", _dump($path);
1504 2196         5847 $new = [reverse @$path];
1505 2196 100       4667 $debug and print "# -> ", _dump($new), "\n";
1506 2196         4551 return $new;
1507             }
1508 494 100       1144 $debug and print "# ${indent}unrev path in ", _dump($path), "\n";
1509 494         1285 while( defined( my $p = pop @$path )) {
1510 1483 100       4801 push @$new,
    100          
1511             ref($p) eq 'HASH' ? _unrev_node($p, _descend($ctx) )
1512             : ref($p) eq 'ARRAY' ? _unrev_path($p, _descend($ctx) )
1513             : $p
1514             ;
1515             }
1516 494 100       1101 $debug and print "# ${indent}unrev path out ", _dump($new), "\n";
1517 494         975 return $new;
1518             }
1519              
1520             sub _unrev_node {
1521 630     630   1236 my ($node, $ctx ) = @_;
1522 630         1227 my $indent = ' ' x $ctx->{depth};
1523 630         960 my $debug = $ctx->{debug};
1524 630         1159 my $optional = _remove_optional($node);
1525 630 100       1423 $debug and print "# ${indent}unrev node in ", _dump($node), " opt=$optional\n";
1526 630         916 my $new;
1527 630 100       1446 $new->{''} = undef if $optional;
1528 630         905 my $n;
1529 630         1409 for $n( keys %$node ) {
1530 1167         2323 my $path = _unrev_path($node->{$n}, _descend($ctx) );
1531 1167         2794 $new->{_node_key($path->[0])} = $path;
1532             }
1533 630 100       1506 $debug and print "# ${indent}unrev node out ", _dump($new), "\n";
1534 630         2422 return $new;
1535             }
1536              
1537             sub _node_key {
1538 4653     4653   7893 my $node = shift;
1539 4653 100       9977 return _node_key($node->[0]) if ref($node) eq 'ARRAY';
1540 4642 100       15860 return $node unless ref($node) eq 'HASH';
1541 205         335 my $key = '';
1542 205         301 my $k;
1543 205         500 for $k( keys %$node ) {
1544 452 100       988 next if $k eq '';
1545 374 100 100     1318 $key = $k if $key eq '' or $key gt $k;
1546             }
1547 205         645 return $key;
1548             }
1549              
1550             sub _descend {
1551             # Take a context object, and increase the depth by one.
1552             # By creating a fresh hash each time, we don't have to
1553             # bother adding make-work code to decrease the depth
1554             # when we return from what we called.
1555 6190     6190   9293 my $ctx = shift;
1556 6190         23621 return {%$ctx, depth => $ctx->{depth}+1};
1557             }
1558              
1559             #####################################################################
1560              
1561             sub _make_class {
1562 650     650   1089 my $self = shift;
1563 650         1171 my %set = map { ($_,1) } @_;
  1675         3854  
1564 650 100       1813 delete $set{'\\d'} if exists $set{'\\w'};
1565 650 100       1450 delete $set{'\\D'} if exists $set{'\\W'};
1566             return '.' if exists $set{'.'}
1567             or ($self->{fold_meta_pairs} and (
1568             (exists $set{'\\d'} and exists $set{'\\D'})
1569             or (exists $set{'\\s'} and exists $set{'\\S'})
1570 650 100 100     4895 or (exists $set{'\\w'} and exists $set{'\\W'})
      100        
      100        
1571             ))
1572             ;
1573 632         1184 for my $meta( q/\\d/, q/\\D/, q/\\s/, q/\\S/, q/\\w/, q/\\W/ ) {
1574 3792 100       7692 if( exists $set{$meta} ) {
1575 28         175 my $re = qr/$meta/;
1576 28         47 my @delete;
1577 28   66     308 $_ =~ /^$re$/ and push @delete, $_ for keys %set;
1578 28 100       124 delete @set{@delete} if @delete;
1579             }
1580             }
1581 632 100       1451 return (keys %set)[0] if keys %set == 1;
1582 625         1117 for my $meta( '.', '+', '*', '?', '(', ')', '^', '@', '$', '[', '/', ) {
1583 6875 100       14335 exists $set{"\\$meta"} and $set{$meta} = delete $set{"\\$meta"};
1584             }
1585 625 100       1386 my $dash = exists $set{'-'} ? do { delete($set{'-'}), '-' } : '';
  20         53  
1586 625 100       1259 my $caret = exists $set{'^'} ? do { delete($set{'^'}), '^' } : '';
  7         25  
1587 625         2160 my $class = join( '' => sort keys %set );
1588 625 100 100     2051 $class =~ s/0123456789/\\d/ and $class eq '\\d' and return $class;
1589 622         3782 return "[$dash$class$caret]";
1590             }
1591              
1592             sub _re_sort {
1593 998   100 998   4593 return length $b <=> length $a || $a cmp $b
1594             }
1595              
1596             sub _combine {
1597 140     140   211 my $self = shift;
1598 140         225 my $type = shift;
1599             # print "c in = @{[_dump(\@_)]}\n";
1600             # my $combine =
1601             return '('
1602             . $type
1603 140         241 . do {
1604 140         221 my( @short, @long );
1605 140 100       260 push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
  377         1953  
1606 140 100       408 if( @short == 1 ) {
    100          
1607 31         88 @long = sort _re_sort @long, @short;
1608             }
1609             elsif( @short > 1 ) {
1610             # yucky but true
1611 77         179 my @combine = (_make_class($self, @short), sort _re_sort @long);
1612 77         188 @long = @combine;
1613             }
1614             else {
1615 32         101 @long = sort _re_sort @long;
1616             }
1617 140         469 join( '|', @long );
1618             }
1619             . ')';
1620             # print "combine <$combine>\n";
1621             # $combine;
1622             }
1623              
1624             sub _combine_new {
1625 1738     1738   2919 my $self = shift;
1626 1738         2648 my( @short, @long );
1627 1738 100       3133 push @{ /^$Single_Char$/ ? \@short : \@long}, $_ for @_;
  3244         17480  
1628 1738 100 100     8318 if( @short == 1 and @long == 0 ) {
    100 100        
1629 365         1782 return $short[0];
1630             }
1631             elsif( @short > 1 and @short == @_ ) {
1632 494         1106 return _make_class($self, @short);
1633             }
1634             else {
1635 879 100       4983 return '(?:'
1636             . join( '|' =>
1637             @short > 1
1638             ? ( _make_class($self, @short), sort _re_sort @long)
1639             : ( (sort _re_sort( @long )), @short )
1640             )
1641             . ')';
1642             }
1643             }
1644              
1645             sub _re_path {
1646 4772     4772   7336 my $self = shift;
1647             # in shorter assemblies, _re_path() is the second hottest
1648             # routine. after insert(), so make it fast.
1649              
1650 4772 100       10506 if ($self->{unroll_plus}) {
1651             # but we can't easily make this blockless
1652 72         113 my @arr = @{$_[0]};
  72         163  
1653 72         110 my $str = '';
1654 72         99 my $skip = 0;
1655 72         147 for my $i (0..$#arr) {
1656 127 100 100     1162 if (ref($arr[$i]) eq 'ARRAY') {
    100          
    100          
    100          
1657 1         4 $str .= _re_path($self, $arr[$i]);
1658             }
1659             elsif (ref($arr[$i]) eq 'HASH') {
1660             $str .= exists $arr[$i]->{''}
1661             ? _combine_new( $self,
1662 7         22 map { _re_path( $self, $arr[$i]->{$_} ) } grep { $_ ne '' } keys %{$arr[$i]}
  14         39  
  7         22  
1663             ) . '?'
1664 28 100       62 : _combine_new($self, map { _re_path( $self, $arr[$i]->{$_} ) } keys %{$arr[$i]})
  42         107  
  21         55  
1665             ;
1666             }
1667             elsif ($i < $#arr and $arr[$i+1] =~ /\A$arr[$i]\*(\??)\Z/) {
1668 7 50       34 $str .= "$arr[$i]+" . (defined $1 ? $1 : '');
1669 7         16 ++$skip;
1670             }
1671             elsif ($skip) {
1672 7         19 $skip = 0;
1673             }
1674             else {
1675 84         182 $str .= $arr[$i];
1676             }
1677             }
1678 72         232 return $str;
1679             }
1680              
1681 4700 50       7743 return join( '', @_ ) unless grep { length ref $_ } @_;
  4700         11983  
1682 4700         6609 my $p;
1683             return join '', map {
1684             ref($_) eq '' ? $_
1685 9506 100       25949 : ref($_) eq 'HASH' ? do {
    100          
1686             # In the case of a node, see whether there's a '' which
1687             # indicates that the whole thing is optional and thus
1688             # requires a trailing ?
1689             # Unroll the two different paths to avoid the needless
1690             # grep when it isn't necessary.
1691 1710         2601 $p = $_;
1692             exists $_->{''}
1693             ? _combine_new( $self,
1694 801         1790 map { _re_path( $self, $p->{$_} ) } grep { $_ ne '' } keys %$_
  1508         3071  
1695             ) . '?'
1696 1710 100       5033 : _combine_new($self, map { _re_path( $self, $p->{$_} ) } keys %$_ )
  2394         5124  
1697             }
1698             : _re_path($self, $_) # ref($_) eq 'ARRAY'
1699 4700         6570 } @{$_[0]}
  4700         7834  
1700             }
1701              
1702             sub _lookahead {
1703 132     132   256 my $in = shift;
1704 132         223 my %head;
1705             my $path;
1706 132         282 for $path( keys %$in ) {
1707 328 100       716 next unless defined $in->{$path};
1708             # print "look $path: ", ref($in->{$path}[0]), ".\n";
1709 267 100       659 if( ref($in->{$path}[0]) eq 'HASH' ) {
    100          
1710 15         27 my $next = 0;
1711 15   100     52 while( ref($in->{$path}[$next]) eq 'HASH' and @{$in->{$path}} > $next + 1 ) {
  16         71  
1712 11 100       30 if( exists $in->{$path}[$next]{''} ) {
1713 5         15 ++$head{$in->{$path}[$next+1]};
1714             }
1715 11         33 ++$next;
1716             }
1717 15         50 my $inner = _lookahead( $in->{$path}[0] );
1718 15         66 @head{ keys %$inner } = (values %$inner);
1719             }
1720             elsif( ref($in->{$path}[0]) eq 'ARRAY' ) {
1721 2         6 my $subpath = $in->{$path}[0];
1722 2         9 for( my $sp = 0; $sp < @$subpath; ++$sp ) {
1723 3 100       12 if( ref($subpath->[$sp]) eq 'HASH' ) {
1724 2         8 my $follow = _lookahead( $subpath->[$sp] );
1725 2         12 @head{ keys %$follow } = (values %$follow);
1726 2 100       12 last unless exists $subpath->[$sp]{''};
1727             }
1728             else {
1729 1         4 ++$head{$subpath->[$sp]};
1730 1         4 last;
1731             }
1732             }
1733             }
1734             else {
1735 250         522 ++$head{ $in->{$path}[0] };
1736             }
1737             }
1738             # print "_lookahead ", _dump($in), '==>', _dump([keys %head]), "\n";
1739 132         301 return \%head;
1740             }
1741              
1742             sub _re_path_lookahead {
1743 265     265   400 my $self = shift;
1744 265         380 my $in = shift;
1745             # print "_re_path_la in ", _dump($in), "\n";
1746 265         395 my $out = '';
1747 265         605 for( my $p = 0; $p < @$in; ++$p ) {
1748 573 100       13685 if( ref($in->[$p]) eq '' ) {
    100          
1749 462         715 $out .= $in->[$p];
1750 462         1011 next;
1751             }
1752             elsif( ref($in->[$p]) eq 'ARRAY' ) {
1753 2         10 $out .= _re_path_lookahead($self, $in->[$p]);
1754 2         8 next;
1755             }
1756             # print "$p ", _dump($in->[$p]), "\n";
1757             my $path = [
1758 228         527 map { _re_path_lookahead($self, $in->[$p]{$_} ) }
1759 275         550 grep { $_ ne '' }
1760 109         180 keys %{$in->[$p]}
  109         327  
1761             ];
1762 109         297 my $ahead = _lookahead($in->[$p]);
1763 109         168 my $more = 0;
1764 109 100 100     418 if( exists $in->[$p]{''} and $p + 1 < @$in ) {
1765 12         23 my $next = 1;
1766 12         31 while( $p + $next < @$in ) {
1767 14 100       43 if( ref( $in->[$p+$next] ) eq 'HASH' ) {
1768 2         8 my $follow = _lookahead( $in->[$p+$next] );
1769 2         5 @{$ahead}{ keys %$follow } = (values %$follow);
  2         7  
1770             }
1771             else {
1772 12         29 ++$ahead->{$in->[$p+$next]};
1773 12         20 last;
1774             }
1775 2         6 ++$next;
1776             }
1777 12         23 $more = 1;
1778             }
1779 109         220 my $nr_one = grep { /^$Single_Char$/ } @$path;
  228         1188  
1780 109         194 my $nr = @$path;
1781 109 100 100     383 if( $nr_one > 1 and $nr_one == $nr ) {
1782 18         48 $out .= _make_class($self, @$path);
1783 18 100       95 $out .= '?' if exists $in->[$p]{''};
1784             }
1785             else {
1786             my $zwla = keys(%$ahead) > 1
1787 91 100       280 ? _combine($self, '?=', grep { s/\+$//; $_ } keys %$ahead )
  191         312  
  191         365  
1788             : '';
1789 91 100       262 my $patt = $nr > 1 ? _combine($self, '?:', @$path ) : $path->[0];
1790             # print "have nr=$nr n1=$nr_one n=", _dump($in->[$p]), ' a=', _dump([keys %$ahead]), " zwla=$zwla patt=$patt @{[_dump($path)]}\n";
1791 91 100       220 if( exists $in->[$p]{''} ) {
1792 44 100       220 $out .= $more ? "$zwla(?:$patt)?" : "(?:$zwla$patt)?";
1793             }
1794             else {
1795 47         211 $out .= "$zwla$patt";
1796             }
1797             }
1798             }
1799 265         674 return $out;
1800             }
1801              
1802             sub _re_path_track {
1803 33     33   48 my $self = shift;
1804 33         45 my $in = shift;
1805 33         50 my $normal = shift;
1806 33         43 my $augmented = shift;
1807 33         43 my $o;
1808 33         51 my $simple = '';
1809 33         44 my $augment = '';
1810 33         109 for( my $n = 0; $n < @$in; ++$n ) {
1811 114 100       216 if( ref($in->[$n]) eq '' ) {
1812 104         149 $o = $in->[$n];
1813 104         140 $simple .= $o;
1814 104         165 $augment .= $o;
1815 104 100 100     627 if( (
      100        
      100        
1816             $n < @$in - 1
1817             and ref($in->[$n+1]) eq 'HASH' and exists $in->[$n+1]{''}
1818             )
1819             or $n == @$in - 1
1820             ) {
1821 24         36 push @{$self->{mlist}}, $normal . $simple ;
  24         55  
1822 24 50       62 $augment .= $] < 5.009005
1823             ? "(?{\$self->{m}=$self->{mcount}})"
1824             : "(?{$self->{mcount}})"
1825             ;
1826 24         61 ++$self->{mcount};
1827             }
1828             }
1829             else {
1830             my $path = [
1831 25         80 map { $self->_re_path_track( $in->[$n]{$_}, $normal.$simple , $augmented.$augment ) }
1832 26         56 grep { $_ ne '' }
1833 10         17 keys %{$in->[$n]}
  10         30  
1834             ];
1835 10         48 $o = '(?:' . join( '|' => sort _re_sort @$path ) . ')';
1836 10 100       26 $o .= '?' if exists $in->[$n]{''};
1837 10         20 $simple .= $o;
1838 10         30 $augment .= $o;
1839             }
1840             }
1841 33         107 return $augment;
1842             }
1843              
1844             sub _re_path_pretty {
1845 411     411   638 my $self = shift;
1846 411         603 my $in = shift;
1847 411         610 my $arg = shift;
1848 411         882 my $pre = ' ' x (($arg->{depth}+0) * $arg->{indent});
1849 411         800 my $indent = ' ' x (($arg->{depth}+1) * $arg->{indent});
1850 411         615 my $out = '';
1851 411         628 $arg->{depth}++;
1852 411         597 my $prev_was_paren = 0;
1853 411         1114 for( my $p = 0; $p < @$in; ++$p ) {
1854 1084 100       2262 if( ref($in->[$p]) eq '' ) {
    100          
1855 910 100       1808 $out .= "\n$pre" if $prev_was_paren;
1856 910         1402 $out .= $in->[$p];
1857 910         2266 $prev_was_paren = 0;
1858             }
1859             elsif( ref($in->[$p]) eq 'ARRAY' ) {
1860 3         11 $out .= _re_path($self, $in->[$p]);
1861             }
1862             else {
1863             my $path = [
1864 369         932 map { _re_path_pretty($self, $in->[$p]{$_}, $arg ) }
1865 419         864 grep { $_ ne '' }
1866 171         271 keys %{$in->[$p]}
  171         461  
1867             ];
1868 171         382 my $nr = @$path;
1869 171         278 my( @short, @long );
1870 171 100       332 push @{/^$Single_Char$/ ? \@short : \@long}, $_ for @$path;
  369         2162  
1871 171 100       419 if( @short == $nr ) {
1872 37 100       118 $out .= $nr == 1 ? $path->[0] : _make_class($self, @short);
1873 37 100       174 $out .= '?' if exists $in->[$p]{''};
1874             }
1875             else {
1876 134 100       355 $out .= "\n" if length $out;
1877 134 100       318 $out .= $pre if $p;
1878 134         291 $out .= "(?:\n$indent";
1879 134 100       286 if( @short < 2 ) {
1880 133         234 my $r = 0;
1881             $out .= join( "\n$indent|" => map {
1882 133 100       481 $r++ and $_ =~ s/^\(\?:/\n$indent(?:/;
  298         732  
1883 298         734 $_
1884             }
1885             sort _re_sort @$path
1886             );
1887             }
1888             else {
1889 1         5 $out .= join( "\n$indent|" => ( (sort _re_sort @long), _make_class($self, @short) ));
1890             }
1891 134         350 $out .= "\n$pre)";
1892 134 100       311 if( exists $in->[$p]{''} ) {
1893 37         86 $out .= "\n$pre?";
1894 37         134 $prev_was_paren = 0;
1895             }
1896             else {
1897 97         333 $prev_was_paren = 1;
1898             }
1899             }
1900             }
1901             }
1902 411         662 $arg->{depth}--;
1903 411         1100 return $out;
1904             }
1905              
1906             sub _node_eq {
1907 425 100 100 425   1832 return 0 if not defined $_[0] or not defined $_[1];
1908 422 100       1251 return 0 if ref $_[0] ne ref $_[1];
1909             # Now that we have determined that the reference of each
1910             # argument are the same, we only have to test the first
1911             # one, which gives us a nice micro-optimisation.
1912 381 100       865 if( ref($_[0]) eq 'HASH' ) {
    100          
1913 305         676 keys %{$_[0]} == keys %{$_[1]}
  305         950  
1914             and
1915             # does this short-circuit to avoid _re_path() cost more than it saves?
1916 305 100 100     453 join( '|' => sort keys %{$_[0]}) eq join( '|' => sort keys %{$_[1]})
  272         870  
  272         1518  
1917             and
1918             _re_path(undef, [$_[0]] ) eq _re_path(undef, [$_[1]] );
1919             }
1920             elsif( ref($_[0]) eq 'ARRAY' ) {
1921 9 100       12 scalar @{$_[0]} == scalar @{$_[1]}
  9         16  
  9         32  
1922             and
1923             _re_path(undef, $_[0]) eq _re_path(undef, $_[1]);
1924             }
1925             else {
1926 67         251 $_[0] eq $_[1];
1927             }
1928             }
1929              
1930             sub _pretty_dump {
1931 7     7   25 return sprintf "\\x%02x", ord(shift);
1932             }
1933              
1934             sub _dump {
1935 5579     5579   8986 my $path = shift;
1936 5579 100       11914 return _dump_node($path) if ref($path) eq 'HASH';
1937 5057         7330 my $dump = '[';
1938 5057         6645 my $d;
1939 5057         6892 my $nr = 0;
1940 5057         7822 for $d( @$path ) {
1941 11036 100       21262 $dump .= ' ' if $nr++;
1942 11036 100       24932 if( ref($d) eq 'HASH' ) {
    100          
    100          
1943 1340         2502 $dump .= _dump_node($d);
1944             }
1945             elsif( ref($d) eq 'ARRAY' ) {
1946 242         465 $dump .= _dump($d);
1947             }
1948             elsif( defined $d ) {
1949             # D::C indicates the second test is redundant
1950             # $dump .= ( $d =~ /\s/ or not length $d )
1951 9453 100       25879 $dump .= (
    100          
1952             $d =~ /\s/ ? qq{'$d'} :
1953             $d =~ /^[\x00-\x1f]$/ ? _pretty_dump($d) :
1954             $d
1955             );
1956             }
1957             else {
1958 1         3 $dump .= '*';
1959             }
1960             }
1961 5057         25902 return $dump . ']';
1962             }
1963              
1964             sub _dump_node {
1965 1862     1862   2807 my $node = shift;
1966 1862         2629 my $dump = '{';
1967 1862         2506 my $nr = 0;
1968 1862         2486 my $n;
1969 1862         4770 for $n (sort keys %$node) {
1970 3899 100       8141 $dump .= ' ' if $nr++;
1971             # Devel::Cover shows this to test to be redundant
1972             # $dump .= ( $n eq '' and not defined $node->{$n} )
1973             $dump .= $n eq ''
1974             ? '*'
1975             : ($n =~ /^[\x00-\x1f]$/ ? _pretty_dump($n) : $n)
1976 3899 100       11559 . "=>" . _dump($node->{$n})
    100          
1977             ;
1978             }
1979 1862         9728 return $dump . '}';
1980             }
1981              
1982             =pod
1983              
1984             =head1 NAME
1985              
1986             Regexp::Assemble - Assemble multiple Regular Expressions into a single RE
1987              
1988             =head1 SYNOPSIS
1989              
1990             use Regexp::Assemble;
1991              
1992             my $ra = Regexp::Assemble->new;
1993             $ra->add( 'ab+c' );
1994             $ra->add( 'ab+-' );
1995             $ra->add( 'a\w\d+' );
1996             $ra->add( 'a\d+' );
1997             print $ra->re; # prints a(?:\w?\d+|b+[-c])
1998              
1999             =head1 DESCRIPTION
2000              
2001             Regexp::Assemble takes an arbitrary number of regular expressions
2002             and assembles them into a single regular expression (or RE) that
2003             matches all that the individual REs match.
2004              
2005             As a result, instead of having a large list of expressions to loop
2006             over, a target string only needs to be tested against one expression.
2007             This is interesting when you have several thousand patterns to deal
2008             with. Serious effort is made to produce the smallest pattern possible.
2009              
2010             It is also possible to track the original patterns, so that you can
2011             determine which, among the source patterns that form the assembled
2012             pattern, was the one that caused the match to occur.
2013              
2014             You should realise that large numbers of alternations are processed
2015             in perl's regular expression engine in O(n) time, not O(1). If you
2016             are still having performance problems, you should look at using a
2017             trie. Note that Perl's own regular expression engine will implement
2018             trie optimisations in perl 5.10 (they are already available in
2019             perl 5.9.3 if you want to try them out). C will
2020             do the right thing when it knows it's running on a trie'd perl.
2021             (At least in some version after this one).
2022              
2023             Some more examples of usage appear in the accompanying README. If
2024             that file is not easy to access locally, you can find it on a web
2025             repository such as
2026             L or
2027             L.
2028              
2029             See also L.
2030              
2031             =head1 Methods
2032              
2033             =head2 add(LIST)
2034              
2035             Takes a string, breaks it apart into a set of tokens (respecting
2036             meta characters) and inserts the resulting list into the C
2037             object. It uses a naive regular expression to lex the string
2038             that may be fooled complex expressions (specifically, it will
2039             fail to lex nested parenthetical expressions such as
2040             C correctly). If this is the case, the end of
2041             the string will not be tokenised correctly and returned as one
2042             long string.
2043              
2044             On the one hand, this may indicate that the patterns you are
2045             trying to feed the C object are too complex. Simpler
2046             patterns might allow the algorithm to work more effectively and
2047             perform more reductions in the resulting pattern.
2048              
2049             On the other hand, you can supply your own pattern to perform the
2050             lexing if you need. The test suite contains an example of a lexer
2051             pattern that will match one level of nested parentheses.
2052              
2053             Note that there is an internal optimisation that will bypass a
2054             much of the lexing process. If a string contains no C<\>
2055             (backslash), C<[> (open square bracket), C<(> (open paren),
2056             C (question mark), C<+> (plus), C<*> (star) or C<{> (open
2057             curly), a character split will be performed directly.
2058              
2059             A list of strings may be supplied, thus you can pass it a file
2060             handle of a file opened for reading:
2061              
2062             $re->add( '\d+-\d+-\d+-\d+\.example\.com' );
2063             $re->add( );
2064              
2065             If the file is very large, it may be more efficient to use a
2066             C loop, to read the file line-by-line:
2067              
2068             $re->add($_) while ;
2069              
2070             The C method will chomp the lines automatically. If you
2071             do not want this to occur (you want to keep the record
2072             separator), then disable Cing.
2073              
2074             $re->chomp(0);
2075             $re->add($_) while ;
2076              
2077             This method is chainable.
2078              
2079             =head2 add_file(FILENAME [...])
2080              
2081             Takes a list of file names. Each file is opened and read
2082             line by line. Each line is added to the assembly.
2083              
2084             $r->add_file( 'file.1', 'file.2' );
2085              
2086             If a file cannot be opened, the method will croak. If you cannot
2087             afford to let this happen then you should wrap the call in a C
2088             block.
2089              
2090             Chomping happens automatically unless you the C method
2091             to disable it. By default, input lines are read according to the
2092             value of the C attribute (if defined), and
2093             will otherwise fall back to the current setting of the system C<$/>
2094             variable. The record separator may also be specified on each
2095             call to C. Internally, the routine Cises the
2096             value of C<$/> to whatever is required, for the duration of the
2097             call.
2098              
2099             An alternate calling mechanism using a hash reference is
2100             available. The recognised keys are:
2101              
2102             =over 4
2103              
2104             =item file
2105              
2106             Reference to a list of file names, or the name of a single
2107             file.
2108              
2109             $r->add_file({file => ['file.1', 'file.2', 'file.3']});
2110             $r->add_file({file => 'file.n'});
2111              
2112             =item input_record_separator
2113              
2114             If present, indicates what constitutes a line
2115              
2116             $r->add_file({file => 'data.txt', input_record_separator => ':' });
2117              
2118             =item rs
2119              
2120             An alias for input_record_separator (mnemonic: same as the
2121             English variable names).
2122              
2123             =back
2124              
2125             $r->add_file( {
2126             file => [ 'pattern.txt', 'more.txt' ],
2127             input_record_separator => "\r\n",
2128             });
2129              
2130             =head2 clone()
2131              
2132             Clones the contents of a Regexp::Assemble object and creates a new
2133             object (in other words it performs a deep copy).
2134              
2135             If the Storable module is installed, its dclone method will be used,
2136             otherwise the cloning will be performed using a pure perl approach.
2137              
2138             You can use this method to take a snapshot of the patterns that have
2139             been added so far to an object, and generate an assembly from the
2140             clone. Additional patterns may to be added to the original object
2141             afterwards.
2142              
2143             my $re = $main->clone->re();
2144             $main->add( 'another-pattern-\\d+' );
2145              
2146             =head2 insert(LIST)
2147              
2148             Takes a list of tokens representing a regular expression and
2149             stores them in the object. Note: you should not pass it a bare
2150             regular expression, such as C. You must pass it as
2151             a list of tokens, I C<('a', 'b+', 'c?', 'd*', 'e')>.
2152              
2153             This method is chainable, I:
2154              
2155             my $ra = Regexp::Assemble->new
2156             ->insert( qw[ a b+ c? d* e ] )
2157             ->insert( qw[ a c+ d+ e* f ] );
2158              
2159             Lexing complex patterns with metacharacters and so on can consume
2160             a significant proportion of the overall time to build an assembly.
2161             If you have the information available in a tokenised form, calling
2162             C directly can be a big win.
2163              
2164             =head2 lexstr
2165              
2166             Use the C method if you are curious to see how a pattern
2167             gets tokenised. It takes a scalar on input, representing a pattern,
2168             and returns a reference to an array, containing the tokenised
2169             pattern. You can recover the original pattern by performing a
2170             C:
2171              
2172             my @token = $re->lexstr($pattern);
2173             my $new_pattern = join( '', @token );
2174              
2175             If the original pattern contains unnecessary backslashes, or C<\x4b>
2176             escapes, or quotemeta escapes (C<\Q>...C<\E>) the resulting pattern
2177             may not be identical.
2178              
2179             Call C does not add the pattern to the object, it is merely
2180             for exploratory purposes. It will, however, update various statistical
2181             counters.
2182              
2183             =head2 pre_filter(CODE)
2184              
2185             Allows you to install a callback to check that the pattern being
2186             loaded contains valid input. It receives the pattern as a whole to
2187             be added, before it been tokenised by the lexer. It may to return
2188             0 or C to indicate that the pattern should not be added, any
2189             true value indicates that the contents are fine.
2190              
2191             A filter to strip out trailing comments (marked by #):
2192              
2193             $re->pre_filter( sub { $_[0] =~ s/\s*#.*$//; 1 } );
2194              
2195             A filter to ignore blank lines:
2196              
2197             $re->pre_filter( sub { length(shift) } );
2198              
2199             If you want to remove the filter, pass C as a parameter.
2200              
2201             $ra->pre_filter(undef);
2202              
2203             This method is chainable.
2204              
2205             =head2 filter(CODE)
2206              
2207             Allows you to install a callback to check that the pattern being
2208             loaded contains valid input. It receives a list on input, after it
2209             has been tokenised by the lexer. It may to return 0 or undef to
2210             indicate that the pattern should not be added, any true value
2211             indicates that the contents are fine.
2212              
2213             If you know that all patterns you expect to assemble contain
2214             a restricted set of of tokens (e.g. no spaces), you could do
2215             the following:
2216              
2217             $ra->filter(sub { not grep { / / } @_ });
2218              
2219             or
2220              
2221             sub only_spaces_and_digits {
2222             not grep { ![\d ] } @_
2223             }
2224             $ra->filter( \&only_spaces_and_digits );
2225              
2226             These two examples will silently ignore faulty patterns, If you
2227             want the user to be made aware of the problem you should raise an
2228             error (via C or C), log an error message, whatever is
2229             best. If you want to remove a filter, pass C as a parameter.
2230              
2231             $ra->filter(undef);
2232              
2233             This method is chainable.
2234              
2235             =head2 as_string
2236              
2237             Assemble the expression and return it as a string. You may want to do
2238             this if you are writing the pattern to a file. The following arguments
2239             can be passed to control the aspect of the resulting pattern:
2240              
2241             B, the number of spaces used to indent nested grouping of
2242             a pattern. Use this to produce a pretty-printed pattern (for some
2243             definition of "pretty"). The resulting output is rather verbose. The
2244             reason is to ensure that the metacharacters C<(?:> and C<)> always
2245             occur on otherwise empty lines. This allows you grep the result for an
2246             even more synthetic view of the pattern:
2247              
2248             egrep -v '^ *[()]'
2249              
2250             The result of the above is quite readable. Remember to backslash the
2251             spaces appearing in your own patterns if you wish to use an indented
2252             pattern in an C construct. Indenting is ignored if tracking
2253             is enabled.
2254              
2255             The B argument takes precedence over the C
2256             method/attribute of the object.
2257              
2258             Calling this
2259             method will drain the internal data structure. Large numbers of patterns
2260             can eat a significant amount of memory, and this lets perl recover the
2261             memory used for other purposes.
2262              
2263             If you want to reduce the pattern I continue to add new patterns,
2264             clone the object and reduce the clone, leaving the original object intact.
2265              
2266             =head2 re
2267              
2268             Assembles the pattern and return it as a compiled RE, using the
2269             C operator.
2270              
2271             As with C, calling this method will reset the internal data
2272             structures to free the memory used in assembling the RE.
2273              
2274             The B attribute, documented in the C method, can be
2275             used here (it will be ignored if tracking is enabled).
2276              
2277             With method chaining, it is possible to produce a RE without having
2278             a temporary C object lying around, I:
2279              
2280             my $re = Regexp::Assemble->new
2281             ->add( q[ab+cd+e] )
2282             ->add( q[ac\\d+e] )
2283             ->add( q[c\\d+e] )
2284             ->re;
2285              
2286             The C<$re> variable now contains a Regexp object that can be used
2287             directly:
2288              
2289             while( <> ) {
2290             /$re/ and print "Something in [$_] matched\n";
2291             )
2292              
2293             The C method is called when the object is used in string context
2294             (hence, within an C operator), so by and large you do not even
2295             need to save the RE in a separate variable. The following will work
2296             as expected:
2297              
2298             my $re = Regexp::Assemble->new->add( qw[ fee fie foe fum ] );
2299             while( ) {
2300             if( /($re)/ ) {
2301             print "Here be giants: $1\n";
2302             }
2303             }
2304              
2305             This approach does not work with tracked patterns. The
2306             C and C methods must be used instead, see below.
2307              
2308             =head2 match(SCALAR)
2309              
2310             The following information applies to Perl 5.8 and below. See
2311             the section that follows for information on Perl 5.10.
2312              
2313             If pattern tracking is in use, you must C in order
2314             to make things work correctly. At a minimum, this will make your
2315             code look like this:
2316              
2317             my $did_match = do { use re 'eval'; $target =~ /$ra/ }
2318             if( $did_match ) {
2319             print "matched ", $ra->matched, "\n";
2320             }
2321              
2322             (The main reason is that the C<$^R> variable is currently broken
2323             and an ugly workaround that runs some Perl code during the match
2324             is required, in order to simulate what C<$^R> should be doing. See
2325             Perl bug #32840 for more information if you are curious. The README
2326             also contains more information). This bug has been fixed in 5.10.
2327              
2328             The important thing to note is that with C, THERE
2329             ARE SECURITY IMPLICATIONS WHICH YOU IGNORE AT YOUR PERIL. The problem
2330             is this: if you do not have strict control over the patterns being
2331             fed to C when tracking is enabled, and someone
2332             slips you a pattern such as C and you
2333             attempt to match a string against the resulting pattern, you will
2334             know Fear and Loathing.
2335              
2336             What is more, the C<$^R> workaround means that that tracking does
2337             not work if you perform a bare C pattern match as shown
2338             above. You have to instead call the C method, in order to
2339             supply the necessary context to take care of the tracking housekeeping
2340             details.
2341              
2342             if( defined( my $match = $ra->match($_)) ) {
2343             print " $_ matched by $match\n";
2344             }
2345              
2346             In the case of a successful match, the original matched pattern
2347             is returned directly. The matched pattern will also be available
2348             through the C method.
2349              
2350             (Except that the above is not true for 5.6.0: the C method
2351             returns true or undef, and the C method always returns
2352             undef).
2353              
2354             If you are capturing parts of the pattern I C
2355             you will want to get at the captures. See the C, C,
2356             C and C methods. If you are not using captures
2357             then you may safely ignore this section.
2358              
2359             In 5.10, since the bug concerning C<$^R> has been resolved, there
2360             is no need to use C and the assembled pattern does
2361             not require any Perl code to be executed during the match.
2362              
2363             =head2 new()
2364              
2365             Creates a new C object. The following optional
2366             key/value parameters may be employed. All keys have a corresponding
2367             method that can be used to change the behaviour later on. As a
2368             general rule, especially if you're just starting out, you don't
2369             have to bother with any of these.
2370              
2371             B, a family of optional attributes that allow anchors
2372             (C<^>, C<\b>, C<\Z>...) to be added to the resulting pattern.
2373              
2374             B, sets the C flags to add to the assembled regular
2375             expression. Warning: no error checking is done, you should ensure
2376             that the flags you pass are understood by the version of Perl you
2377             are using. B exists as an alias, for users familiar
2378             with L.
2379              
2380             B, controls whether the pattern should be chomped before being
2381             lexed. Handy if you are reading patterns from a file. By default,
2382             Cing is performed (this behaviour changed as of version 0.24,
2383             prior versions did not chomp automatically).
2384             See also the C attribute and the C method.
2385              
2386             B, slurp the contents of the specified file and add them
2387             to the assembly. Multiple files may be processed by using a list.
2388              
2389             my $r = Regexp::Assemble->new(file => 're.list');
2390              
2391             my $r = Regexp::Assemble->new(file => ['re.1', 're.2']);
2392              
2393             If you really don't want chomping to occur, you will have to set
2394             the C attribute to 0 (zero). You may also want to look at
2395             the C attribute, as well.
2396              
2397             B, controls what constitutes a record
2398             separator when using the C attribute or the C
2399             method. May be abbreviated to B. See the C<$/> variable in
2400             L.
2401              
2402             B, controls whether the pattern should contain zero-width
2403             lookahead assertions (For instance: (?=[abc])(?:bob|alice|charles).
2404             This is not activated by default, because in many circumstances the
2405             cost of processing the assertion itself outweighs the benefit of
2406             its faculty for short-circuiting a match that will fail. This is
2407             sensitive to the probability of a match succeeding, so if you're
2408             worried about performance you'll have to benchmark a sample population
2409             of targets to see which way the benefits lie.
2410              
2411             B, controls whether you want know which of the initial
2412             patterns was the one that matched. See the C method for
2413             more details. Note for version 5.8 of Perl and below, in this mode
2414             of operation YOU SHOULD BE AWARE OF THE SECURITY IMPLICATIONS that
2415             this entails. Perl 5.10 does not suffer from any such restriction.
2416              
2417             B, the number of spaces used to indent nested grouping of
2418             a pattern. Use this to produce a pretty-printed pattern. See the
2419             C method for a more detailed explanation.
2420              
2421             B, allows you to add a callback to enable sanity checks
2422             on the pattern being loaded. This callback is triggered before the
2423             pattern is split apart by the lexer. In other words, it operates
2424             on the entire pattern. If you are loading patterns from a file,
2425             this would be an appropriate place to remove comments.
2426              
2427             B, allows you to add a callback to enable sanity checks on
2428             the pattern being loaded. This callback is triggered after the
2429             pattern has been split apart by the lexer.
2430              
2431             B, controls whether to unroll, for example, C into
2432             C, C, which may allow additional reductions in the
2433             resulting assembled pattern.
2434              
2435             B, controls whether tail reduction occurs or not. If set,
2436             patterns like C will be reduced to C.
2437             That is, the end of the pattern in each part of the b... and d...
2438             alternations is identical, and hence is hoisted out of the alternation
2439             and placed after it. On by default. Turn it off if you're really
2440             pressed for short assembly times.
2441              
2442             B, specifies the pattern used to lex the input lines into
2443             tokens. You could replace the default pattern by a more sophisticated
2444             version that matches arbitrarily nested parentheses, for example.
2445              
2446             B, controls whether copious amounts of output is produced
2447             during the loading stage or the reducing stage of assembly.
2448              
2449             my $ra = Regexp::Assemble->new;
2450             my $rb = Regexp::Assemble->new( chomp => 1, debug => 3 );
2451              
2452             B, controls whether new patterns can be added to the object
2453             after the assembled pattern is generated. DEPRECATED.
2454              
2455             This method/attribute will be removed in a future release. It doesn't
2456             really serve any purpose, and may be more effectively replaced by
2457             cloning an existing C object and spinning out a
2458             pattern from that instead.
2459              
2460             =head2 source()
2461              
2462             When using tracked mode, after a successful match is made, returns
2463             the original source pattern that caused the match. In Perl 5.10,
2464             the C<$^R> variable can be used to as an index to fetch the correct
2465             pattern from the object.
2466              
2467             If no successful match has been performed, or the object is not in
2468             tracked mode, this method returns C.
2469              
2470             my $r = Regexp::Assemble->new->track(1)->add(qw(foo? bar{2} [Rr]at));
2471              
2472             for my $w (qw(this food is rather barren)) {
2473             if ($w =~ /$r/) {
2474             print "$w matched by ", $r->source($^R), $/;
2475             }
2476             else {
2477             print "$w no match\n";
2478             }
2479             }
2480              
2481             =head2 mbegin()
2482              
2483             This method returns a copy of C<@-> at the moment of the
2484             last match. You should ordinarily not need to bother with
2485             this, C should be able to supply all your needs.
2486              
2487             =head2 mend()
2488              
2489             This method returns a copy of C<@+> at the moment of the
2490             last match.
2491              
2492             =head2 mvar(NUMBER)
2493              
2494             The C method returns the captures of the last match.
2495             C corresponds to $1, C to $2, and so on.
2496             C happens to return the target string matched,
2497             as a byproduct of walking down the C<@-> and C<@+> arrays
2498             after the match.
2499              
2500             If called without a parameter, C will return a
2501             reference to an array containing all captures.
2502              
2503             =head2 capture
2504              
2505             The C method returns the the captures of the last
2506             match as an array. Unlink C, this method does not
2507             include the matched string. It is equivalent to getting an
2508             array back that contains C<$1, $2, $3, ...>.
2509              
2510             If no captures were found in the match, an empty array is
2511             returned, rather than C. You are therefore guaranteed
2512             to be able to use C<< for my $c ($re->capture) { ... >>
2513             without have to check whether anything was captured.
2514              
2515             =head2 matched()
2516              
2517             If pattern tracking has been set, via the C attribute,
2518             or through the C method, this method will return the
2519             original pattern of the last successful match. Returns undef
2520             match has yet been performed, or tracking has not been enabled.
2521              
2522             See below in the NOTES section for additional subtleties of
2523             which you should be aware of when tracking patterns.
2524              
2525             Note that this method is not available in 5.6.0, due to
2526             limitations in the implementation of C<(?{...})> at the time.
2527              
2528             =head2 Statistics/Reporting routines
2529              
2530             =head2 stats_add
2531              
2532             Returns the number of patterns added to the assembly (whether
2533             by C or C). Duplicate patterns are not included
2534             in this total.
2535              
2536             =head2 stats_dup
2537              
2538             Returns the number of duplicate patterns added to the assembly.
2539             If non-zero, this may be a sign that something is wrong with
2540             your data (or at the least, some needless redundancy). This may
2541             occur when you have two patterns (for instance, C and
2542             C) which map to the same result.
2543              
2544             =head2 stats_raw()
2545              
2546             Returns the raw number of bytes in the patterns added to the
2547             assembly. This includes both original and duplicate patterns.
2548             For instance, adding the two patterns C and C will
2549             count as 4 bytes.
2550              
2551             =head2 stats_cooked()
2552              
2553             Return the true number of bytes added to the assembly. This
2554             will not include duplicate patterns. Furthermore, it may differ
2555             from the raw bytes due to quotemeta treatment. For instance,
2556             C will count as 7 (not 8) bytes, because C<\,> will
2557             be stored as C<,>. Also, C<\Qa.b\E> is 7 bytes long, however,
2558             after the quotemeta directives are processed, C will be
2559             stored, for a total of 4 bytes.
2560              
2561             =head2 stats_length()
2562              
2563             Returns the length of the resulting assembled expression.
2564             Until C or C have been called, the length
2565             will be 0 (since the assembly will have not yet been
2566             performed). The length includes only the pattern, not the
2567             additional (C<(?-xism...>) fluff added by the compilation.
2568              
2569             =head2 dup_warn(NUMBER|CODEREF)
2570              
2571             Turns warnings about duplicate patterns on or off. By
2572             default, no warnings are emitted. If the method is
2573             called with no parameters, or a true parameter,
2574             the object will carp about patterns it has
2575             already seen. To turn off the warnings, use 0 as a
2576             parameter.
2577              
2578             $r->dup_warn();
2579              
2580             The method may also be passed a code block. In this case
2581             the code will be executed and it will receive a reference
2582             to the object in question, and the lexed pattern.
2583              
2584             $r->dup_warn(
2585             sub {
2586             my $self = shift;
2587             print $self->stats_add, " patterns added at line $.\n",
2588             join( '', @_ ), " added previously\n";
2589             }
2590             )
2591              
2592             =head2 Anchor routines
2593              
2594             Suppose you wish to assemble a series of patterns that all begin
2595             with C<^> and end with C<$> (anchor pattern to the beginning and
2596             end of line). Rather than add the anchors to each and every pattern
2597             (and possibly forget to do so when a new entry is added), you may
2598             specify the anchors in the object, and they will appear in the
2599             resulting pattern, and you no longer need to (or should) put them
2600             in your source patterns. For example, the two following snippets
2601             will produce identical patterns:
2602              
2603             $r->add(qw(^this ^that ^them))->as_string;
2604              
2605             $r->add(qw(this that them))->anchor_line_begin->as_string;
2606              
2607             # both techniques will produce ^th(?:at|em|is)
2608              
2609             All anchors are possible word (C<\b>) boundaries, line
2610             boundaries (C<^> and C<$>) and string boundaries (C<\A>
2611             and C<\Z> (or C<\z> if you absolutely need it)).
2612              
2613             The shortcut C> implies both
2614             C_begin> C_end>
2615             is also available. If different anchors are specified
2616             the most specific anchor wins. For instance, if both
2617             C and C are
2618             specified, C takes precedence.
2619              
2620             All the anchor methods are chainable.
2621              
2622             =head2 anchor_word_begin
2623              
2624             The resulting pattern will be prefixed with a C<\b>
2625             word boundary assertion when the value is true. Set
2626             to 0 to disable.
2627              
2628             $r->add('pre')->anchor_word_begin->as_string;
2629             # produces '\bpre'
2630              
2631             =head2 anchor_word_end
2632              
2633             The resulting pattern will be suffixed with a C<\b>
2634             word boundary assertion when the value is true. Set
2635             to 0 to disable.
2636              
2637             $r->add(qw(ing tion))
2638             ->anchor_word_end
2639             ->as_string; # produces '(?:tion|ing)\b'
2640              
2641             =head2 anchor_word
2642              
2643             The resulting pattern will be have C<\b>
2644             word boundary assertions at the beginning and end
2645             of the pattern when the value is true. Set
2646             to 0 to disable.
2647              
2648             $r->add(qw(cat carrot)
2649             ->anchor_word(1)
2650             ->as_string; # produces '\bca(?:rro)t\b'
2651              
2652             =head2 anchor_line_begin
2653              
2654             The resulting pattern will be prefixed with a C<^>
2655             line boundary assertion when the value is true. Set
2656             to 0 to disable.
2657              
2658             $r->anchor_line_begin;
2659             # or
2660             $r->anchor_line_begin(1);
2661              
2662             =head2 anchor_line_end
2663              
2664             The resulting pattern will be suffixed with a C<$>
2665             line boundary assertion when the value is true. Set
2666             to 0 to disable.
2667              
2668             # turn it off
2669             $r->anchor_line_end(0);
2670              
2671             =head2 anchor_line
2672              
2673             The resulting pattern will be have the C<^> and C<$>
2674             line boundary assertions at the beginning and end
2675             of the pattern, respectively, when the value is true. Set
2676             to 0 to disable.
2677              
2678             $r->add(qw(cat carrot)
2679             ->anchor_line
2680             ->as_string; # produces '^ca(?:rro)t$'
2681              
2682             =head2 anchor_string_begin
2683              
2684             The resulting pattern will be prefixed with a C<\A>
2685             string boundary assertion when the value is true. Set
2686             to 0 to disable.
2687              
2688             $r->anchor_string_begin(1);
2689              
2690             =head2 anchor_string_end
2691              
2692             The resulting pattern will be suffixed with a C<\Z>
2693             string boundary assertion when the value is true. Set
2694             to 0 to disable.
2695              
2696             # disable the string boundary end anchor
2697             $r->anchor_string_end(0);
2698              
2699             =head2 anchor_string_end_absolute
2700              
2701             The resulting pattern will be suffixed with a C<\z>
2702             string boundary assertion when the value is true. Set
2703             to 0 to disable.
2704              
2705             # disable the string boundary absolute end anchor
2706             $r->anchor_string_end_absolute(0);
2707              
2708             If you don't understand the difference between
2709             C<\Z> and C<\z>, the former will probably do what
2710             you want.
2711              
2712             =head2 anchor_string
2713              
2714             The resulting pattern will be have the C<\A> and C<\Z>
2715             string boundary assertions at the beginning and end
2716             of the pattern, respectively, when the value is true. Set
2717             to 0 to disable.
2718              
2719             $r->add(qw(cat carrot)
2720             ->anchor_string
2721             ->as_string; # produces '\Aca(?:rro)t\Z'
2722              
2723             =head2 anchor_string_absolute
2724              
2725             The resulting pattern will be have the C<\A> and C<\z>
2726             string boundary assertions at the beginning and end
2727             of the pattern, respectively, when the value is true. Set
2728             to 0 to disable.
2729              
2730             $r->add(qw(cat carrot)
2731             ->anchor_string_absolute
2732             ->as_string; # produces '\Aca(?:rro)t\z'
2733              
2734             =head2 debug(NUMBER)
2735              
2736             Turns debugging on or off. Statements are printed
2737             to the currently selected file handle (STDOUT by default).
2738             If you are already using this handle, you will have to
2739             arrange to select an output handle to a file of your own
2740             choosing, before call the C, C or C)
2741             functions, otherwise it will scribble all over your
2742             carefully formatted output.
2743              
2744             =over 4
2745              
2746             =item * 0
2747              
2748             Off. Turns off all debugging output.
2749              
2750             =item * 1
2751              
2752             Add. Trace the addition of patterns.
2753              
2754             =item * 2
2755              
2756             Reduce. Trace the process of reduction and assembly.
2757              
2758             =item * 4
2759              
2760             Lex. Trace the lexing of the input patterns into its constituent
2761             tokens.
2762              
2763             =item * 8
2764              
2765             Time. Print to STDOUT the time taken to load all the patterns. This is
2766             nothing more than the difference between the time the object was
2767             instantiated and the time reduction was initiated.
2768              
2769             # load=
2770              
2771             Any lengthy computation performed in the client code will be reflected
2772             in this value. Another line will be printed after reduction is
2773             complete.
2774              
2775             # reduce=
2776              
2777             The above output lines will be changed to C and
2778             C if the internal state of the object is corrupted
2779             and the initial timestamp is lost.
2780              
2781             The code attempts to load L in order to report fractional
2782             seconds. If this is not successful, the elapsed time is displayed
2783             in whole seconds.
2784              
2785             =back
2786              
2787             Values can be added (or or'ed together) to trace everything
2788              
2789             $r->debug(7)->add( '\\d+abc' );
2790              
2791             Calling C with no arguments turns debugging off.
2792              
2793             =head2 dump()
2794              
2795             Produces a synthetic view of the internal data structure. How
2796             to interpret the results is left as an exercise to the reader.
2797              
2798             print $r->dump;
2799              
2800             =head2 chomp(0|1)
2801              
2802             Turns chomping on or off.
2803              
2804             IMPORTANT: As of version 0.24, chomping is now on by default as it
2805             makes C Just Work. The only time you may run into trouble
2806             is with C. So don't do that, or else explicitly turn
2807             off chomping.
2808              
2809             To avoid incorporating (spurious)
2810             record separators (such as "\n" on Unix) when reading from a file,
2811             C Cs its input. If you don't want this to happen,
2812             call C with a false value.
2813              
2814             $re->chomp(0); # really want the record separators
2815             $re->add();
2816              
2817             =head2 fold_meta_pairs(NUMBER)
2818              
2819             Determines whether C<\s>, C<\S> and C<\w>, C<\W> and C<\d>, C<\D>
2820             are folded into a C<.> (dot). Folding happens by default (for
2821             reasons of backwards compatibility, even though it is wrong when
2822             the C expression modifier is active).
2823              
2824             Call this method with a false value to prevent this behaviour (which
2825             is only a problem when dealing with C<\n> if the C expression
2826             modifier is also set).
2827              
2828             $re->add( '\\w', '\\W' );
2829             my $clone = $re->clone;
2830              
2831             $clone->fold_meta_pairs(0);
2832             print $clone->as_string; # prints '.'
2833             print $re->as_string; # print '[\W\w]'
2834              
2835             =head2 indent(NUMBER)
2836              
2837             Sets the level of indent for pretty-printing nested groups
2838             within a pattern. See the C method for more details.
2839             When called without a parameter, no indenting is performed.
2840              
2841             $re->indent( 4 );
2842             print $re->as_string;
2843              
2844             =head2 lookahead(0|1)
2845              
2846             Turns on zero-width lookahead assertions. This is usually
2847             beneficial when you expect that the pattern will usually fail.
2848             If you expect that the pattern will usually match you will
2849             probably be worse off.
2850              
2851             =head2 flags(STRING)
2852              
2853             Sets the flags that govern how the pattern behaves (for
2854             versions of Perl up to 5.9 or so, these are C). By
2855             default no flags are enabled.
2856              
2857             =head2 modifiers(STRING)
2858              
2859             An alias of the C method, for users familiar with
2860             C.
2861              
2862             =head2 track(0|1)
2863              
2864             Turns tracking on or off. When this attribute is enabled,
2865             additional housekeeping information is inserted into the
2866             assembled expression using C<({...}> embedded code
2867             constructs. This provides the necessary information to
2868             determine which, of the original patterns added, was the
2869             one that caused the match.
2870              
2871             $re->track( 1 );
2872             if( $target =~ /$re/ ) {
2873             print "$target matched by ", $re->matched, "\n";
2874             }
2875              
2876             Note that when this functionality is enabled, no
2877             reduction is performed and no character classes are
2878             generated. In other words, C is not
2879             reduced down to C<(?:br|t)ag> and C is not
2880             reduced to C.
2881              
2882             =head2 unroll_plus(0|1)
2883              
2884             Turns the unrolling of plus metacharacters on or off. When
2885             a pattern is broken up, C becomes C, C (and
2886             C becomes C, C. This may allow the freed C
2887             to assemble with other patterns. Not enabled by default.
2888              
2889             =head2 lex(SCALAR)
2890              
2891             Change the pattern used to break a string apart into tokens.
2892             You can examine the C script as a starting point.
2893              
2894             =head2 reduce(0|1)
2895              
2896             Turns pattern reduction on or off. A reduced pattern may
2897             be considerably shorter than an unreduced pattern. Consider
2898             C I C. An unreduced
2899             pattern will be very similar to those produced by
2900             C. Reduction is on by default. Turning
2901             it off speeds assembly (but assembly is pretty fast -- it's
2902             the breaking up of the initial patterns in the lexing stage
2903             that can consume a non-negligible amount of time).
2904              
2905             =head2 mutable(0|1)
2906              
2907             This method has been marked as DEPRECATED. It will be removed
2908             in a future release. See the C method for a technique
2909             to replace its functionality.
2910              
2911             =head2 reset()
2912              
2913             Empties out the patterns that have been Ced or C-ed
2914             into the object. Does not modify the state of controller attributes
2915             such as C, C, C and the like.
2916              
2917             =head2 Default_Lexer
2918              
2919             B the C function is a class method, not
2920             an object method. It is a fatal error to call it as an object
2921             method.
2922              
2923             The C method lets you replace the default pattern
2924             used for all subsequently created C objects. It
2925             will not have any effect on existing objects. (It is also possible
2926             to override the lexer pattern used on a per-object basis).
2927              
2928             The parameter should be an ordinary scalar, not a compiled
2929             pattern. If the pattern fails to match all parts of the string,
2930             the missing parts will be returned as single chunks. Therefore
2931             the following pattern is legal (albeit rather cork-brained):
2932              
2933             Regexp::Assemble::Default_Lexer( '\\d' );
2934              
2935             The above pattern will split up input strings digit by digit, and
2936             all non-digit characters as single chunks.
2937              
2938             =head1 DIAGNOSTICS
2939              
2940             "Cannot pass a C to Default_Lexer"
2941              
2942             You tried to replace the default lexer pattern with an object
2943             instead of a scalar. Solution: You probably tried to call
2944             C<< $obj->Default_Lexer >>. Call the qualified class method instead
2945             C.
2946              
2947             "filter method not passed a coderef"
2948              
2949             "pre_filter method not passed a coderef"
2950              
2951             A reference to a subroutine (anonymous or otherwise) was expected.
2952             Solution: read the documentation for the C method.
2953              
2954             "duplicate pattern added: /.../"
2955              
2956             The C attribute is active, and a duplicate pattern was
2957             added (well duh!). Solution: clean your data.
2958              
2959             "cannot open [file] for input: [reason]"
2960              
2961             The C method was unable to open the specified file for
2962             whatever reason. Solution: make sure the file exists and the script
2963             has the required privileges to read it.
2964              
2965             =head1 NOTES
2966              
2967             This module has been tested successfully with a range of versions
2968             of perl, from 5.005_03 to 5.9.3. Use of 5.6.0 is not recommended.
2969              
2970             The expressions produced by this module can be used with the PCRE
2971             library.
2972              
2973             Remember to "double up" your backslashes if the patterns are
2974             hard-coded as constants in your program. That is, you should
2975             literally C rather than C. It
2976             usually will work either way, but it's good practice to do so.
2977              
2978             Where possible, supply the simplest tokens possible. Don't add
2979             C when C will do. The reason is that
2980             if you also add C the resulting assembly changes
2981             dramatically: C I
2982             C. Since R::A doesn't perform enough analysis,
2983             it won't "unroll" the C<{2}> quantifier, and will fail to notice
2984             the divergence after the first C<-d\d+>.
2985              
2986             Furthermore, when the string 'X-123000P' is matched against the
2987             first assembly, the regexp engine will have to backtrack over each
2988             alternation (the one that ends in Y B the one that ends in Z)
2989             before determining that there is no match. No such backtracking
2990             occurs in the second pattern: as soon as the engine encounters the
2991             'P' in the target string, neither of the alternations at that point
2992             (C<-\d+Y> or C) could succeed and so the match fails.
2993              
2994             C does, however, know how to build character
2995             classes. Given C, C and C, it will assemble these
2996             into C. When C<-> (dash) appears as a candidate for a
2997             character class it will be the first character in the class. When
2998             C<^> (circumflex) appears as a candidate for a character class it
2999             will be the last character in the class.
3000              
3001             It also knows about meta-characters than can "absorb" regular
3002             characters. For instance, given C and C, it knows that
3003             C<5> can be represented by C<\d> and so the assembly is just C.
3004             The "absorbent" meta-characters it deals with are C<.>, C<\d>, C<\s>
3005             and C<\W> and their complements. It will replace C<\d>/C<\D>,
3006             C<\s>/C<\S> and C<\w>/C<\W> by C<.> (dot), and it will drop C<\d>
3007             if C<\w> is also present (as will C<\D> in the presence of C<\W>).
3008              
3009             C deals correctly with C's propensity
3010             to backslash many characters that have no need to be. Backslashes on
3011             non-metacharacters will be removed. Similarly, in character classes,
3012             a number of characters lose their magic and so no longer need to be
3013             backslashed within a character class. Two common examples are C<.>
3014             (dot) and C<$>. Such characters will lose their backslash.
3015              
3016             At the same time, it will also process C<\Q...\E> sequences. When
3017             such a sequence is encountered, the inner section is extracted and
3018             C is applied to the section. The resulting quoted text
3019             is then used in place of the original unquoted text, and the C<\Q>
3020             and C<\E> metacharacters are thrown away. Similar processing occurs
3021             with the C<\U...\E> and C<\L...\E> sequences. This may have surprising
3022             effects when using a dispatch table. In this case, you will need
3023             to know exactly what the module makes of your input. Use the C
3024             method to find out what's going on:
3025              
3026             $pattern = join( '', @{$re->lexstr($pattern)} );
3027              
3028             If all the digits 0..9 appear in a character class, C
3029             will replace them by C<\d>. I'd do it for letters as well, but
3030             thinking about accented characters and other glyphs hurts my head.
3031              
3032             In an alternation, the longest paths are chosen first (for example,
3033             C). When two paths have the same length, the path
3034             with the most subpaths will appear first. This aims to put the
3035             "busiest" paths to the front of the alternation. For example, the
3036             list C, C, C, C and C will produce the
3037             pattern C<(?:f(?:ew|ig|un)|b(?:ad|it))>. See F for a
3038             real-world example of how alternations are sorted. Once you have
3039             looked at that, everything should be crystal clear.
3040              
3041             When tracking is in use, no reduction is performed. nor are
3042             character classes formed. The reason is that it is
3043             too difficult to determine the original pattern afterwards. Consider the
3044             two patterns C and C. These should be reduced to
3045             C. The final character matches one of two possibilities.
3046             To resolve whether it matched an C<'e'> or C<'m'> would require
3047             keeping track of the fact that the pattern finished up in a character
3048             class, which would the require a whole lot more work to figure out
3049             which character of the class matched. Without character classes
3050             it becomes much easier. Instead, C is produced, which
3051             lets us find out more simply where we ended up.
3052              
3053             Similarly, C and C should form C<(?:dog|sea)food>.
3054             When the pattern is being assembled, the tracking decision needs
3055             to be made at the end of the grouping, but the tail of the pattern
3056             has not yet been visited. Deferring things to make this work correctly
3057             is a vast hassle. In this case, the pattern becomes merely
3058             C<(?:dogfood|seafood>. Tracked patterns will therefore be bulkier than
3059             simple patterns.
3060              
3061             There is an open bug on this issue:
3062              
3063             L
3064              
3065             If this bug is ever resolved, tracking would become much easier to
3066             deal with (none of the C hassle would be required - you could
3067             just match like a regular RE and it would Just Work).
3068              
3069             =head1 SEE ALSO
3070              
3071             =over 4
3072              
3073             =item L
3074              
3075             General information about Perl's regular expressions.
3076              
3077             =item L
3078              
3079             Specific information about C.
3080              
3081             =item Regex::PreSuf
3082              
3083             C takes a string and chops it itself into tokens of
3084             length 1. Since it can't deal with tokens of more than one character,
3085             it can't deal with meta-characters and thus no regular expressions.
3086             Which is the main reason why I wrote this module.
3087              
3088             =item Regexp::Optimizer
3089              
3090             C produces regular expressions that are similar to
3091             those produced by R::A with reductions switched off. It's biggest
3092             drawback is that it is exponentially slower than Regexp::Assemble on
3093             very large sets of patterns.
3094              
3095             =item Regexp::Parser
3096              
3097             Fine grained analysis of regular expressions.
3098              
3099             =item Regexp::Trie
3100              
3101             Funnily enough, this was my working name for C
3102             during its development. I changed the name because I thought it
3103             was too obscure. Anyway, C does much the same as
3104             C and C except that it runs
3105             much faster (according to the author). It does not recognise
3106             meta characters (that is, 'a+b' is interpreted as 'a\+b').
3107              
3108             =item Text::Trie
3109              
3110             C is well worth investigating. Tries can outperform very
3111             bushy (read: many alternations) patterns.
3112              
3113             =item Tree::Trie
3114              
3115             C is another module that builds tries. The algorithm that
3116             C uses appears to be quite similar to the
3117             algorithm described therein, except that C solves its
3118             end-marker problem without having to rewrite the leaves.
3119              
3120             =back
3121              
3122             =head1 See Also
3123              
3124             For alternatives to this module, consider one of:
3125              
3126             =over 4
3127              
3128             =item o L
3129              
3130             =item o L
3131              
3132             =item o L
3133              
3134             =back
3135              
3136             =head1 LIMITATIONS
3137              
3138             Some mildly complex cases are not handled well. See examples/failure.01.pl
3139             and L.
3140              
3141             See also L for a discussion
3142             of some of the issues arising with the use of a huge number of alterations. Thanx to
3143             Slaven Rezic for the details of trie 'v' non-trie operations within Perl which influence
3144             regexp handling of alternations.
3145              
3146             does not attempt to find common substrings. For
3147             instance, it will not collapse C down to C.
3148             If there's a module out there that performs this sort of string
3149             analysis I'd like to know about it. But keep in mind that the
3150             algorithms that do this are very expensive: quadratic or worse.
3151              
3152             C does not interpret meta-character modifiers.
3153             For instance, if the following two patterns are
3154             given: C and C, it will not determine that C<\d> can be
3155             matched by C<\d+>. Instead, it will produce C. Along
3156             a similar line of reasoning, it will not determine that C and
3157             C is equivalent to C (It will produce C
3158             instead).
3159              
3160             You cannot remove a pattern that has been added to an object. You'll
3161             just have to start over again. Adding a pattern is difficult enough,
3162             I'd need a solid argument to convince me to add a C method.
3163             If you need to do this you should read the documentation for the
3164             C method.
3165              
3166             C does not (yet)? employ the C<(?E...)>
3167             construct.
3168              
3169             The module does not produce POSIX-style regular expressions. This
3170             would be quite easy to add, if there was a demand for it.
3171              
3172             =head1 BUGS
3173              
3174             Patterns that generate look-ahead assertions sometimes produce
3175             incorrect patterns in certain obscure corner cases. If you
3176             suspect that this is occurring in your pattern, disable
3177             lookaheads.
3178              
3179             Tracking doesn't really work at all with 5.6.0. It works better
3180             in subsequent 5.6 releases. For maximum reliability, the use of
3181             a 5.8 release is strongly recommended. Tracking barely works with
3182             5.005_04. Of note, using C<\d>-style meta-characters invariably
3183             causes panics. Tracking really comes into its own in Perl 5.10.
3184              
3185             If you feed C patterns with nested parentheses,
3186             there is a chance that the resulting pattern will be uncompilable
3187             due to mismatched parentheses (not enough closing parentheses). This
3188             is normal, so long as the default lexer pattern is used. If you want
3189             to find out which pattern among a list of 3000 patterns are to blame
3190             (speaking from experience here), the F script offers
3191             a strategy for pinpointing the pattern at fault. While you may not
3192             be able to use the script directly, the general approach is easy to
3193             implement.
3194              
3195             The algorithm used to assemble the regular expressions makes extensive
3196             use of mutually-recursive functions (that is, A calls B, B calls
3197             A, ...) For deeply similar expressions, it may be possible to provoke
3198             "Deep recursion" warnings.
3199              
3200             The module has been tested extensively, and has an extensive test
3201             suite (that achieves close to 100% statement coverage), but you
3202             never know... A bug may manifest itself in two ways: creating a
3203             pattern that cannot be compiled, such as C, or a pattern
3204             that compiles correctly but that either matches things it shouldn't,
3205             or doesn't match things it should. It is assumed that Such problems
3206             will occur when the reduction algorithm encounters some sort of
3207             edge case. A temporary work-around is to disable reductions:
3208              
3209             my $pattern = $assembler->reduce(0)->re;
3210              
3211             A discussion about implementation details and where bugs might lurk
3212             appears in the README file. If this file is not available locally,
3213             you should be able to find a copy on the Web at your nearest CPAN
3214             mirror.
3215              
3216             Seriously, though, a number of people have been using this module to
3217             create expressions anywhere from 140Kb to 600Kb in size, and it seems to
3218             be working according to spec. Thus, I don't think there are any serious
3219             bugs remaining.
3220              
3221             If you are feeling brave, extensive debugging traces are available to
3222             figure out where assembly goes wrong.
3223              
3224             Please report all bugs at
3225             L
3226              
3227             Make sure you include the output from the following two commands:
3228              
3229             perl -MRegexp::Assemble -le 'print $Regexp::Assemble::VERSION'
3230             perl -V
3231              
3232             There is a mailing list for the discussion of C.
3233             Subscription details are available at
3234             L.
3235              
3236             =head1 ACKNOWLEDGEMENTS
3237              
3238             This module grew out of work I did building access maps for Postfix,
3239             a modern SMTP mail transfer agent. See L
3240             for more information. I used Perl to build large regular expressions
3241             for blocking dynamic/residential IP addresses to cut down on spam
3242             and viruses. Once I had the code running for this, it was easy to
3243             start adding stuff to block really blatant spam subject lines, bogus
3244             HELO strings, spammer mailer-ids and more...
3245              
3246             I presented the work at the French Perl Workshop in 2004, and the
3247             thing most people asked was whether the underlying mechanism for
3248             assembling the REs was available as a module. At that time it was
3249             nothing more that a twisty maze of scripts, all different. The
3250             interest shown indicated that a module was called for. I'd like to
3251             thank the people who showed interest. Hey, it's going to make I
3252             messy scripts smaller, in any case.
3253              
3254             Thomas Drugeon was a valuable sounding board for trying out
3255             early ideas. Jean Forget and Philippe Blayo looked over an early
3256             version. H.Merijn Brandt stopped over in Paris one evening, and
3257             discussed things over a few beers.
3258              
3259             Nicholas Clark pointed out that while what this module does
3260             (?:c|sh)ould be done in perl's core, as per the 2004 TODO, he
3261             encouraged me to continue with the development of this module. In
3262             any event, this module allows one to gauge the difficulty of
3263             undertaking the endeavour in C. I'd rather gouge my eyes out with
3264             a blunt pencil.
3265              
3266             Paul Johnson settled the question as to whether this module should
3267             live in the Regex:: namespace, or Regexp:: namespace. If you're
3268             not convinced, try running the following one-liner:
3269              
3270             perl -le 'print ref qr//'
3271              
3272             Philippe Bruhat found a couple of corner cases where this module
3273             could produce incorrect results. Such feedback is invaluable,
3274             and only improves the module's quality.
3275              
3276             =head1 Machine-Readable Change Log
3277              
3278             The file Changes was converted into Changelog.ini by L.
3279              
3280             =head1 AUTHOR
3281              
3282             David Landgren
3283              
3284             Copyright (C) 2004-2011. All rights reserved.
3285              
3286             http://www.landgren.net/perl/
3287              
3288             If you use this module, I'd love to hear about what you're using
3289             it for. If you want to be informed of updates, send me a note.
3290              
3291             Ron Savage is co-maint of the module, starting with V 0.36.
3292              
3293             =head1 Repository
3294              
3295             L
3296              
3297             =head1 TODO
3298              
3299             1. Tree equivalencies. Currently, /contend/ /content/ /resend/ /resent/
3300             produces (?:conten[dt]|resend[dt]) but it is possible to produce
3301             (?:cont|res)en[dt] if one can spot the common tail nodes (and walk back
3302             the equivalent paths). Or be by me my => /[bm][ey]/ in the simplest case.
3303              
3304             To do this requires a certain amount of restructuring of the code.
3305             Currently, the algorithm uses a two-phase approach. In the first
3306             phase, the trie is traversed and reductions are performed. In the
3307             second phase, the reduced trie is traversed and the pattern is
3308             emitted.
3309              
3310             What has to occur is that the reduction and emission have to occur
3311             together. As a node is completed, it is replaced by its string
3312             representation. This then allows child nodes to be compared for
3313             equality with a simple 'eq'. Since there is only a single traversal,
3314             the overall generation time might drop, even though the context
3315             baggage required to delve through the tree will be more expensive
3316             to carry along (a hash rather than a couple of scalars).
3317              
3318             Actually, a simpler approach is to take on a secret sentinel
3319             atom at the end of every pattern, which gives the reduction
3320             algorithm sufficient traction to create a perfect trie.
3321              
3322             I'm rewriting the reduction code using this technique.
3323              
3324             2. Investigate how (?>foo) works. Can it be applied?
3325              
3326             5. How can a tracked pattern be serialised? (Add freeze and thaw methods).
3327              
3328             6. Store callbacks per tracked pattern.
3329              
3330             12. utf-8... hmmmm...
3331              
3332             14. Adding qr//'ed patterns. For example, consider
3333             $r->add ( qr/^abc/i )
3334             ->add( qr/^abd/ )
3335             ->add( qr/^ab e/x );
3336             this should admit abc abC aBc aBC abd abe as matches
3337              
3338             16. Allow a fast, unsafe tracking mode, that can be used if a(?bc)?
3339             can't happen. (Possibly carp if it does appear during traversal)?
3340              
3341             17. given a-\d+-\d+-\d+-\d+-b, produce a(?:-\d+){4}-b. Something
3342             along the lines of (.{4))(\1+) would let the regexp engine
3343             itself be brought to bear on the matter, which is a rather
3344             appealing idea. Consider
3345              
3346             while(/(?!\+)(\S{2,}?)(\1+)/g) { ... $1, $2 ... }
3347              
3348             as a starting point.
3349              
3350             19. The reduction code has become unbelievably baroque. Adding code
3351             to handle (sting,singing,sing) => s(?:(?:ing)?|t)ing was far
3352             too difficult. Adding more stuff just breaks existing behaviour.
3353             And fixing the ^abcd$ ... bug broke stuff all over again.
3354             Now that the corner cases are more clearly identified, a full
3355             rewrite of the reduction code is needed. And would admit the
3356             possibility of implementing items 1 and 17.
3357              
3358             20. Handle debug unrev with a separate bit
3359              
3360             23. Japhy's http://www.perlmonks.org/index.pl?node_id=90876 list2range
3361             regexp
3362              
3363             24. Lookahead assertions contain serious bugs (as shown by
3364             assembling powersets. Need to save more context during reduction,
3365             which in turn will simplify the preparation of the lookahead
3366             classes. See also 19.
3367              
3368             26. _lex() swamps the overall run-time. It stems from the decision
3369             to use a single regexp to pull apart any pattern. A suite of
3370             simpler regexp to pick of parens, char classes, quantifiers
3371             and bare tokens may be faster. (This has been implemented as
3372             _fastlex(), but it's only marginally faster. Perhaps split-by-
3373             char and lex a la C?
3374              
3375             27. We don't, as yet, unroll_plus a paren e.g. (abc)+?
3376              
3377             28. We don't reroll unrolled a a* to a+ in indented or tracked
3378             output
3379              
3380             29. Use (*MARK n) in blead for tracked patterns, and use (*FAIL) for
3381             the unmatchable pattern.
3382              
3383             =head1 LICENSE
3384              
3385             This library is free software; you can redistribute it and/or modify
3386             it under the same terms as Perl itself.
3387              
3388             =cut
3389              
3390             # Return a +ve value to tell Perl the module is ready to go.
3391              
3392             'The Lusty Decadent Delights of Imperial Pompeii';