File Coverage

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm
Criterion Covered Total %
statement 400 523 76.4
branch 152 272 55.8
condition 92 151 60.9
subroutine 29 33 87.8
pod 3 14 21.4
total 676 993 68.0


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Copyright 2006 Apache Software Foundation
3             #
4             # Licensed under the Apache License, Version 2.0 (the "License");
5             # you may not use this file except in compliance with the License.
6             # You may obtain a copy of the License at
7             #
8             # http://www.apache.org/licenses/LICENSE-2.0
9             #
10             # Unless required by applicable law or agreed to in writing, software
11             # distributed under the License is distributed on an "AS IS" BASIS,
12             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13             # See the License for the specific language governing permissions and
14             # limitations under the License.
15             # </@LICENSE>
16              
17             =head1 NAME
18              
19             Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
20              
21             =head1 SYNOPSIS
22              
23             This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
24             suitable for use in Rule2XSBody rules or other parallel matching algorithms.
25              
26             =cut
27              
28             package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;
29              
30 1     1   8 use Mail::SpamAssassin::Plugin;
  1         2  
  1         31  
31 1     1   7 use Mail::SpamAssassin::Logger;
  1         2  
  1         81  
32 1     1   7 use Mail::SpamAssassin::Util qw(untaint_var qr_to_string);
  1         3  
  1         83  
33 1     1   299 use Mail::SpamAssassin::Util::Progress;
  1         3  
  1         47  
34              
35 1     1   8 use Errno qw(ENOENT EACCES EEXIST);
  1         2  
  1         104  
36 1     1   792 use Data::Dumper;
  1         6562  
  1         187  
37              
38 1     1   12 use strict;
  1         2  
  1         26  
39 1     1   4 use warnings;
  1         3  
  1         35  
40             # use bytes;
41 1     1   5 use re 'taint';
  1         5  
  1         37  
42              
43             # Not a constant hashref for 5.6 compat
44 1     1   5 use constant SLOT_BASE => 0;
  1         2  
  1         76  
45 1     1   7 use constant SLOT_NAME => 1;
  1         2  
  1         40  
46 1     1   5 use constant SLOT_ORIG => 2;
  1         2  
  1         36  
47 1     1   5 use constant SLOT_LEN_BASE => 3;
  1         4  
  1         36  
48 1     1   5 use constant SLOT_BASE_INITIAL => 4;
  1         2  
  1         52  
49 1     1   7 use constant SLOT_HAS_MULTIPLE => 5;
  1         2  
  1         42  
50              
51 1     1   5 use constant CLOBBER => '';
  1         2  
  1         109  
52              
53             our @ISA = qw(Mail::SpamAssassin::Plugin);
54              
55 1     1   7 use constant DEBUG_RE_PARSING => 0; # noisy!
  1         3  
  1         5428  
56              
57             # a few settings that control what kind of bases are output.
58              
59             # treat all rules as lowercase for purposes of term extraction?
60             # $main->{bases_must_be_casei} = 1;
61             # $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/
62             # $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
63             # $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
64             # $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
65             # $main->{base_quiet} = 0; # silences progress output
66              
67             # TODO: it would be nice to have a clean API to pass such settings
68             # through to plugins instead of hanging them off $main
69              
70             ##############################################################################
71              
72             # testing purposes only
73             my $fixup_re_test;
74             #$fixup_re_test = 1; fixup_re("fr()|\\\\|"); die;
75             #$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
76             #$fixup_re_test = 1; fixup_re("\\33\$b"); die;
77             #$fixup_re_test = 1; fixup_re("[link]"); die;
78             #$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;
79              
80             ###########################################################################
81              
82             sub new {
83 15     15 1 100 my $class = shift;
84 15         88 my $mailsaobject = shift;
85 15   33     204 $class = ref($class) || $class;
86 15         91 my $self = $class->SUPER::new($mailsaobject);
87 15         31 bless ($self, $class);
88              
89 15         116 $self->{show_progress} = !$mailsaobject->{base_quiet};
90              
91             # $self->test(); exit;
92 15         224 return $self;
93             }
94              
95             ###########################################################################
96              
97             sub finish_parsing_end {
98 15     15 1 46 my ($self, $params) = @_;
99 15         38 my $conf = $params->{conf};
100 15         60 $self->extract_bases($conf);
101             }
102              
103             sub extract_bases {
104 15     15 0 44 my ($self, $conf) = @_;
105              
106 15         40 my $main = $conf->{main};
107 15 50       45 if (!$main->{base_extract}) { return; }
  0         0  
108              
109             $self->{show_progress} and
110 15 50       41 info("base extraction starting. this can take a while...");
111              
112 15         179 $self->extract_set($conf, $conf->{body_tests}, 'body');
113             }
114              
115             sub extract_set {
116 15     15 0 136 my ($self, $conf, $test_set, $ruletype) = @_;
117              
118 15         49 foreach my $pri (keys %{$test_set}) {
  15         106  
119 15         45 my $nicepri = $pri; $nicepri =~ s/-/neg/g;
  15         133  
120 15         167 $self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
121             }
122             }
123              
124             ###########################################################################
125              
126             sub extract_set_pri {
127 15     15 0 54 my ($self, $conf, $rules, $ruletype) = @_;
128              
129 15         34 my @good_bases;
130             my @failed;
131 15         26 my $yes = 0;
132 15         23 my $no = 0;
133 15         21 my $count = 0;
134 15         31 my $start = time;
135 15         40 $self->{main} = $conf->{main}; # for use in extract_hints()
136 15 50       50 $self->{show_progress} and info ("extracting from rules of type $ruletype");
137 15         29 my $tflags = $conf->{tflags};
138              
139             # attempt to find good "base strings" (simplified regexp subsets) for each
140             # regexp. We try looking at the regexp from both ends, since there
141             # may be a good long string of text at the end of the rule.
142              
143             # require this many chars in a base string + delimiters for it to be viable
144 15         26 my $min_chars = 5;
145              
146 15         20 my $progress;
147             $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
148 15 50 0     36 total => (scalar keys %{$rules} || 1),
149             itemtype => 'rules',
150             });
151              
152 15         33 my $cached = { };
153 15         42 my $cachefile;
154              
155 15 50       51 if ($self->{main}->{bases_cache_dir}) {
156 0         0 $cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
157 0         0 dbg("zoom: reading cache file $cachefile");
158 0         0 $cached = $self->read_cachefile($cachefile);
159             }
160              
161             NEXT_RULE:
162 15         30 foreach my $name (keys %{$rules}) {
  15         134  
163 42 50 33     341 $self->{show_progress} and $progress and $progress->update(++$count);
164              
165             #my $rule = $rules->{$name};
166 42         824 my $rule = qr_to_string($conf->{test_qrs}->{$name});
167 42 50       123 if (!defined $rule) {
168 0         0 die "zoom: error: regexp for $rule not found\n";
169             }
170 42         247 my $cachekey = $name.'#'.$rule;
171              
172 42         213 my $cent = $cached->{rule_bases}->{$cachekey};
173 42 50       129 if (defined $cent) {
174 0 0       0 if (defined $cent->{g}) {
175 0         0 dbg("zoom: YES (cached) $rule $name");
176 0         0 foreach my $ent (@{$cent->{g}}) {
  0         0  
177             # note: we have to copy these, since otherwise later
178             # modifications corrupt the cached data
179             push @good_bases, {
180             base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
181 0         0 };
182             }
183 0         0 $yes++;
184             }
185             else {
186 0         0 dbg("zoom: NO (cached) $rule $name");
187 0         0 push @failed, { orig => $rule }; # no need to cache this
188 0         0 $no++;
189             }
190 0         0 next NEXT_RULE;
191             }
192              
193             # ignore ReplaceTags rules
194 42         108 my $is_a_replacetags_rule = $conf->{replace_rules}->{$name};
195 42         95 my ($minlen, $lossy, @bases);
196              
197 42 50       124 if (!$is_a_replacetags_rule) {
198             eval { # catch die()s
199 42         439 my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
200 41         180 ($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
201             # dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));
202 40         496 1;
203 42 100       106 } or do {
204 2 50       22 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  2         8  
205 2         47 dbg("zoom: giving up on regexp: $eval_stat");
206             };
207              
208             #if ($lossy && ($tflags->{$name}||'') =~ /\bmultiple\b/) {
209             # warn "\nzoom: $vers rule $name will loop on SpamAssassin older than 3.3.2 ".
210             # "running under Perl 5.12 or older, Bug 6558\n";
211             #}
212              
213             # if any of the extracted hints in a set are too short, the entire
214             # set is invalid; this is because each set of N hints represents just
215             # 1 regexp.
216 42         360 foreach my $str (@bases) {
217 135         787 my $len = length fixup_re($str); # bug 6143: count decoded characters
218 135 100 100     1447 if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
  3 100       46  
  3         18  
  3         21  
219 62         311 elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }
220             }
221             }
222              
223 42 100 66     915 if ($is_a_replacetags_rule || !$minlen || !@bases) {
      66        
224 5 50       150 dbg("zoom: ignoring rule %s, %s", $name,
    50          
225             $is_a_replacetags_rule ? 'is a replace rule'
226             : !@bases ? 'no bases' : 'no minlen');
227 5         55 push @failed, { orig => $rule };
228 5         45 $cached->{rule_bases}->{$cachekey} = { };
229 5         31 $no++;
230             }
231             else {
232             # dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
233              
234             # figure out if we have e.g. ["foo", "foob", "foobar"]; in this
235             # case, we only need to track ["foo"].
236 37         96 my %subsumed;
237 37         85 foreach my $base1 (@bases) {
238 132         431 foreach my $base2 (@bases) {
239 2524 100 100     22737 if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
240 4         77 $subsumed{$base1} = 1; # base2 is inside base1; discard the longer
241             }
242             }
243             }
244              
245 37         241 my @forcache;
246 37         109 foreach my $base (@bases) {
247 132 100       580 next if $subsumed{$base};
248 128         1475 push @good_bases, {
249             base => $base, orig => $rule, name => "$name,[l=$lossy]"
250             };
251             # *separate* copies for cache -- we modify the @good_bases entry
252 128         961 push @forcache, {
253             base => $base, orig => $rule, name => "$name,[l=$lossy]"
254             };
255             }
256              
257 37         512 $cached->{rule_bases}->{$cachekey} = { g => \@forcache };
258 37         314 $yes++;
259             }
260             }
261              
262 15 50 33     259 $self->{show_progress} and $progress and $progress->final();
263              
264 15         558 dbg("zoom: $ruletype: found ".(scalar @good_bases).
265             " usable base strings in $yes rules, skipped $no rules");
266              
267             # NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
268             # ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
269             # subsumes other patterns, we need to return hits for all of them. We also
270             # need to take care of the case where multiple regexps wind up sharing the
271             # same base.
272             #
273             # Another gotcha, an exception to the subsumption rule; if one pattern isn't
274             # entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
275             # returned as two hits, correctly. So we only have to be smart about the
276             # full-subsumption case; overlapping is taken care of for us, by re2c.
277             #
278             # TODO: there's a bug here. Since the code in extract_hints() has been
279             # modified to support more complex regexps, we can no longer simply assume
280             # that if pattern A is not contained in pattern B, that means that pattern B
281             # doesn't subsume it. Consider, for example, A="foo*bar" and
282             # B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
283             # that without running the A RE match itself somehow against B.
284             # same issue remains with:
285             #
286             # "foo?bar" / "fobar"
287             # "fo(?:o|oo|)bar" / "fobar"
288             # "fo(?:o|oo)?bar" / "fobar"
289             # "fo(?:o*|baz)bar" / "fobar"
290             # "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
291             #
292             # it's worse with this:
293             #
294             # "fo(?:o|oo|)bar" / "foo*bar"
295             #
296             # basically, this is impossible to compute without reimplementing most of
297             # re2c, and it appears the re2c developers don't plan to offer this:
298             # https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
299              
300 15         196 $conf->{base_orig}->{$ruletype} = { };
301 15         121 $conf->{base_string}->{$ruletype} = { };
302              
303 15         38 $count = 0;
304 15 50 0     51 $self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
305             total => (scalar @good_bases || 1),
306             itemtype => 'bases',
307             });
308              
309             # this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases
310             # array -- into a more efficient format, using arrays and with a little
311             # bit of precomputation, to go (quite a bit) faster
312 15         26 my @rewritten;
313 15         37 foreach my $set1 (@good_bases) {
314 128         395 my $base = $set1->{base};
315 128 50 33     588 next if (!$base || !$set1->{name});
316             push @rewritten, [
317             $base, # 0 - SLOT_BASE
318             $set1->{name}, # 1 - SLOT_NAME
319             $set1->{orig}, # 2 - SLOT_ORIG
320 128         772 length $base, # 3 - SLOT_LEN_BASE
321             $base, # 4 - SLOT_BASE_INITIAL
322             0 # 5 - SLOT_HAS_MULTIPLE, has_multiple flag
323             ];
324             }
325              
326             @good_bases = sort {
327 15 50 100     194 $b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] ||
  414   100     1826  
328             $a->[SLOT_BASE] cmp $b->[SLOT_BASE] ||
329             $a->[SLOT_NAME] cmp $b->[SLOT_NAME] ||
330             $a->[SLOT_ORIG] cmp $b->[SLOT_ORIG]
331             } @rewritten;
332              
333              
334 15         63 my $base_orig = $conf->{base_orig}->{$ruletype};
335 15         32 my $next_base_position = 0;
336 15         50 for my $set1 (@good_bases) {
337 128         259 $next_base_position++;
338 128 50 33     273 $self->{show_progress} and $progress and $progress->update(++$count);
339 128 100       427 my $base1 = $set1->[SLOT_BASE] or next; # got clobbered
340 104         194 my $name1 = $set1->[SLOT_NAME];
341 104         140 my $orig1 = $set1->[SLOT_ORIG];
342 104         162 my $len1 = $set1->[SLOT_LEN_BASE];
343 104         305 $base_orig->{$name1} = $orig1;
344              
345 104         277 foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # order from smallest to largest
346             # clobber false and exact dups; this can happen if a regexp outputs the
347             # same base string multiple times
348 1685 100 100     6016 if (!$set2->[SLOT_BASE] ||
      66        
      66        
349             (
350             $base1 eq $set2->[SLOT_BASE] &&
351             $name1 eq $set2->[SLOT_NAME] &&
352             $orig1 eq $set2->[SLOT_ORIG]
353             )
354             )
355             {
356             #dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]");
357 24         174 $set2->[SLOT_BASE] = CLOBBER; # clobber
358 24         102 next;
359             }
360              
361             # Cannot be a subset if it does not contain the other base string
362 1661 100       6060 next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1;
363              
364             # skip if either already contains the other rule's name
365             # optimize: this can only happen if the base has more than
366             # one rule already attached, ie [5]
367 33 0 33     100 next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: |^)\Q$name1\E(?: |$)/);
      33        
368              
369             # don't use $name1 here, since another base in the set2 loop
370             # may have added $name2 since we set that
371 33 50 66     95 next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: |^)\Q$set2->[SLOT_NAME]\E(?: |$)/);
      33        
372              
373             # $set2->[SLOT_BASE] is just a subset of base1
374             #dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]");
375 33         113 $set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME];
376 33         82 $set1->[SLOT_HAS_MULTIPLE] = 1;
377             }
378             }
379              
380             # we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
381             # both contain "killed" for example, pointing at different rules, which
382             # the above search hasn't found. Collapse them here with a hash
383 15         68 my %bases;
384 15         67 foreach my $set (@good_bases) {
385 128         373 my $base = $set->[0];
386 128 100       305 next unless $base;
387              
388 104 100       233 if (defined $bases{$base}) {
389 12         62 $bases{$base} .= " ".$set->[1];
390             } else {
391 92         630 $bases{$base} = $set->[1];
392             }
393             }
394 15         55 undef @good_bases;
395              
396 15         41 my $base_string = $conf->{base_string}->{$ruletype};
397 15         152 foreach my $base (keys %bases) {
398             # uniq the list, since there are probably dup rules listed
399 92         123 my %u;
400 92         247 for my $i (split ' ', $bases{$base}) {
401 137 100       297 next if exists $u{$i}; undef $u{$i};
  120         241  
402             }
403 92         409 $base_string->{$base} = join ' ', sort keys %u;
404             }
405              
406 15 50 33     73 $self->{show_progress} and $progress and $progress->final();
407              
408 15 50       43 if ($cachefile) {
409 0         0 $self->write_cachefile ($cachefile, $cached);
410             }
411              
412 15         34 my $elapsed = time - $start;
413             $self->{show_progress} and info ("$ruletype: ".
414 15 50       1144 (scalar keys %{$conf->{base_string}->{$ruletype}}).
  0         0  
415             " base strings extracted in $elapsed seconds\n");
416             }
417              
418             ###########################################################################
419              
420             # TODO:
421             # NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
422             # => should extract 'scription' somehow
423             # /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
424             # => should understand alternations; tricky
425              
426             sub simplify_and_qr_regexp {
427 42     42 0 221 my $self = shift;
428 42         111 my $rule = shift;
429              
430 42         93 my $main = $self->{main};
431              
432              
433 42         173 my $mods = '';
434              
435             # remove the regexp modifiers, keep for later
436 42         526 while ($rule =~ s/^\(\?([a-z]*)\)//) {
437 20         105 $mods .= $1;
438             }
439              
440             # modifier removal
441 42         215 while ($rule =~ s/^\(\?-([a-z]*)\)//) {
442 0         0 foreach my $modchar (split '', $mods) {
443 0         0 $mods =~ s/$modchar//g;
444             }
445             }
446              
447 42         78 my $lossy = 0;
448              
449             # now: simplify aspects of the regexp. Bear in mind that we can
450             # simplify as long as we cause the regexp to become more general;
451             # more hits is OK, since false positives will be discarded afterwards
452             # anyway. Simplification that causes the regexp to *not* hit
453             # stuff that the "real" rule would hit, however, is a bad thing.
454              
455 42 100       243 if ($main->{bases_must_be_casei}) {
456 37         180 $rule = lc $rule;
457              
458 37         80 $lossy = 1;
459 37 100       335 $mods =~ s/i// and $lossy = 0;
460              
461             # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
462 37 100       353 $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and $lossy++;
463              
464             # always case-i: /A(?-i:ct)/ => /Act/
465 37 50       158 $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs and $lossy++;
466              
467             # remove (?i)
468 37         88 $rule =~ s/\(\?i\)//gs;
469             }
470             else {
471 5 50       49 die "case-i" if $rule =~ /\(\?i\)/;
472 5 100       28 die "case-i" if $mods =~ /i/;
473              
474             # always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
475 4 50       13 $rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and die "case-i";
476              
477             # we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/
478 4         11 $rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
479             }
480              
481             # remove /m and /s modifiers
482 41 50       173 $mods =~ s/m// and $lossy++;
483 41 100       129 $mods =~ s/s// and $lossy++;
484              
485             # remove (^|\b)'s
486             # T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is
487 41 50       110 $rule =~ s/\(\^\|\\b\)//gs and $lossy++;
488 41 50       97 $rule =~ s/\(\$\|\\b\)//gs and $lossy++;
489 41 50       126 $rule =~ s/\(\\b\|\^\)//gs and $lossy++;
490 41 50       100 $rule =~ s/\(\\b\|\$\)//gs and $lossy++;
491              
492             # remove (?!credit)
493 41 100       198 $rule =~ s/\(\?\![^\)]+\)//gs and $lossy++;
494              
495             # remove \b's
496 41 100       311 $rule =~ s/(?<!\\)\\b//gs and $lossy++;
497              
498             # remove the "?=" trick
499             # (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
500 41         90 $rule =~ s/\(\?\=\[[^\]]+\]\)//gs;
501              
502 41 100       245 $mods .= "L" if $lossy;
503 41         311 ($rule, $mods);
504             }
505              
506             sub extract_hints {
507 41     41 0 134 my $self = shift;
508 41         83 my $rawrule = shift;
509 41         139 my $rule = shift;
510 41         136 my $mods = shift;
511              
512 41         89 my $main = $self->{main};
513 41         187 my $orig = $rule;
514              
515 41         95 my $lossy = 0;
516 41 100       280 $mods =~ s/L// and $lossy++;
517              
518             # if there are anchors, give up; we can't get much
519             # faster than these anyway
520 41 50       239 die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;
521              
522             # die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;
523             # just remove end-of-string anchors; they're slow so could gain
524             # from our speedup
525 41 50       237 $rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++;
526              
527             # simplify (?:..) to (..)
528             $main->{bases_allow_noncapture_groups} or
529 41 50       377 $rule =~ s/\(\?:/\(/g;
530              
531             # simplify some grouping arrangements so they're easier for us to parse
532             # (foo)? => (foo|)
533 41         258 $rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
534             # r? => (r|)
535 41         153 $rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;
536              
537 41         509 my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
538 41 50       230 $tmpfh or die "failed to create a temporary file";
539 41         178 untaint_var(\$tmpf);
540              
541 41 50       432 print $tmpfh "use bytes; m{" . $rule . "}" . $mods
542             or die "error writing to $tmpf: $!";
543 41 50       1874 close $tmpfh or die "error closing $tmpf: $!";
544              
545 41         410 my $perl = $self->get_perl();
546 41         148 local *IN;
547 41 50       159795 open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |")
548             or die "cannot run $perl: ".exit_status_str($?,$!);
549              
550 41         597 my($inbuf,$nread,$fullstr); $fullstr = '';
  41         368  
551 41         348142 while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
  41         1260  
552 41 50       236 defined $nread or die "error reading from pipe: $!";
553              
554 41 50       3865 unlink $tmpf or die "cannot unlink $tmpf: $!";
555 41 50       1690 close IN or die "error closing pipe: $!";
556 41 50       426 defined $fullstr or warn "empty result from a pipe";
557              
558             # now parse the -Mre=debug output.
559             # perl 5.10 format
560 41         2161 $fullstr =~ s/^.*\nFinal program:\n//gs;
561             # perl 5.6/5.8 format
562 41         247 $fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
563 41         173 $fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
564             # common to all
565 41         107 $fullstr =~ s/\nOffsets:.*$//gs;
566              
567             # clean up every other line that doesn't start with a space
568 41         1305 $fullstr =~ s/^\S.*$//gm;
569              
570 41 50       1019 if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
571 0         0 die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
572             }
573 41         949 my $opsstr = $1;
574              
575             # what's left looks like this:
576             # 1: EXACTF <v>(3)
577             # 3: ANYOF[1ILil](14)
578             # 14: EXACTF <a>(16)
579             # 16: CURLY {2,7}(29)
580             # 18: ANYOF[A-Za-z](0)
581             # 29: SPACE(30)
582             # 30: EXACTF <http://>(33)
583             # 33: END(0)
584             #
585 41         136 DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
586              
587 41         95 my @ops;
588 41         2171 foreach my $op (split(/\n/s, $opsstr)) {
589 439 50       1239 next unless $op;
590              
591 439 100       3899 if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) {
    100          
    50          
    0          
592             # perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
593             # perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
594 321         6180 push @ops, [ $1, $2, $3 ];
595             }
596             elsif ($op =~ /^ (\s*)<(.*)>\.\.\.\s*$/) {
597             # 5: TRIE-EXACT[im](44)
598             # <message contained attachments that have been blocked by guin>...
599 4         49 my $spcs = $1;
600             # we could use the entire length here, but it's easier to trim to
601             # the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
602             # suite results will match, since the sa-update --list extraction will
603             # be the same for all versions. (The "..." trailer is important btw)
604 4         83 my $str = substr ($2, 0, 55);
605 4         97 push @ops, [ $spcs, '_moretrie', "<$str...>" ];
606             }
607             elsif ($op =~ /^ (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
608             # 5: TRIE-EXACT[am](21)
609             # <am> (21)
610             # <might> (12)
611 114         1411 push @ops, [ $1, '_moretrie', $2 ];
612             }
613             elsif ($op =~ /^ at .+ line \d+$/) {
614 0         0 next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
615             }
616             else {
617 0         0 warn "cannot parse '$op': $opsstr";
618 0         0 next;
619             }
620             }
621              
622             # unroll the branches; returns a list of versions.
623             # e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]
624 41         336 my @unrolled;
625 41 100       275 if ($main->{bases_split_out_alternations}) {
626 35         764 @unrolled = $self->unroll_branches(0, \@ops);
627             } else {
628 6         61 @unrolled = ( \@ops );
629             }
630              
631             # now find the longest DFA-friendly string in each unrolled version
632 41         116 my @longests;
633 41         153 foreach my $opsarray (@unrolled) {
634 136         537 my $longestexact = '';
635 136         335 my $buf = '';
636              
637             # use a closure to keep the code succinct
638             my $add_candidate = sub {
639 430 100   430   1249 if (length $buf > length $longestexact) { $longestexact = $buf; }
  159         366  
640 430         699 $buf = '';
641 136         1146 };
642              
643 136         396 my $prevop;
644 136         180 foreach my $op (@{$opsarray}) {
  136         277  
645 635         966 my ($spcs, $item, $args) = @{$op};
  635         1844  
646              
647 635 100       1559 next if ($item eq 'NOTHING');
648              
649             # EXACT == case-sensitive
650             # EXACTF == case-i
651             # we can do both, since we canonicalize to lc.
652 626 100 100     8942 if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
    100 66        
    50 100        
    100 66        
      66        
      100        
      66        
      33        
      33        
      0        
      66        
      100        
      66        
      66        
      33        
      33        
653             {
654 142         589 my $str = $1;
655 142         395 $buf .= $str;
656 142 100       659 if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
657             # a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop
658 1         28 $add_candidate->();
659             }
660 142 50 33     617 if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
661             # perl 5.8.x truncates with a "..." here! cut and stop
662 0         0 $add_candidate->();
663             }
664             }
665             # _moretrie == a TRIE-EXACT entry
666             elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
667             {
668 190         494 $buf .= $1;
669 190 100 66     670 if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
670             # perl 5.8.x truncates with a "..." here! cut and stop
671 4         37 $add_candidate->();
672             }
673             }
674             # /(?:foo|bar|baz){2}/ results in a CURLYX beforehand
675             elsif ($item =~ /^EXACT/ &&
676             $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
677             $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
678             $args =~ /<(.*)>/)
679             {
680 0         0 $buf .= $1;
681 0 0 0     0 if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
682             # perl 5.8.x truncates with a "..." here! cut and stop
683 0         0 $add_candidate->();
684             }
685             }
686             # CURLYX, for perl >= 5.9.5
687             elsif ($item =~ /^_moretrie/ &&
688             $prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
689             $prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
690             $args =~ /<(.*)>/)
691             {
692 5         44 $buf .= $1;
693 5 50 33     38 if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
694             # perl 5.8.x truncates with a "..." here! cut and stop
695 0         0 $add_candidate->();
696             }
697             }
698             else {
699             # not an /^EXACT/; clear the buffer
700 289         682 $add_candidate->();
701 289 100       1280 if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/)
702             {
703 173         310 $lossy = 1;
704 173         241 DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
705             }
706             }
707 626         1550 $prevop = $op;
708             }
709 136         348 $add_candidate->();
710              
711 136 100       296 if (!$longestexact) {
712 1         90 die "no long-enough string found in $rawrule\n";
713             # all unrolled versions must have a long string, otherwise
714             # we cannot reliably match all variants of the rule
715             } else {
716 135 100       1263 push @longests, ($main->{bases_must_be_casei}) ?
717             lc $longestexact : $longestexact;
718             }
719             }
720              
721 40         74 DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
722 40         2451 return ($lossy, @longests);
723             }
724              
725             ###########################################################################
726              
727             sub unroll_branches {
728 187     187 0 492 my ($self, $depth, $opslist) = @_;
729              
730 187 50       384 die "too deep" if ($depth++ > 5);
731              
732 187         289 my @ops = (@{$opslist}); # copy
  187         659  
733 187         520 my @pre_branch_ops;
734             my $branch_spcs;
735 187         0 my $trie_spcs;
736 187         0 my $open_spcs;
737              
738             # our input looks something like this 2-level structure:
739             # 1: BOUND(2)
740             # 2: EXACT <Dear >(5)
741             # 5: BRANCH(9)
742             # 6: EXACT <IT>(8)
743             # 8: NALNUM(24)
744             # 9: BRANCH(23)
745             # 10: EXACT <Int>(12)
746             # 12: BRANCH(14)
747             # 13: NOTHING(21)
748             # 14: BRANCH(17)
749             # 15: EXACT <a>(21)
750             # 17: BRANCH(20)
751             # 18: EXACT <er>(21)
752             # 20: TAIL(21)
753             # 21: EXACT <net>(24)
754             # 23: TAIL(24)
755             # 24: EXACT < shop>(27)
756             # 27: END(0)
757             #
758             # or:
759             #
760             # 1: OPEN1(3)
761             # 3: BRANCH(6)
762             # 4: EXACT <v>(9)
763             # 6: BRANCH(9)
764             # 7: EXACT <\\/>(9)
765             # 9: CLOSE1(11)
766             # 11: CURLY {2,5}(14)
767             # 13: REG_ANY(0)
768             # 14: EXACT < g r a >(17)
769             # 17: ANYOF[a-z](28)
770             # 28: END(0)
771             #
772             # or:
773             #
774             # 1: EXACT <i >(3)
775             # 3: OPEN1(5)
776             # 5: TRIE-EXACT[am](21)
777             # <am> (21)
778             # <might> (12)
779             # 12: OPEN2(14)
780             # 14: TRIE-EXACT[ ](19)
781             # < be>
782             # <>
783             # 19: CLOSE2(21)
784             # 21: CLOSE1(23)
785             # 23: EXACT < c>(25)
786              
787 187         238 DEBUG_RE_PARSING and warn "starting parse";
788              
789             # this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform
790             # it into the latter. bit of a kludge to do this before the loop, but hey.
791             # note that it doesn't fix the CLOSE1/END ordering to be correct
792 187 50 66     1093 if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
793 0         0 my @newops = ([ "", "OPEN1", "" ]);
794 0         0 foreach my $op (@ops) {
795 0         0 push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
796             }
797 0         0 push @newops, [ "", "CLOSE1", "" ];
798 0         0 @ops = @newops;
799             }
800              
801             # iterate until we start a branch set. using
802             # /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."
803             # just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
804             # or a BRANCH, *then* unroll the most recent OPEN set.
805 187         267 while (1) {
806 968         1189 my $op = shift @ops;
807 968 100       1550 last unless defined $op;
808              
809 838         926 my ($spcs, $item, $args) = @{$op};
  838         2314  
810 838         966 DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
811              
812 838 100 100     4907 if ($item =~ /^OPEN/) {
    100          
    100          
    100          
    100          
813 54         202 $open_spcs = $spcs;
814 54         204 next; # next will be a BRANCH or TRIE
815              
816             } elsif ($item =~ /^TRIE/) {
817 49         157 $trie_spcs = $spcs;
818 49         153 last;
819              
820             } elsif ($item =~ /^BRANCH/) {
821 8         63 $branch_spcs = $spcs;
822 8         58 last;
823              
824             } elsif ($item =~ /^EXACT/ && defined $open_spcs) {
825             # perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT
826 7         96 push @pre_branch_ops, [ $open_spcs, $item, $args ];
827 7         105 next;
828              
829             } elsif (defined $open_spcs) {
830             # OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
831             # ignore this OPEN block entirely and don't try to unroll it
832 2         11 undef $open_spcs;
833              
834             } else {
835 718         1696 push @pre_branch_ops, $op;
836             }
837             }
838              
839             # no branches found? we're done unrolling on this one!
840 187 100       441 if (scalar @ops == 0) {
841 130         657 return [ @pre_branch_ops ];
842             }
843              
844             # otherwise we're at the start of a new branch set
845             # /(foo|bar(baz|argh)boo)gab/
846 57         109 my @alts;
847             my @in_this_branch;
848              
849 57         87 DEBUG_RE_PARSING and warn "entering branch: ".
850             "open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
851             "branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
852             "trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
853              
854             # indentation level to remove from "normal" ops (using a s///)
855 57 100       450 my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
856 57         314 my $trie_sub_spcs = "";
857 57         99 while (1) {
858 387         573 my $op = shift @ops;
859 387 50       789 last unless defined $op;
860 387         495 my ($spcs, $item, $args) = @{$op};
  387         1330  
861 387         464 DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
862              
863 387 100 100     5484 if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
    50 66        
    100 66        
    100 33        
    100 100        
      100        
      100        
      66        
864 9         42 push @alts, [ @pre_branch_ops, @in_this_branch ];
865 9         30 @in_this_branch = ();
866 9         30 $open_sub_spcs = $branch_spcs." ";
867 9         32 $trie_sub_spcs = "";
868 9         33 next;
869             }
870             elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
871 0         0 push @alts, [ @pre_branch_ops, @in_this_branch ];
872 0         0 undef $branch_spcs;
873 0         0 $open_sub_spcs = "";
874 0         0 $trie_sub_spcs = "";
875 0         0 last;
876             }
877             elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
878 135 100       308 if (scalar @in_this_branch > 0) {
879 86         238 push @alts, [ @pre_branch_ops, @in_this_branch ];
880             }
881             # use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
882 135         711 @in_this_branch = ( [ $open_spcs, $item, $args ] );
883 135 50       318 $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
884 135         253 $trie_sub_spcs = " ";
885 135         382 next;
886             }
887             elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
888 52         230 push @alts, [ @pre_branch_ops, @in_this_branch ];
889 52         101 undef $branch_spcs;
890 52         110 undef $open_spcs;
891 52         106 undef $trie_spcs;
892 52         132 $open_sub_spcs = "";
893 52         102 $trie_sub_spcs = "";
894 52         160 last;
895             }
896             elsif ($item eq 'END') { # of string
897 5         32 push @alts, [ @pre_branch_ops, @in_this_branch ];
898 5         27 undef $branch_spcs;
899 5         18 undef $open_spcs;
900 5         16 undef $trie_spcs;
901 5         17 $open_sub_spcs = "";
902 5         15 $trie_sub_spcs = "";
903 5         26 last;
904             }
905             else {
906 186 50       440 if ($open_sub_spcs) {
907             # deindent the space-level to match the opening brace
908 186         1321 $spcs =~ s/^$open_sub_spcs//;
909             # tries also add one more indent level in
910 186         1016 $spcs =~ s/^$trie_sub_spcs//;
911             }
912 186         1389 push @in_this_branch, [ $spcs, $item, $args ];
913             # note that we ignore ops at a deeper $spcs level entirely (until later!)
914             }
915             }
916              
917 57 50       130 if (defined $branch_spcs) {
918 0         0 die "fell off end of string with a branch open: '$branch_spcs'";
919             }
920              
921             # we're now after the branch set: /gab/
922             # @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]
923 57         188 foreach my $alt (@alts) {
924 152         231 push @{$alt}, @ops; # add all remaining ops to each one
  152         568  
925             # note that this could include more (?:...); we don't care, since
926             # those can be handled by recursing
927             }
928              
929             # ok, parsed the entire ops list
930             # @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]
931              
932 57         96 if (DEBUG_RE_PARSING) {
933             print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
934             }
935              
936             # now recurse, to unroll the remaining branches (if any exist)
937 57         107 my @rets;
938 57         103 foreach my $alt (@alts) {
939 152         672 push @rets, $self->unroll_branches($depth, $alt);
940             }
941              
942 57         79 if (DEBUG_RE_PARSING) {
943             print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
944             }
945              
946 57         420 return @rets;
947             }
948              
949             ###########################################################################
950              
951             sub test {
952 0     0 0 0 my ($self) = @_;
953              
954 0         0 $self->test_split_alt("foo", "/foo/");
955 0         0 $self->test_split_alt("(foo)", "/foo/");
956 0         0 $self->test_split_alt("foo(bar)baz", "/foobarbaz/");
957 0         0 $self->test_split_alt("x(foo|)", "/xfoo/ /x/");
958 0         0 $self->test_split_alt("fo(o|)", "/foo/ /fo/");
959 0         0 $self->test_split_alt("(foo|bar)", "/foo/ /bar/");
960 0         0 $self->test_split_alt("foo|bar", "/foo/ /bar/");
961 0         0 $self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");
962 0         0 $self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
963 0         0 $self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");
964 0         0 $self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
965             }
966              
967             sub test_split_alt {
968 0     0 0 0 my ($self, $in, $out) = @_;
969              
970 0         0 my @got = $self->split_alt($in);
971 0         0 $out =~ s/^\///;
972 0         0 $out =~ s/\/$//;
973 0         0 my @want = split(/\/ \//, $out);
974              
975 0         0 my $failed = 0;
976 0 0       0 if (scalar @want != scalar @got) {
977 0         0 warn "FAIL: results count don't match";
978 0         0 $failed++;
979             }
980             else {
981 0         0 my %got = map { $_ => 1 } @got;
  0         0  
982 0         0 foreach my $w (@want) {
983 0 0       0 if (!$got{$w}) {
984 0         0 warn "FAIL: '$w' not found";
985 0         0 $failed++;
986             }
987             }
988             }
989              
990 0 0       0 if ($failed) {
991 0 0       0 print "want: /".join('/ /', @want)."/\n" or die "error writing: $!";
992 0 0       0 print "got: /".join('/ /', @got)."/\n" or die "error writing: $!";
993 0         0 return 0;
994             } else {
995 0 0       0 print "ok\n" or die "error writing: $!";
996 0         0 return 1;
997             }
998             }
999              
1000             ###########################################################################
1001              
1002             sub get_perl {
1003 41     41 0 120 my ($self) = @_;
1004 41         81 my $perl;
1005              
1006             # allow user override of the perl interpreter to use when
1007             # extracting base strings.
1008             # TODO: expose this via sa-compile command-line option
1009 41         120 my $fromconf = $self->{main}->{conf}->{re_parser_perl};
1010              
1011 41 50       396 if ($fromconf) {
    50          
1012 0         0 $perl = $fromconf;
1013             } elsif ($^X =~ m|^/|) {
1014 41         174 $perl = $^X;
1015             } else {
1016 1     1   24 use Config;
  1         3  
  1         1210  
1017 0         0 $perl = $Config{perlpath};
1018 0         0 $perl =~ s|/[^/]*$|/$^X|;
1019             }
1020 41         166 untaint_var(\$perl);
1021 41         243 return $perl;
1022             }
1023              
1024             ###########################################################################
1025              
1026             sub read_cachefile {
1027 0     0 0 0 my ($self, $cachefile) = @_;
1028 0         0 local *IN;
1029 0 0       0 if (open(IN, "<".$cachefile)) {
1030 0         0 my($inbuf,$nread,$str); $str = '';
  0         0  
1031 0         0 while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }
  0         0  
1032 0 0       0 defined $nread or die "error reading from $cachefile: $!";
1033 0 0       0 close IN or die "error closing $cachefile: $!";
1034              
1035 0         0 untaint_var(\$str);
1036 0         0 my $VAR1; # Data::Dumper
1037 0 0       0 if (eval $str) {
1038 0         0 return $VAR1; # Data::Dumper's naming
1039             }
1040             }
1041 0         0 return { };
1042             }
1043              
1044             sub write_cachefile {
1045 0     0 0 0 my ($self, $cachefile, $cached) = @_;
1046              
1047 0         0 my $dump = Data::Dumper->new ([ $cached ]);
1048 0         0 $dump->Deepcopy(1);
1049 0         0 $dump->Purity(1);
1050 0         0 $dump->Indent(1);
1051              
1052 0         0 my $cachedir = $self->{main}->{bases_cache_dir};
1053 0 0       0 if (mkdir($cachedir)) {
    0          
1054             # successfully created
1055             } elsif ($! == EEXIST) {
1056 0         0 dbg("zoom: ok, cache directory already existed");
1057             } else {
1058 0         0 warn "zoom: could not create cache directory: $cachedir ($!)\n";
1059 0         0 return;
1060             }
1061 0 0       0 open(CACHE, ">$cachefile") or warn "cannot write to $cachefile";
1062 0 0       0 print CACHE ($dump->Dump, ";1;") or die "error writing: $!";
1063 0 0       0 close CACHE or die "error closing $cachefile: $!";
1064             }
1065              
1066             =over 4
1067              
1068             =item my ($cleanregexp) = fixup_re($regexp);
1069              
1070             Converts encoded characters in a regular expression pattern into their
1071             equivalent characters
1072              
1073             =back
1074              
1075             =cut
1076              
1077             sub fixup_re {
1078 135     135 1 331 my $re = shift;
1079            
1080 135 0       413 if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" }
  0 50       0  
1081            
1082 135         592 my $output = "";
1083 135         1147 my $TOK = qr([\"\\]);
1084              
1085 135         220 my $STATE;
1086 135         656 local ($1,$2);
1087 135         1207 while ($re =~ /\G(.*?)($TOK)/gcs) {
1088 0         0 my $pre = $1;
1089 0         0 my $tok = $2;
1090              
1091 0 0       0 if (length($pre)) {
1092 0         0 $output .= "\"$pre\"";
1093             }
1094              
1095 0 0       0 if ($tok eq '"') {
    0          
1096 0         0 $output .= '"\\""';
1097             }
1098             elsif ($tok eq '\\') {
1099 0 0       0 $re =~ /\G(x\{[^\}]+\}|[0-7]{1,3}|.)/gcs or die "\\ at end of string!";
1100 0         0 my $esc = $1;
1101 0 0       0 if ($esc eq '"') {
    0          
    0          
    0          
1102 0         0 $output .= '"\\""';
1103             } elsif ($esc eq '\\') {
1104 0         0 $output .= '"**BACKSLASH**"'; # avoid hairy escape-parsing
1105             } elsif ($esc =~ /^x\{(\S+)\}\z/) {
1106 0         0 $output .= '"'.chr(hex($1)).'"';
1107             } elsif ($esc =~ /^[0-7]{1,3}\z/) {
1108 0         0 $output .= '"'.chr(oct($esc)).'"';
1109             } else {
1110 0         0 $output .= "\"$esc\"";
1111             }
1112             }
1113             else {
1114 0 0       0 print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!";
1115             }
1116             }
1117            
1118 135 50       375 if (!defined(pos($re))) {
    0          
1119             # no matches
1120 135         661 $output .= "\"$re\"";
1121             # Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)
1122 135         491 $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
  0         0  
1123             }
1124             elsif (pos($re) <= length($re)) {
1125 0         0 $output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
  0         0  
1126 0         0 $output .= fixup_re(substr($re, pos($re)));
1127             }
1128              
1129 135         265 $output =~ s/^""/"/; # protect start and end quotes
1130 135         195 $output =~ s/(?<!\\)""\z/"/;
1131 135         191 $output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
1132 135         171 $output =~ s/\*\*BACKSLASH\*\*/\\\\/gs;
1133              
1134 135 0       235 if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" }
  0 50       0  
1135 135         725 return $output;
1136             }
1137              
1138             1;