File Coverage

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Criterion	Covered	Total	%
statement	400	523	76.4
branch	152	272	55.8
condition	92	151	60.9
subroutine	29	33	87.8
pod	3	14	21.4
total	676	993	68.0

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Copyright 2006 Apache Software Foundation
3							#
4							# Licensed under the Apache License, Version 2.0 (the "License");
5							# you may not use this file except in compliance with the License.
6							# You may obtain a copy of the License at
7							#
8							# http://www.apache.org/licenses/LICENSE-2.0
9							#
10							# Unless required by applicable law or agreed to in writing, software
11							# distributed under the License is distributed on an "AS IS" BASIS,
12							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13							# See the License for the specific language governing permissions and
14							# limitations under the License.
15							# </@LICENSE>
16
17							=head1 NAME
18
19							Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
20
21							=head1 SYNOPSIS
22
23							This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
24							suitable for use in Rule2XSBody rules or other parallel matching algorithms.
25
26							=cut
27
28							package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;
29
30	1			1		8	use Mail::SpamAssassin::Plugin;
	1					2
	1					31
31	1			1		7	use Mail::SpamAssassin::Logger;
	1					2
	1					81
32	1			1		7	use Mail::SpamAssassin::Util qw(untaint_var qr_to_string);
	1					3
	1					83
33	1			1		299	use Mail::SpamAssassin::Util::Progress;
	1					3
	1					47
34
35	1			1		8	use Errno qw(ENOENT EACCES EEXIST);
	1					2
	1					104
36	1			1		792	use Data::Dumper;
	1					6562
	1					187
37
38	1			1		12	use strict;
	1					2
	1					26
39	1			1		4	use warnings;
	1					3
	1					35
40							# use bytes;
41	1			1		5	use re 'taint';
	1					5
	1					37
42
43							# Not a constant hashref for 5.6 compat
44	1			1		5	use constant SLOT_BASE => 0;
	1					2
	1					76
45	1			1		7	use constant SLOT_NAME => 1;
	1					2
	1					40
46	1			1		5	use constant SLOT_ORIG => 2;
	1					2
	1					36
47	1			1		5	use constant SLOT_LEN_BASE => 3;
	1					4
	1					36
48	1			1		5	use constant SLOT_BASE_INITIAL => 4;
	1					2
	1					52
49	1			1		7	use constant SLOT_HAS_MULTIPLE => 5;
	1					2
	1					42
50
51	1			1		5	use constant CLOBBER => '';
	1					2
	1					109
52
53							our @ISA = qw(Mail::SpamAssassin::Plugin);
54
55	1			1		7	use constant DEBUG_RE_PARSING => 0; # noisy!
	1					3
	1					5428
56
57							# a few settings that control what kind of bases are output.
58
59							# treat all rules as lowercase for purposes of term extraction?
60							# $main->{bases_must_be_casei} = 1;
61							# $main->{bases_can_use_alternations} = 0; # /(foo\|bar\|baz)/
62							# $main->{bases_can_use_quantifiers} = 0; # /foo.bar/ or /foobar/ or /foooo?bar/
63							# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
64							# $main->{bases_split_out_alternations} = 1; # /(foo\|bar\|baz)/ => ["foo", "bar", "baz"]
65							# $main->{base_quiet} = 0; # silences progress output
66
67							# TODO: it would be nice to have a clean API to pass such settings
68							# through to plugins instead of hanging them off $main
69
70							##############################################################################
71
72							# testing purposes only
73							my $fixup_re_test;
74							#$fixup_re_test = 1; fixup_re("fr()\|\\\\\|"); die;
75							#$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
76							#$fixup_re_test = 1; fixup_re("\\33\$b"); die;
77							#$fixup_re_test = 1; fixup_re("[link]"); die;
78							#$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;
79
80							###########################################################################
81
82							sub new {
83	15			15	1	100	my $class = shift;
84	15					88	my $mailsaobject = shift;
85	15		33			204	$class = ref($class) \|\| $class;
86	15					91	my $self = $class->SUPER::new($mailsaobject);
87	15					31	bless ($self, $class);
88
89	15					116	$self->{show_progress} = !$mailsaobject->{base_quiet};
90
91							# $self->test(); exit;
92	15					224	return $self;
93							}
94
95							###########################################################################
96
97							sub finish_parsing_end {
98	15			15	1	46	my ($self, $params) = @_;
99	15					38	my $conf = $params->{conf};
100	15					60	$self->extract_bases($conf);
101							}
102
103							sub extract_bases {
104	15			15	0	44	my ($self, $conf) = @_;
105
106	15					40	my $main = $conf->{main};
107	15	50				45	if (!$main->{base_extract}) { return; }
	0					0
108
109							$self->{show_progress} and
110	15	50				41	info("base extraction starting. this can take a while...");
111
112	15					179	$self->extract_set($conf, $conf->{body_tests}, 'body');
113							}
114
115							sub extract_set {
116	15			15	0	136	my ($self, $conf, $test_set, $ruletype) = @_;
117
118	15					49	foreach my $pri (keys %{$test_set}) {
	15					106
119	15					45	my $nicepri = $pri; $nicepri =~ s/-/neg/g;
	15					133
120	15					167	$self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
121							}
122							}
123
124							###########################################################################
125
126							sub extract_set_pri {
127	15			15	0	54	my ($self, $conf, $rules, $ruletype) = @_;
128
129	15					34	my @good_bases;
130							my @failed;
131	15					26	my $yes = 0;
132	15					23	my $no = 0;
133	15					21	my $count = 0;
134	15					31	my $start = time;
135	15					40	$self->{main} = $conf->{main}; # for use in extract_hints()
136	15	50				50	$self->{show_progress} and info ("extracting from rules of type $ruletype");
137	15					29	my $tflags = $conf->{tflags};
138
139							# attempt to find good "base strings" (simplified regexp subsets) for each
140							# regexp. We try looking at the regexp from both ends, since there
141							# may be a good long string of text at the end of the rule.
142
143							# require this many chars in a base string + delimiters for it to be viable
144	15					26	my $min_chars = 5;
145
146	15					20	my $progress;
147							$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
148	15	50	0			36	total => (scalar keys %{$rules} \|\| 1),
149							itemtype => 'rules',
150							});
151
152	15					33	my $cached = { };
153	15					42	my $cachefile;
154
155	15	50				51	if ($self->{main}->{bases_cache_dir}) {
156	0					0	$cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
157	0					0	dbg("zoom: reading cache file $cachefile");
158	0					0	$cached = $self->read_cachefile($cachefile);
159							}
160
161							NEXT_RULE:
162	15					30	foreach my $name (keys %{$rules}) {
	15					134
163	42	50	33			341	$self->{show_progress} and $progress and $progress->update(++$count);
164
165							#my $rule = $rules->{$name};
166	42					824	my $rule = qr_to_string($conf->{test_qrs}->{$name});
167	42	50				123	if (!defined $rule) {
168	0					0	die "zoom: error: regexp for $rule not found\n";
169							}
170	42					247	my $cachekey = $name.'#'.$rule;
171
172	42					213	my $cent = $cached->{rule_bases}->{$cachekey};
173	42	50				129	if (defined $cent) {
174	0	0				0	if (defined $cent->{g}) {
175	0					0	dbg("zoom: YES (cached) $rule $name");
176	0					0	foreach my $ent (@{$cent->{g}}) {
	0					0
177							# note: we have to copy these, since otherwise later
178							# modifications corrupt the cached data
179							push @good_bases, {
180							base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
181	0					0	};
182							}
183	0					0	$yes++;
184							}
185							else {
186	0					0	dbg("zoom: NO (cached) $rule $name");
187	0					0	push @failed, { orig => $rule }; # no need to cache this
188	0					0	$no++;
189							}
190	0					0	next NEXT_RULE;
191							}
192
193							# ignore ReplaceTags rules
194	42					108	my $is_a_replacetags_rule = $conf->{replace_rules}->{$name};
195	42					95	my ($minlen, $lossy, @bases);
196
197	42	50				124	if (!$is_a_replacetags_rule) {
198							eval { # catch die()s
199	42					439	my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
200	41					180	($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
201							# dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));
202	40					496	1;
203	42	100				106	} or do {
204	2	50				22	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	2					8
205	2					47	dbg("zoom: giving up on regexp: $eval_stat");
206							};
207
208							#if ($lossy && ($tflags->{$name}\|\|'') =~ /\bmultiple\b/) {
209							# warn "\nzoom: $vers rule $name will loop on SpamAssassin older than 3.3.2 ".
210							# "running under Perl 5.12 or older, Bug 6558\n";
211							#}
212
213							# if any of the extracted hints in a set are too short, the entire
214							# set is invalid; this is because each set of N hints represents just
215							# 1 regexp.
216	42					360	foreach my $str (@bases) {
217	135					787	my $len = length fixup_re($str); # bug 6143: count decoded characters
218	135	100	100			1447	if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
	3	100				46
	3					18
	3					21
219	62					311	elsif (!defined($minlen) \|\| $len < $minlen) { $minlen = $len; }
220							}
221							}
222
223	42	100	66			915	if ($is_a_replacetags_rule \|\| !$minlen \|\| !@bases) {
			66
224	5	50				150	dbg("zoom: ignoring rule %s, %s", $name,
		50
225							$is_a_replacetags_rule ? 'is a replace rule'
226							: !@bases ? 'no bases' : 'no minlen');
227	5					55	push @failed, { orig => $rule };
228	5					45	$cached->{rule_bases}->{$cachekey} = { };
229	5					31	$no++;
230							}
231							else {
232							# dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
233
234							# figure out if we have e.g. ["foo", "foob", "foobar"]; in this
235							# case, we only need to track ["foo"].
236	37					96	my %subsumed;
237	37					85	foreach my $base1 (@bases) {
238	132					431	foreach my $base2 (@bases) {
239	2524	100	100			22737	if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
240	4					77	$subsumed{$base1} = 1; # base2 is inside base1; discard the longer
241							}
242							}
243							}
244
245	37					241	my @forcache;
246	37					109	foreach my $base (@bases) {
247	132	100				580	next if $subsumed{$base};
248	128					1475	push @good_bases, {
249							base => $base, orig => $rule, name => "$name,[l=$lossy]"
250							};
251							# separate copies for cache -- we modify the @good_bases entry
252	128					961	push @forcache, {
253							base => $base, orig => $rule, name => "$name,[l=$lossy]"
254							};
255							}
256
257	37					512	$cached->{rule_bases}->{$cachekey} = { g => \@forcache };
258	37					314	$yes++;
259							}
260							}
261
262	15	50	33			259	$self->{show_progress} and $progress and $progress->final();
263
264	15					558	dbg("zoom: $ruletype: found ".(scalar @good_bases).
265							" usable base strings in $yes rules, skipped $no rules");
266
267							# NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
268							# ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
269							# subsumes other patterns, we need to return hits for all of them. We also
270							# need to take care of the case where multiple regexps wind up sharing the
271							# same base.
272							#
273							# Another gotcha, an exception to the subsumption rule; if one pattern isn't
274							# entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
275							# returned as two hits, correctly. So we only have to be smart about the
276							# full-subsumption case; overlapping is taken care of for us, by re2c.
277							#
278							# TODO: there's a bug here. Since the code in extract_hints() has been
279							# modified to support more complex regexps, we can no longer simply assume
280							# that if pattern A is not contained in pattern B, that means that pattern B
281							# doesn't subsume it. Consider, for example, A="foo*bar" and
282							# B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
283							# that without running the A RE match itself somehow against B.
284							# same issue remains with:
285							#
286							# "foo?bar" / "fobar"
287							# "fo(?:o\|oo\|)bar" / "fobar"
288							# "fo(?:o\|oo)?bar" / "fobar"
289							# "fo(?:o*\|baz)bar" / "fobar"
290							# "(?:fo(?:o*\|baz)bar\|blargh)" / "fobar"
291							#
292							# it's worse with this:
293							#
294							# "fo(?:o\|oo\|)bar" / "foo*bar"
295							#
296							# basically, this is impossible to compute without reimplementing most of
297							# re2c, and it appears the re2c developers don't plan to offer this:
298							# https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
299
300	15					196	$conf->{base_orig}->{$ruletype} = { };
301	15					121	$conf->{base_string}->{$ruletype} = { };
302
303	15					38	$count = 0;
304	15	50	0			51	$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
305							total => (scalar @good_bases \|\| 1),
306							itemtype => 'bases',
307							});
308
309							# this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases
310							# array -- into a more efficient format, using arrays and with a little
311							# bit of precomputation, to go (quite a bit) faster
312	15					26	my @rewritten;
313	15					37	foreach my $set1 (@good_bases) {
314	128					395	my $base = $set1->{base};
315	128	50	33			588	next if (!$base \|\| !$set1->{name});
316							push @rewritten, [
317							$base, # 0 - SLOT_BASE
318							$set1->{name}, # 1 - SLOT_NAME
319							$set1->{orig}, # 2 - SLOT_ORIG
320	128					772	length $base, # 3 - SLOT_LEN_BASE
321							$base, # 4 - SLOT_BASE_INITIAL
322							0 # 5 - SLOT_HAS_MULTIPLE, has_multiple flag
323							];
324							}
325
326							@good_bases = sort {
327	15	50	100			194	$b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] \|\|
	414		100			1826
328							$a->[SLOT_BASE] cmp $b->[SLOT_BASE] \|\|
329							$a->[SLOT_NAME] cmp $b->[SLOT_NAME] \|\|
330							$a->[SLOT_ORIG] cmp $b->[SLOT_ORIG]
331							} @rewritten;
332
333
334	15					63	my $base_orig = $conf->{base_orig}->{$ruletype};
335	15					32	my $next_base_position = 0;
336	15					50	for my $set1 (@good_bases) {
337	128					259	$next_base_position++;
338	128	50	33			273	$self->{show_progress} and $progress and $progress->update(++$count);
339	128	100				427	my $base1 = $set1->[SLOT_BASE] or next; # got clobbered
340	104					194	my $name1 = $set1->[SLOT_NAME];
341	104					140	my $orig1 = $set1->[SLOT_ORIG];
342	104					162	my $len1 = $set1->[SLOT_LEN_BASE];
343	104					305	$base_orig->{$name1} = $orig1;
344
345	104					277	foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # order from smallest to largest
346							# clobber false and exact dups; this can happen if a regexp outputs the
347							# same base string multiple times
348	1685	100	100			6016	if (!$set2->[SLOT_BASE] \|\|
			66
			66
349							(
350							$base1 eq $set2->[SLOT_BASE] &&
351							$name1 eq $set2->[SLOT_NAME] &&
352							$orig1 eq $set2->[SLOT_ORIG]
353							)
354							)
355							{
356							#dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]");
357	24					174	$set2->[SLOT_BASE] = CLOBBER; # clobber
358	24					102	next;
359							}
360
361							# Cannot be a subset if it does not contain the other base string
362	1661	100				6060	next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1;
363
364							# skip if either already contains the other rule's name
365							# optimize: this can only happen if the base has more than
366							# one rule already attached, ie [5]
367	33	0	33			100	next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: \|^)\Q$name1\E(?: \|$)/);
			33
368
369							# don't use $name1 here, since another base in the set2 loop
370							# may have added $name2 since we set that
371	33	50	66			95	next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: \|^)\Q$set2->[SLOT_NAME]\E(?: \|$)/);
			33
372
373							# $set2->[SLOT_BASE] is just a subset of base1
374							#dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]");
375	33					113	$set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME];
376	33					82	$set1->[SLOT_HAS_MULTIPLE] = 1;
377							}
378							}
379
380							# we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
381							# both contain "killed" for example, pointing at different rules, which
382							# the above search hasn't found. Collapse them here with a hash
383	15					68	my %bases;
384	15					67	foreach my $set (@good_bases) {
385	128					373	my $base = $set->[0];
386	128	100				305	next unless $base;
387
388	104	100				233	if (defined $bases{$base}) {
389	12					62	$bases{$base} .= " ".$set->[1];
390							} else {
391	92					630	$bases{$base} = $set->[1];
392							}
393							}
394	15					55	undef @good_bases;
395
396	15					41	my $base_string = $conf->{base_string}->{$ruletype};
397	15					152	foreach my $base (keys %bases) {
398							# uniq the list, since there are probably dup rules listed
399	92					123	my %u;
400	92					247	for my $i (split ' ', $bases{$base}) {
401	137	100				297	next if exists $u{$i}; undef $u{$i};
	120					241
402							}
403	92					409	$base_string->{$base} = join ' ', sort keys %u;
404							}
405
406	15	50	33			73	$self->{show_progress} and $progress and $progress->final();
407
408	15	50				43	if ($cachefile) {
409	0					0	$self->write_cachefile ($cachefile, $cached);
410							}
411
412	15					34	my $elapsed = time - $start;
413							$self->{show_progress} and info ("$ruletype: ".
414	15	50				1144	(scalar keys %{$conf->{base_string}->{$ruletype}}).
	0					0
415							" base strings extracted in $elapsed seconds\n");
416							}
417
418							###########################################################################
419
420							# TODO:
421							# NO /no.{1,10}P(?:er\|re)scription.{1,10}(?:needed\|require\|necessary)/i
422							# => should extract 'scription' somehow
423							# /time to refinance\|refinanc\w{1,3}\b.{0,16}\bnow\b/i
424							# => should understand alternations; tricky
425
426							sub simplify_and_qr_regexp {
427	42			42	0	221	my $self = shift;
428	42					111	my $rule = shift;
429
430	42					93	my $main = $self->{main};
431
432
433	42					173	my $mods = '';
434
435							# remove the regexp modifiers, keep for later
436	42					526	while ($rule =~ s/^$\?([a-z]*)$//) {
437	20					105	$mods .= $1;
438							}
439
440							# modifier removal
441	42					215	while ($rule =~ s/^$\?-([a-z]*)$//) {
442	0					0	foreach my $modchar (split '', $mods) {
443	0					0	$mods =~ s/$modchar//g;
444							}
445							}
446
447	42					78	my $lossy = 0;
448
449							# now: simplify aspects of the regexp. Bear in mind that we can
450							# simplify as long as we cause the regexp to become more general;
451							# more hits is OK, since false positives will be discarded afterwards
452							# anyway. Simplification that causes the regexp to not hit
453							# stuff that the "real" rule would hit, however, is a bad thing.
454
455	42	100				243	if ($main->{bases_must_be_casei}) {
456	37					180	$rule = lc $rule;
457
458	37					80	$lossy = 1;
459	37	100				335	$mods =~ s/i// and $lossy = 0;
460
461							# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
462	37	100				353	$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and $lossy++;
463
464							# always case-i: /A(?-i:ct)/ => /Act/
465	37	50				158	$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs and $lossy++;
466
467							# remove (?i)
468	37					88	$rule =~ s/$\?i$//gs;
469							}
470							else {
471	5	50				49	die "case-i" if $rule =~ /$\?i$/;
472	5	100				28	die "case-i" if $mods =~ /i/;
473
474							# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
475	4	50				13	$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and die "case-i";
476
477							# we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/
478	4					11	$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs;
479							}
480
481							# remove /m and /s modifiers
482	41	50				173	$mods =~ s/m// and $lossy++;
483	41	100				129	$mods =~ s/s// and $lossy++;
484
485							# remove (^\|\b)'s
486							# T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS\|(^\|\b)EXTI($\|\b))/is
487	41	50				110	$rule =~ s/$\^\\|\\b$//gs and $lossy++;
488	41	50				97	$rule =~ s/$\$\\|\\b$//gs and $lossy++;
489	41	50				126	$rule =~ s/$\\b\\|\^$//gs and $lossy++;
490	41	50				100	$rule =~ s/$\\b\\|\$$//gs and $lossy++;
491
492							# remove (?!credit)
493	41	100				198	$rule =~ s/$\?\![^$]+\)//gs and $lossy++;
494
495							# remove \b's
496	41	100				311	$rule =~ s/(?<!\\)\\b//gs and $lossy++;
497
498							# remove the "?=" trick
499							# (?=[dehklnswxy])(horny\|nasty\|hot\|wild\|young\|....etc...)
500	41					90	$rule =~ s/$\?\=\[[^\]]+\]$//gs;
501
502	41	100				245	$mods .= "L" if $lossy;
503	41					311	($rule, $mods);
504							}
505
506							sub extract_hints {
507	41			41	0	134	my $self = shift;
508	41					83	my $rawrule = shift;
509	41					139	my $rule = shift;
510	41					136	my $mods = shift;
511
512	41					89	my $main = $self->{main};
513	41					187	my $orig = $rule;
514
515	41					95	my $lossy = 0;
516	41	100				280	$mods =~ s/L// and $lossy++;
517
518							# if there are anchors, give up; we can't get much
519							# faster than these anyway
520	41	50				239	die "anchors" if $rule =~ /^\(?(?:\^\|\\A)/;
521
522							# die "anchors" if $rule =~ /(?:\$\|\\Z)\)?$/;
523							# just remove end-of-string anchors; they're slow so could gain
524							# from our speedup
525	41	50				237	$rule =~ s/(?<!\\)(?:\$\|\\Z)\)?$// and $lossy++;
526
527							# simplify (?:..) to (..)
528							$main->{bases_allow_noncapture_groups} or
529	41	50				377	$rule =~ s/\(\?:/\(/g;
530
531							# simplify some grouping arrangements so they're easier for us to parse
532							# (foo)? => (foo\|)
533	41					258	$rule =~ s/$(.*?)$\?/$$1\\|$/gs;
534							# r? => (r\|)
535	41					153	$rule =~ s/(?<!\\)(\w)\?/$$1\\|$/gs;
536
537	41					509	my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
538	41	50				230	$tmpfh or die "failed to create a temporary file";
539	41					178	untaint_var(\$tmpf);
540
541	41	50				432	print $tmpfh "use bytes; m{" . $rule . "}" . $mods
542							or die "error writing to $tmpf: $!";
543	41	50				1874	close $tmpfh or die "error closing $tmpf: $!";
544
545	41					410	my $perl = $self->get_perl();
546	41					148	local *IN;
547	41	50				159795	open (IN, "$perl -c -Mre=debug $tmpf 2>&1 \|")
548							or die "cannot run $perl: ".exit_status_str($?,$!);
549
550	41					597	my($inbuf,$nread,$fullstr); $fullstr = '';
	41					368
551	41					348142	while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
	41					1260
552	41	50				236	defined $nread or die "error reading from pipe: $!";
553
554	41	50				3865	unlink $tmpf or die "cannot unlink $tmpf: $!";
555	41	50				1690	close IN or die "error closing pipe: $!";
556	41	50				426	defined $fullstr or warn "empty result from a pipe";
557
558							# now parse the -Mre=debug output.
559							# perl 5.10 format
560	41					2161	$fullstr =~ s/^.*\nFinal program:\n//gs;
561							# perl 5.6/5.8 format
562	41					247	$fullstr =~ s/^(?:.\n\|)size \d[^\n]\n//gs;
563	41					173	$fullstr =~ s/^(?:.\n\|)first at \d[^\n]\n//gs;
564							# common to all
565	41					107	$fullstr =~ s/\nOffsets:.*$//gs;
566
567							# clean up every other line that doesn't start with a space
568	41					1305	$fullstr =~ s/^\S.*$//gm;
569
570	41	50				1019	if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
571	0					0	die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
572							}
573	41					949	my $opsstr = $1;
574
575							# what's left looks like this:
576							# 1: EXACTF <v>(3)
577							# 3: ANYOF[1ILil](14)
578							# 14: EXACTF <a>(16)
579							# 16: CURLY {2,7}(29)
580							# 18: ANYOF[A-Za-z](0)
581							# 29: SPACE(30)
582							# 30: EXACTF <http://>(33)
583							# 33: END(0)
584							#
585	41					136	DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
586
587	41					95	my @ops;
588	41					2171	foreach my $op (split(/\n/s, $opsstr)) {
589	439	50				1239	next unless $op;
590
591	439	100				3899	if ($op =~ /^\s+\d+: (\s)([A-Z]\w+)\b(.?)\s*(?:$\d+$)?$/) {
		100
		50
		0
592							# perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
593							# perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
594	321					6180	push @ops, [ $1, $2, $3 ];
595							}
596							elsif ($op =~ /^ (\s)<(.)>\.\.\.\s*$/) {
597							# 5: TRIE-EXACT[im](44)
598							# <message contained attachments that have been blocked by guin>...
599	4					49	my $spcs = $1;
600							# we could use the entire length here, but it's easier to trim to
601							# the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
602							# suite results will match, since the sa-update --list extraction will
603							# be the same for all versions. (The "..." trailer is important btw)
604	4					83	my $str = substr ($2, 0, 55);
605	4					97	push @ops, [ $spcs, '_moretrie', "<$str...>" ];
606							}
607							elsif ($op =~ /^ (\s)(<.>)\s*(?:$\d+$)?$/) {
608							# 5: TRIE-EXACT[am](21)
609							# <am> (21)
610							# <might> (12)
611	114					1411	push @ops, [ $1, '_moretrie', $2 ];
612							}
613							elsif ($op =~ /^ at .+ line \d+$/) {
614	0					0	next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
615							}
616							else {
617	0					0	warn "cannot parse '$op': $opsstr";
618	0					0	next;
619							}
620							}
621
622							# unroll the branches; returns a list of versions.
623							# e.g. /foo(bar\|baz)argh/ => [ "foobarargh", "foobazargh" ]
624	41					336	my @unrolled;
625	41	100				275	if ($main->{bases_split_out_alternations}) {
626	35					764	@unrolled = $self->unroll_branches(0, \@ops);
627							} else {
628	6					61	@unrolled = ( \@ops );
629							}
630
631							# now find the longest DFA-friendly string in each unrolled version
632	41					116	my @longests;
633	41					153	foreach my $opsarray (@unrolled) {
634	136					537	my $longestexact = '';
635	136					335	my $buf = '';
636
637							# use a closure to keep the code succinct
638							my $add_candidate = sub {
639	430	100		430		1249	if (length $buf > length $longestexact) { $longestexact = $buf; }
	159					366
640	430					699	$buf = '';
641	136					1146	};
642
643	136					396	my $prevop;
644	136					180	foreach my $op (@{$opsarray}) {
	136					277
645	635					966	my ($spcs, $item, $args) = @{$op};
	635					1844
646
647	635	100				1559	next if ($item eq 'NOTHING');
648
649							# EXACT == case-sensitive
650							# EXACTF == case-i
651							# we can do both, since we canonicalize to lc.
652	626	100	100			8942	if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
		100	66
		50	100
		100	66
			66
			100
			66
			33
			33
			0
			66
			100
			66
			66
			33
			33
653							{
654	142					589	my $str = $1;
655	142					395	$buf .= $str;
656	142	100				659	if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
657							# a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop
658	1					28	$add_candidate->();
659							}
660	142	50	33			617	if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
661							# perl 5.8.x truncates with a "..." here! cut and stop
662	0					0	$add_candidate->();
663							}
664							}
665							# _moretrie == a TRIE-EXACT entry
666							elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
667							{
668	190					494	$buf .= $1;
669	190	100	66			670	if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
670							# perl 5.8.x truncates with a "..." here! cut and stop
671	4					37	$add_candidate->();
672							}
673							}
674							# /(?:foo\|bar\|baz){2}/ results in a CURLYX beforehand
675							elsif ($item =~ /^EXACT/ &&
676							$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
677							$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
678							$args =~ /<(.*)>/)
679							{
680	0					0	$buf .= $1;
681	0	0	0			0	if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
682							# perl 5.8.x truncates with a "..." here! cut and stop
683	0					0	$add_candidate->();
684							}
685							}
686							# CURLYX, for perl >= 5.9.5
687							elsif ($item =~ /^_moretrie/ &&
688							$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
689							$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
690							$args =~ /<(.*)>/)
691							{
692	5					44	$buf .= $1;
693	5	50	33			38	if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
694							# perl 5.8.x truncates with a "..." here! cut and stop
695	0					0	$add_candidate->();
696							}
697							}
698							else {
699							# not an /^EXACT/; clear the buffer
700	289					682	$add_candidate->();
701	289	100				1280	if ($item !~ /^(?:END\|CLOSE\d\|MINMOD)$/)
702							{
703	173					310	$lossy = 1;
704	173					241	DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
705							}
706							}
707	626					1550	$prevop = $op;
708							}
709	136					348	$add_candidate->();
710
711	136	100				296	if (!$longestexact) {
712	1					90	die "no long-enough string found in $rawrule\n";
713							# all unrolled versions must have a long string, otherwise
714							# we cannot reliably match all variants of the rule
715							} else {
716	135	100				1263	push @longests, ($main->{bases_must_be_casei}) ?
717							lc $longestexact : $longestexact;
718							}
719							}
720
721	40					74	DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
722	40					2451	return ($lossy, @longests);
723							}
724
725							###########################################################################
726
727							sub unroll_branches {
728	187			187	0	492	my ($self, $depth, $opslist) = @_;
729
730	187	50				384	die "too deep" if ($depth++ > 5);
731
732	187					289	my @ops = (@{$opslist}); # copy
	187					659
733	187					520	my @pre_branch_ops;
734							my $branch_spcs;
735	187					0	my $trie_spcs;
736	187					0	my $open_spcs;
737
738							# our input looks something like this 2-level structure:
739							# 1: BOUND(2)
740							# 2: EXACT <Dear >(5)
741							# 5: BRANCH(9)
742							# 6: EXACT <IT>(8)
743							# 8: NALNUM(24)
744							# 9: BRANCH(23)
745							# 10: EXACT <Int>(12)
746							# 12: BRANCH(14)
747							# 13: NOTHING(21)
748							# 14: BRANCH(17)
749							# 15: EXACT <a>(21)
750							# 17: BRANCH(20)
751							# 18: EXACT <er>(21)
752							# 20: TAIL(21)
753							# 21: EXACT <net>(24)
754							# 23: TAIL(24)
755							# 24: EXACT < shop>(27)
756							# 27: END(0)
757							#
758							# or:
759							#
760							# 1: OPEN1(3)
761							# 3: BRANCH(6)
762							# 4: EXACT <v>(9)
763							# 6: BRANCH(9)
764							# 7: EXACT <\\/>(9)
765							# 9: CLOSE1(11)
766							# 11: CURLY {2,5}(14)
767							# 13: REG_ANY(0)
768							# 14: EXACT < g r a >(17)
769							# 17: ANYOF[a-z](28)
770							# 28: END(0)
771							#
772							# or:
773							#
774							# 1: EXACT <i >(3)
775							# 3: OPEN1(5)
776							# 5: TRIE-EXACT[am](21)
777							# <am> (21)
778							# <might> (12)
779							# 12: OPEN2(14)
780							# 14: TRIE-EXACT[ ](19)
781							# < be>
782							# <>
783							# 19: CLOSE2(21)
784							# 21: CLOSE1(23)
785							# 23: EXACT < c>(25)
786
787	187					238	DEBUG_RE_PARSING and warn "starting parse";
788
789							# this happens for /foo\|bar/ instead of /(?:foo\|bar)/ ; transform
790							# it into the latter. bit of a kludge to do this before the loop, but hey.
791							# note that it doesn't fix the CLOSE1/END ordering to be correct
792	187	50	66			1093	if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
793	0					0	my @newops = ([ "", "OPEN1", "" ]);
794	0					0	foreach my $op (@ops) {
795	0					0	push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
796							}
797	0					0	push @newops, [ "", "CLOSE1", "" ];
798	0					0	@ops = @newops;
799							}
800
801							# iterate until we start a branch set. using
802							# /dkjfksl(foo\|bar(baz\|argh)boo)gab/ as an example, we're at "dkj..."
803							# just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
804							# or a BRANCH, then unroll the most recent OPEN set.
805	187					267	while (1) {
806	968					1189	my $op = shift @ops;
807	968	100				1550	last unless defined $op;
808
809	838					926	my ($spcs, $item, $args) = @{$op};
	838					2314
810	838					966	DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
811
812	838	100	100			4907	if ($item =~ /^OPEN/) {
		100
		100
		100
		100
813	54					202	$open_spcs = $spcs;
814	54					204	next; # next will be a BRANCH or TRIE
815
816							} elsif ($item =~ /^TRIE/) {
817	49					157	$trie_spcs = $spcs;
818	49					153	last;
819
820							} elsif ($item =~ /^BRANCH/) {
821	8					63	$branch_spcs = $spcs;
822	8					58	last;
823
824							} elsif ($item =~ /^EXACT/ && defined $open_spcs) {
825							# perl 5.9.5 does this; f(o\|oish) => OPEN, EXACT, TRIE-EXACT
826	7					96	push @pre_branch_ops, [ $open_spcs, $item, $args ];
827	7					105	next;
828
829							} elsif (defined $open_spcs) {
830							# OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
831							# ignore this OPEN block entirely and don't try to unroll it
832	2					11	undef $open_spcs;
833
834							} else {
835	718					1696	push @pre_branch_ops, $op;
836							}
837							}
838
839							# no branches found? we're done unrolling on this one!
840	187	100				441	if (scalar @ops == 0) {
841	130					657	return [ @pre_branch_ops ];
842							}
843
844							# otherwise we're at the start of a new branch set
845							# /(foo\|bar(baz\|argh)boo)gab/
846	57					109	my @alts;
847							my @in_this_branch;
848
849	57					87	DEBUG_RE_PARSING and warn "entering branch: ".
850							"open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
851							"branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
852							"trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
853
854							# indentation level to remove from "normal" ops (using a s///)
855	57	100				450	my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
856	57					314	my $trie_sub_spcs = "";
857	57					99	while (1) {
858	387					573	my $op = shift @ops;
859	387	50				789	last unless defined $op;
860	387					495	my ($spcs, $item, $args) = @{$op};
	387					1330
861	387					464	DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
862
863	387	100	100			5484	if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
		50	66
		100	66
		100	33
		100	100
			100
			100
			66
864	9					42	push @alts, [ @pre_branch_ops, @in_this_branch ];
865	9					30	@in_this_branch = ();
866	9					30	$open_sub_spcs = $branch_spcs." ";
867	9					32	$trie_sub_spcs = "";
868	9					33	next;
869							}
870							elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
871	0					0	push @alts, [ @pre_branch_ops, @in_this_branch ];
872	0					0	undef $branch_spcs;
873	0					0	$open_sub_spcs = "";
874	0					0	$trie_sub_spcs = "";
875	0					0	last;
876							}
877							elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
878	135	100				308	if (scalar @in_this_branch > 0) {
879	86					238	push @alts, [ @pre_branch_ops, @in_this_branch ];
880							}
881							# use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
882	135					711	@in_this_branch = ( [ $open_spcs, $item, $args ] );
883	135	50				318	$open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
884	135					253	$trie_sub_spcs = " ";
885	135					382	next;
886							}
887							elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
888	52					230	push @alts, [ @pre_branch_ops, @in_this_branch ];
889	52					101	undef $branch_spcs;
890	52					110	undef $open_spcs;
891	52					106	undef $trie_spcs;
892	52					132	$open_sub_spcs = "";
893	52					102	$trie_sub_spcs = "";
894	52					160	last;
895							}
896							elsif ($item eq 'END') { # of string
897	5					32	push @alts, [ @pre_branch_ops, @in_this_branch ];
898	5					27	undef $branch_spcs;
899	5					18	undef $open_spcs;
900	5					16	undef $trie_spcs;
901	5					17	$open_sub_spcs = "";
902	5					15	$trie_sub_spcs = "";
903	5					26	last;
904							}
905							else {
906	186	50				440	if ($open_sub_spcs) {
907							# deindent the space-level to match the opening brace
908	186					1321	$spcs =~ s/^$open_sub_spcs//;
909							# tries also add one more indent level in
910	186					1016	$spcs =~ s/^$trie_sub_spcs//;
911							}
912	186					1389	push @in_this_branch, [ $spcs, $item, $args ];
913							# note that we ignore ops at a deeper $spcs level entirely (until later!)
914							}
915							}
916
917	57	50				130	if (defined $branch_spcs) {
918	0					0	die "fell off end of string with a branch open: '$branch_spcs'";
919							}
920
921							# we're now after the branch set: /gab/
922							# @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz\|argh)boo/ ]
923	57					188	foreach my $alt (@alts) {
924	152					231	push @{$alt}, @ops; # add all remaining ops to each one
	152					568
925							# note that this could include more (?:...); we don't care, since
926							# those can be handled by recursing
927							}
928
929							# ok, parsed the entire ops list
930							# @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz\|argh)boogab/ ]
931
932	57					96	if (DEBUG_RE_PARSING) {
933							print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
934							}
935
936							# now recurse, to unroll the remaining branches (if any exist)
937	57					107	my @rets;
938	57					103	foreach my $alt (@alts) {
939	152					672	push @rets, $self->unroll_branches($depth, $alt);
940							}
941
942	57					79	if (DEBUG_RE_PARSING) {
943							print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
944							}
945
946	57					420	return @rets;
947							}
948
949							###########################################################################
950
951							sub test {
952	0			0	0	0	my ($self) = @_;
953
954	0					0	$self->test_split_alt("foo", "/foo/");
955	0					0	$self->test_split_alt("(foo)", "/foo/");
956	0					0	$self->test_split_alt("foo(bar)baz", "/foobarbaz/");
957	0					0	$self->test_split_alt("x(foo\|)", "/xfoo/ /x/");
958	0					0	$self->test_split_alt("fo(o\|)", "/foo/ /fo/");
959	0					0	$self->test_split_alt("(foo\|bar)", "/foo/ /bar/");
960	0					0	$self->test_split_alt("foo\|bar", "/foo/ /bar/");
961	0					0	$self->test_split_alt("foo (bar\|baz) argh", "/foo bar argh/ /foo baz argh/");
962	0					0	$self->test_split_alt("foo (bar\|baz\|bl(arg\|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
963	0					0	$self->test_split_alt("(s(otc\|tco)k)", "/sotck/ /stcok/");
964	0					0	$self->test_split_alt("(business partner(s\|ship\|)\|silent partner(s\|ship\|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
965							}
966
967							sub test_split_alt {
968	0			0	0	0	my ($self, $in, $out) = @_;
969
970	0					0	my @got = $self->split_alt($in);
971	0					0	$out =~ s/^\///;
972	0					0	$out =~ s/\/$//;
973	0					0	my @want = split(/\/ \//, $out);
974
975	0					0	my $failed = 0;
976	0	0				0	if (scalar @want != scalar @got) {
977	0					0	warn "FAIL: results count don't match";
978	0					0	$failed++;
979							}
980							else {
981	0					0	my %got = map { $_ => 1 } @got;
	0					0
982	0					0	foreach my $w (@want) {
983	0	0				0	if (!$got{$w}) {
984	0					0	warn "FAIL: '$w' not found";
985	0					0	$failed++;
986							}
987							}
988							}
989
990	0	0				0	if ($failed) {
991	0	0				0	print "want: /".join('/ /', @want)."/\n" or die "error writing: $!";
992	0	0				0	print "got: /".join('/ /', @got)."/\n" or die "error writing: $!";
993	0					0	return 0;
994							} else {
995	0	0				0	print "ok\n" or die "error writing: $!";
996	0					0	return 1;
997							}
998							}
999
1000							###########################################################################
1001
1002							sub get_perl {
1003	41			41	0	120	my ($self) = @_;
1004	41					81	my $perl;
1005
1006							# allow user override of the perl interpreter to use when
1007							# extracting base strings.
1008							# TODO: expose this via sa-compile command-line option
1009	41					120	my $fromconf = $self->{main}->{conf}->{re_parser_perl};
1010
1011	41	50				396	if ($fromconf) {
		50
1012	0					0	$perl = $fromconf;
1013							} elsif ($^X =~ m\|^/\|) {
1014	41					174	$perl = $^X;
1015							} else {
1016	1			1		24	use Config;
	1					3
	1					1210
1017	0					0	$perl = $Config{perlpath};
1018	0					0	$perl =~ s\|/[^/]*$\|/$^X\|;
1019							}
1020	41					166	untaint_var(\$perl);
1021	41					243	return $perl;
1022							}
1023
1024							###########################################################################
1025
1026							sub read_cachefile {
1027	0			0	0	0	my ($self, $cachefile) = @_;
1028	0					0	local *IN;
1029	0	0				0	if (open(IN, "<".$cachefile)) {
1030	0					0	my($inbuf,$nread,$str); $str = '';
	0					0
1031	0					0	while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }
	0					0
1032	0	0				0	defined $nread or die "error reading from $cachefile: $!";
1033	0	0				0	close IN or die "error closing $cachefile: $!";
1034
1035	0					0	untaint_var(\$str);
1036	0					0	my $VAR1; # Data::Dumper
1037	0	0				0	if (eval $str) {
1038	0					0	return $VAR1; # Data::Dumper's naming
1039							}
1040							}
1041	0					0	return { };
1042							}
1043
1044							sub write_cachefile {
1045	0			0	0	0	my ($self, $cachefile, $cached) = @_;
1046
1047	0					0	my $dump = Data::Dumper->new ([ $cached ]);
1048	0					0	$dump->Deepcopy(1);
1049	0					0	$dump->Purity(1);
1050	0					0	$dump->Indent(1);
1051
1052	0					0	my $cachedir = $self->{main}->{bases_cache_dir};
1053	0	0				0	if (mkdir($cachedir)) {
		0
1054							# successfully created
1055							} elsif ($! == EEXIST) {
1056	0					0	dbg("zoom: ok, cache directory already existed");
1057							} else {
1058	0					0	warn "zoom: could not create cache directory: $cachedir ($!)\n";
1059	0					0	return;
1060							}
1061	0	0				0	open(CACHE, ">$cachefile") or warn "cannot write to $cachefile";
1062	0	0				0	print CACHE ($dump->Dump, ";1;") or die "error writing: $!";
1063	0	0				0	close CACHE or die "error closing $cachefile: $!";
1064							}
1065
1066							=over 4
1067
1068							=item my ($cleanregexp) = fixup_re($regexp);
1069
1070							Converts encoded characters in a regular expression pattern into their
1071							equivalent characters
1072
1073							=back
1074
1075							=cut
1076
1077							sub fixup_re {
1078	135			135	1	331	my $re = shift;
1079
1080	135	0				413	if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" }
	0	50				0
1081
1082	135					592	my $output = "";
1083	135					1147	my $TOK = qr([\"\\]);
1084
1085	135					220	my $STATE;
1086	135					656	local ($1,$2);
1087	135					1207	while ($re =~ /\G(.*?)($TOK)/gcs) {
1088	0					0	my $pre = $1;
1089	0					0	my $tok = $2;
1090
1091	0	0				0	if (length($pre)) {
1092	0					0	$output .= "\"$pre\"";
1093							}
1094
1095	0	0				0	if ($tok eq '"') {
		0
1096	0					0	$output .= '"\\""';
1097							}
1098							elsif ($tok eq '\\') {
1099	0	0				0	$re =~ /\G(x\{[^\}]+\}\|[0-7]{1,3}\|.)/gcs or die "\\ at end of string!";
1100	0					0	my $esc = $1;
1101	0	0				0	if ($esc eq '"') {
		0
		0
		0
1102	0					0	$output .= '"\\""';
1103							} elsif ($esc eq '\\') {
1104	0					0	$output .= '"BACKSLASH"'; # avoid hairy escape-parsing
1105							} elsif ($esc =~ /^x\{(\S+)\}\z/) {
1106	0					0	$output .= '"'.chr(hex($1)).'"';
1107							} elsif ($esc =~ /^[0-7]{1,3}\z/) {
1108	0					0	$output .= '"'.chr(oct($esc)).'"';
1109							} else {
1110	0					0	$output .= "\"$esc\"";
1111							}
1112							}
1113							else {
1114	0	0				0	print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!";
1115							}
1116							}
1117
1118	135	50				375	if (!defined(pos($re))) {
		0
1119							# no matches
1120	135					661	$output .= "\"$re\"";
1121							# Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)
1122	135					491	$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
	0					0
1123							}
1124							elsif (pos($re) <= length($re)) {
1125	0					0	$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
	0					0
1126	0					0	$output .= fixup_re(substr($re, pos($re)));
1127							}
1128
1129	135					265	$output =~ s/^""/"/; # protect start and end quotes
1130	135					195	$output =~ s/(?<!\\)""\z/"/;
1131	135					191	$output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
1132	135					171	$output =~ s/\\BACKSLASH\\/\\\\/gs;
1133
1134	135	0				235	if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" }
	0	50				0
1135	135					725	return $output;
1136							}
1137
1138							1;