File Coverage

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Criterion	Covered	Total	%
statement	400	523	76.4
branch	152	272	55.8
condition	92	151	60.9
subroutine	29	33	87.8
pod	3	14	21.4
total	676	993	68.0

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Copyright 2006 Apache Software Foundation
3							#
4							# Licensed under the Apache License, Version 2.0 (the "License");
5							# you may not use this file except in compliance with the License.
6							# You may obtain a copy of the License at
7							#
8							# http://www.apache.org/licenses/LICENSE-2.0
9							#
10							# Unless required by applicable law or agreed to in writing, software
11							# distributed under the License is distributed on an "AS IS" BASIS,
12							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13							# See the License for the specific language governing permissions and
14							# limitations under the License.
15							# </@LICENSE>
16
17							=head1 NAME
18
19							Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
20
21							=head1 SYNOPSIS
22
23							This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
24							suitable for use in Rule2XSBody rules or other parallel matching algorithms.
25
26							=cut
27
28
29							use Mail::SpamAssassin::Plugin;
30	1			1		8	use Mail::SpamAssassin::Logger;
	1					2
	1					27
31	1			1		4	use Mail::SpamAssassin::Util qw(untaint_var qr_to_string);
	1					2
	1					60
32	1			1		7	use Mail::SpamAssassin::Util::Progress;
	1					2
	1					52
33	1			1		306
	1					3
	1					39
34							use Errno qw(ENOENT EACCES EEXIST);
35	1			1		6	use Data::Dumper;
	1					2
	1					97
36	1			1		598
	1					5591
	1					53
37							use strict;
38	1			1		7	use warnings;
	1					2
	1					18
39	1			1		12	# use bytes;
	1					3
	1					27
40							use re 'taint';
41	1			1		5
	1					2
	1					26
42							# Not a constant hashref for 5.6 compat
43							use constant SLOT_BASE => 0;
44	1			1		4	use constant SLOT_NAME => 1;
	1					2
	1					46
45	1			1		5	use constant SLOT_ORIG => 2;
	1					2
	1					36
46	1			1		6	use constant SLOT_LEN_BASE => 3;
	1					2
	1					46
47	1			1		6	use constant SLOT_BASE_INITIAL => 4;
	1					2
	1					37
48	1			1		5	use constant SLOT_HAS_MULTIPLE => 5;
	1					2
	1					49
49	1			1		6
	1					2
	1					51
50							use constant CLOBBER => '';
51	1			1		7
	1					2
	1					65
52							our @ISA = qw(Mail::SpamAssassin::Plugin);
53
54							use constant DEBUG_RE_PARSING => 0; # noisy!
55	1			1		7
	1					2
	1					4608
56							# a few settings that control what kind of bases are output.
57
58							# treat all rules as lowercase for purposes of term extraction?
59							# $main->{bases_must_be_casei} = 1;
60							# $main->{bases_can_use_alternations} = 0; # /(foo\|bar\|baz)/
61							# $main->{bases_can_use_quantifiers} = 0; # /foo.bar/ or /foobar/ or /foooo?bar/
62							# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
63							# $main->{bases_split_out_alternations} = 1; # /(foo\|bar\|baz)/ => ["foo", "bar", "baz"]
64							# $main->{base_quiet} = 0; # silences progress output
65
66							# TODO: it would be nice to have a clean API to pass such settings
67							# through to plugins instead of hanging them off $main
68
69							##############################################################################
70
71							# testing purposes only
72							my $fixup_re_test;
73							#$fixup_re_test = 1; fixup_re("fr()\|\\\\\|"); die;
74							#$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
75							#$fixup_re_test = 1; fixup_re("\\33\$b"); die;
76							#$fixup_re_test = 1; fixup_re("[link]"); die;
77							#$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;
78
79							###########################################################################
80
81							my $class = shift;
82							my $mailsaobject = shift;
83	15			15	1	122	$class = ref($class) \|\| $class;
84	15					111	my $self = $class->SUPER::new($mailsaobject);
85	15		33			244	bless ($self, $class);
86	15					112
87	15					40	$self->{show_progress} = !$mailsaobject->{base_quiet};
88
89	15					79	# $self->test(); exit;
90							return $self;
91							}
92	15					136
93							###########################################################################
94
95							my ($self, $params) = @_;
96							my $conf = $params->{conf};
97							$self->extract_bases($conf);
98	15			15	1	46	}
99	15					43
100	15					63	my ($self, $conf) = @_;
101
102							my $main = $conf->{main};
103							if (!$main->{base_extract}) { return; }
104	15			15	0	47
105							$self->{show_progress} and
106	15					40	info("base extraction starting. this can take a while...");
107	15	50				56
	0					0
108							$self->extract_set($conf, $conf->{body_tests}, 'body');
109							}
110	15	50				53
111							my ($self, $conf, $test_set, $ruletype) = @_;
112	15					102
113							foreach my $pri (keys %{$test_set}) {
114							my $nicepri = $pri; $nicepri =~ s/-/neg/g;
115							$self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
116	15			15	0	99	}
117							}
118	15					32
	15					119
119	15					40	###########################################################################
	15					137
120	15					113
121							my ($self, $conf, $rules, $ruletype) = @_;
122
123							my @good_bases;
124							my @failed;
125							my $yes = 0;
126							my $no = 0;
127	15			15	0	53	my $count = 0;
128							my $start = time;
129	15					33	$self->{main} = $conf->{main}; # for use in extract_hints()
130							$self->{show_progress} and info ("extracting from rules of type $ruletype");
131	15					32	my $tflags = $conf->{tflags};
132	15					75
133	15					25	# attempt to find good "base strings" (simplified regexp subsets) for each
134	15					28	# regexp. We try looking at the regexp from both ends, since there
135	15					42	# may be a good long string of text at the end of the rule.
136	15	50				40
137	15					54	# require this many chars in a base string + delimiters for it to be viable
138							my $min_chars = 5;
139
140							my $progress;
141							$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
142							total => (scalar keys %{$rules} \|\| 1),
143							itemtype => 'rules',
144	15					32	});
145
146	15					40	my $cached = { };
147							my $cachefile;
148	15	50	0			52
149							if ($self->{main}->{bases_cache_dir}) {
150							$cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
151							dbg("zoom: reading cache file $cachefile");
152	15					35	$cached = $self->read_cachefile($cachefile);
153	15					32	}
154
155	15	50				55	NEXT_RULE:
156	0					0	foreach my $name (keys %{$rules}) {
157	0					0	$self->{show_progress} and $progress and $progress->update(++$count);
158	0					0
159							#my $rule = $rules->{$name};
160							my $rule = qr_to_string($conf->{test_qrs}->{$name});
161							if (!defined $rule) {
162	15					31	die "zoom: error: regexp for $rule not found\n";
	15					66
163	42	50	33			325	}
164							my $cachekey = $name.'#'.$rule;
165
166	42					965	my $cent = $cached->{rule_bases}->{$cachekey};
167	42	50				148	if (defined $cent) {
168	0					0	if (defined $cent->{g}) {
169							dbg("zoom: YES (cached) $rule $name");
170	42					511	foreach my $ent (@{$cent->{g}}) {
171							# note: we have to copy these, since otherwise later
172	42					175	# modifications corrupt the cached data
173	42	50				128	push @good_bases, {
174	0	0				0	base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
175	0					0	};
176	0					0	}
	0					0
177							$yes++;
178							}
179							else {
180							dbg("zoom: NO (cached) $rule $name");
181	0					0	push @failed, { orig => $rule }; # no need to cache this
182							$no++;
183	0					0	}
184							next NEXT_RULE;
185							}
186	0					0
187	0					0	# ignore ReplaceTags rules
188	0					0	my $is_a_replacetags_rule = $conf->{replace_rules}->{$name};
189							my ($minlen, $lossy, @bases);
190	0					0
191							if (!$is_a_replacetags_rule) {
192							eval { # catch die()s
193							my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
194	42					116	($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
195	42					99	# dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));
196							1;
197	42	50				150	} or do {
198							my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
199	42					385	dbg("zoom: giving up on regexp: $eval_stat");
200	41					205	};
201
202	40					578	#if ($lossy && ($tflags->{$name}\|\|'') =~ /\bmultiple\b/) {
203	42	100				81	# warn "\nzoom: $vers rule $name will loop on SpamAssassin older than 3.3.2 ".
204	2	50				36	# "running under Perl 5.12 or older, Bug 6558\n";
	2					17
205	2					57	#}
206
207							# if any of the extracted hints in a set are too short, the entire
208							# set is invalid; this is because each set of N hints represents just
209							# 1 regexp.
210							foreach my $str (@bases) {
211							my $len = length fixup_re($str); # bug 6143: count decoded characters
212							if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
213							elsif (!defined($minlen) \|\| $len < $minlen) { $minlen = $len; }
214							}
215							}
216	42					413
217	135					941	if ($is_a_replacetags_rule \|\| !$minlen \|\| !@bases) {
218	135	100	100			1316	dbg("zoom: ignoring rule %s, %s", $name,
	3	100				56
	3					27
	3					24
219	62					267	$is_a_replacetags_rule ? 'is a replace rule'
220							: !@bases ? 'no bases' : 'no minlen');
221							push @failed, { orig => $rule };
222							$cached->{rule_bases}->{$cachekey} = { };
223	42	100	66			1012	$no++;
			66
224	5	50				132	}
		50
225							else {
226							# dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
227	5					81
228	5					70	# figure out if we have e.g. ["foo", "foob", "foobar"]; in this
229	5					38	# case, we only need to track ["foo"].
230							my %subsumed;
231							foreach my $base1 (@bases) {
232							foreach my $base2 (@bases) {
233							if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
234							$subsumed{$base1} = 1; # base2 is inside base1; discard the longer
235							}
236	37					113	}
237	37					106	}
238	132					431
239	2524	100	100			22754	my @forcache;
240	4					61	foreach my $base (@bases) {
241							next if $subsumed{$base};
242							push @good_bases, {
243							base => $base, orig => $rule, name => "$name,[l=$lossy]"
244							};
245	37					305	# separate copies for cache -- we modify the @good_bases entry
246	37					138	push @forcache, {
247	132	100				636	base => $base, orig => $rule, name => "$name,[l=$lossy]"
248	128					1736	};
249							}
250
251							$cached->{rule_bases}->{$cachekey} = { g => \@forcache };
252	128					1049	$yes++;
253							}
254							}
255
256							$self->{show_progress} and $progress and $progress->final();
257	37					645
258	37					399	dbg("zoom: $ruletype: found ".(scalar @good_bases).
259							" usable base strings in $yes rules, skipped $no rules");
260
261							# NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
262	15	50	33			279	# ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
263							# subsumes other patterns, we need to return hits for all of them. We also
264	15					638	# need to take care of the case where multiple regexps wind up sharing the
265							# same base.
266							#
267							# Another gotcha, an exception to the subsumption rule; if one pattern isn't
268							# entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
269							# returned as two hits, correctly. So we only have to be smart about the
270							# full-subsumption case; overlapping is taken care of for us, by re2c.
271							#
272							# TODO: there's a bug here. Since the code in extract_hints() has been
273							# modified to support more complex regexps, we can no longer simply assume
274							# that if pattern A is not contained in pattern B, that means that pattern B
275							# doesn't subsume it. Consider, for example, A="foo*bar" and
276							# B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
277							# that without running the A RE match itself somehow against B.
278							# same issue remains with:
279							#
280							# "foo?bar" / "fobar"
281							# "fo(?:o\|oo\|)bar" / "fobar"
282							# "fo(?:o\|oo)?bar" / "fobar"
283							# "fo(?:o*\|baz)bar" / "fobar"
284							# "(?:fo(?:o*\|baz)bar\|blargh)" / "fobar"
285							#
286							# it's worse with this:
287							#
288							# "fo(?:o\|oo\|)bar" / "foo*bar"
289							#
290							# basically, this is impossible to compute without reimplementing most of
291							# re2c, and it appears the re2c developers don't plan to offer this:
292							# https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
293
294							$conf->{base_orig}->{$ruletype} = { };
295							$conf->{base_string}->{$ruletype} = { };
296
297							$count = 0;
298							$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
299							total => (scalar @good_bases \|\| 1),
300	15					232	itemtype => 'bases',
301	15					144	});
302
303	15					43	# this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases
304	15	50	0			60	# array -- into a more efficient format, using arrays and with a little
305							# bit of precomputation, to go (quite a bit) faster
306							my @rewritten;
307							foreach my $set1 (@good_bases) {
308							my $base = $set1->{base};
309							next if (!$base \|\| !$set1->{name});
310							push @rewritten, [
311							$base, # 0 - SLOT_BASE
312	15					62	$set1->{name}, # 1 - SLOT_NAME
313	15					61	$set1->{orig}, # 2 - SLOT_ORIG
314	128					462	length $base, # 3 - SLOT_LEN_BASE
315	128	50	33			702	$base, # 4 - SLOT_BASE_INITIAL
316							0 # 5 - SLOT_HAS_MULTIPLE, has_multiple flag
317							];
318							}
319
320	128					1017	@good_bases = sort {
321							$b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] \|\|
322							$a->[SLOT_BASE] cmp $b->[SLOT_BASE] \|\|
323							$a->[SLOT_NAME] cmp $b->[SLOT_NAME] \|\|
324							$a->[SLOT_ORIG] cmp $b->[SLOT_ORIG]
325							} @rewritten;
326
327	15	50	100			248
	418		100			2105
328							my $base_orig = $conf->{base_orig}->{$ruletype};
329							my $next_base_position = 0;
330							for my $set1 (@good_bases) {
331							$next_base_position++;
332							$self->{show_progress} and $progress and $progress->update(++$count);
333							my $base1 = $set1->[SLOT_BASE] or next; # got clobbered
334	15					77	my $name1 = $set1->[SLOT_NAME];
335	15					48	my $orig1 = $set1->[SLOT_ORIG];
336	15					70	my $len1 = $set1->[SLOT_LEN_BASE];
337	128					297	$base_orig->{$name1} = $orig1;
338	128	50	33			342
339	128	100				481	foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # order from smallest to largest
340	104					270	# clobber false and exact dups; this can happen if a regexp outputs the
341	104					142	# same base string multiple times
342	104					209	if (!$set2->[SLOT_BASE] \|\|
343	104					353	(
344							$base1 eq $set2->[SLOT_BASE] &&
345	104					440	$name1 eq $set2->[SLOT_NAME] &&
346							$orig1 eq $set2->[SLOT_ORIG]
347							)
348	1685	100	100			6466	)
			66
			66
349							{
350							#dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]");
351							$set2->[SLOT_BASE] = CLOBBER; # clobber
352							next;
353							}
354
355							# Cannot be a subset if it does not contain the other base string
356							next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1;
357	24					75
358	24					130	# skip if either already contains the other rule's name
359							# optimize: this can only happen if the base has more than
360							# one rule already attached, ie [5]
361							next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: \|^)\Q$name1\E(?: \|$)/);
362	1661	100				6358
363							# don't use $name1 here, since another base in the set2 loop
364							# may have added $name2 since we set that
365							next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: \|^)\Q$set2->[SLOT_NAME]\E(?: \|$)/);
366
367	33	0	33			107	# $set2->[SLOT_BASE] is just a subset of base1
			33
368							#dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]");
369							$set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME];
370							$set1->[SLOT_HAS_MULTIPLE] = 1;
371	33	50	66			104	}
			33
372							}
373
374							# we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
375	33					141	# both contain "killed" for example, pointing at different rules, which
376	33					102	# the above search hasn't found. Collapse them here with a hash
377							my %bases;
378							foreach my $set (@good_bases) {
379							my $base = $set->[0];
380							next unless $base;
381
382							if (defined $bases{$base}) {
383	15					63	$bases{$base} .= " ".$set->[1];
384	15					82	} else {
385	128					399	$bases{$base} = $set->[1];
386	128	100				355	}
387							}
388	104	100				251	undef @good_bases;
389	12					66
390							my $base_string = $conf->{base_string}->{$ruletype};
391	92					709	foreach my $base (keys %bases) {
392							# uniq the list, since there are probably dup rules listed
393							my %u;
394	15					94	for my $i (split ' ', $bases{$base}) {
395							next if exists $u{$i}; undef $u{$i};
396	15					56	}
397	15					139	$base_string->{$base} = join ' ', sort keys %u;
398							}
399	92					202
400	92					287	$self->{show_progress} and $progress and $progress->final();
401	137	100				387
	120					307
402							if ($cachefile) {
403	92					478	$self->write_cachefile ($cachefile, $cached);
404							}
405
406	15	50	33			96	my $elapsed = time - $start;
407							$self->{show_progress} and info ("$ruletype: ".
408	15	50				48	(scalar keys %{$conf->{base_string}->{$ruletype}}).
409	0					0	" base strings extracted in $elapsed seconds\n");
410							}
411
412	15					51	###########################################################################
413
414	15	50				1310	# TODO:
	0					0
415							# NO /no.{1,10}P(?:er\|re)scription.{1,10}(?:needed\|require\|necessary)/i
416							# => should extract 'scription' somehow
417							# /time to refinance\|refinanc\w{1,3}\b.{0,16}\bnow\b/i
418							# => should understand alternations; tricky
419
420							my $self = shift;
421							my $rule = shift;
422
423							my $main = $self->{main};
424
425
426							my $mods = '';
427	42			42	0	279
428	42					176	# remove the regexp modifiers, keep for later
429							while ($rule =~ s/^$\?([a-z]*)$//) {
430	42					138	$mods .= $1;
431							}
432
433	42					163	# modifier removal
434							while ($rule =~ s/^$\?-([a-z]*)$//) {
435							foreach my $modchar (split '', $mods) {
436	42					550	$mods =~ s/$modchar//g;
437	20					127	}
438							}
439
440							my $lossy = 0;
441	42					194
442	0					0	# now: simplify aspects of the regexp. Bear in mind that we can
443	0					0	# simplify as long as we cause the regexp to become more general;
444							# more hits is OK, since false positives will be discarded afterwards
445							# anyway. Simplification that causes the regexp to not hit
446							# stuff that the "real" rule would hit, however, is a bad thing.
447	42					74
448							if ($main->{bases_must_be_casei}) {
449							$rule = lc $rule;
450
451							$lossy = 1;
452							$mods =~ s/i// and $lossy = 0;
453
454							# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
455	42	100				183	$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and $lossy++;
456	37					163
457							# always case-i: /A(?-i:ct)/ => /Act/
458	37					79	$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs and $lossy++;
459	37	100				360
460							# remove (?i)
461							$rule =~ s/$\?i$//gs;
462	37	100				247	}
463							else {
464							die "case-i" if $rule =~ /$\?i$/;
465	37	50				123	die "case-i" if $mods =~ /i/;
466
467							# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
468	37					99	$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and die "case-i";
469
470							# we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/
471	5	50				26	$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs;
472	5	100				32	}
473
474							# remove /m and /s modifiers
475	4	50				21	$mods =~ s/m// and $lossy++;
476							$mods =~ s/s// and $lossy++;
477
478	4					10	# remove (^\|\b)'s
479							# T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS\|(^\|\b)EXTI($\|\b))/is
480							$rule =~ s/$\^\\|\\b$//gs and $lossy++;
481							$rule =~ s/$\$\\|\\b$//gs and $lossy++;
482	41	50				279	$rule =~ s/$\\b\\|\^$//gs and $lossy++;
483	41	100				282	$rule =~ s/$\\b\\|\$$//gs and $lossy++;
484
485							# remove (?!credit)
486							$rule =~ s/$\?\![^$]+\)//gs and $lossy++;
487	41	50				317
488	41	50				154	# remove \b's
489	41	50				163	$rule =~ s/(?<!\\)\\b//gs and $lossy++;
490	41	50				131
491							# remove the "?=" trick
492							# (?=[dehklnswxy])(horny\|nasty\|hot\|wild\|young\|....etc...)
493	41	100				170	$rule =~ s/$\?\=\[[^\]]+\]$//gs;
494
495							$mods .= "L" if $lossy;
496	41	100				187	($rule, $mods);
497							}
498
499							my $self = shift;
500	41					91	my $rawrule = shift;
501							my $rule = shift;
502	41	100				228	my $mods = shift;
503	41					393
504							my $main = $self->{main};
505							my $orig = $rule;
506
507	41			41	0	79	my $lossy = 0;
508	41					67	$mods =~ s/L// and $lossy++;
509	41					213
510	41					192	# if there are anchors, give up; we can't get much
511							# faster than these anyway
512	41					100	die "anchors" if $rule =~ /^\(?(?:\^\|\\A)/;
513	41					245
514							# die "anchors" if $rule =~ /(?:\$\|\\Z)\)?$/;
515	41					80	# just remove end-of-string anchors; they're slow so could gain
516	41	100				430	# from our speedup
517							$rule =~ s/(?<!\\)(?:\$\|\\Z)\)?$// and $lossy++;
518
519							# simplify (?:..) to (..)
520	41	50				254	$main->{bases_allow_noncapture_groups} or
521							$rule =~ s/\(\?:/\(/g;
522
523							# simplify some grouping arrangements so they're easier for us to parse
524							# (foo)? => (foo\|)
525	41	50				208	$rule =~ s/$(.*?)$\?/$$1\\|$/gs;
526							# r? => (r\|)
527							$rule =~ s/(?<!\\)(\w)\?/$$1\\|$/gs;
528
529	41	50				271	my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
530							$tmpfh or die "failed to create a temporary file";
531							untaint_var(\$tmpf);
532
533	41					203	print $tmpfh "use bytes; m{" . $rule . "}" . $mods
534							or die "error writing to $tmpf: $!";
535	41					185	close $tmpfh or die "error closing $tmpf: $!";
536
537	41					537	my $perl = $self->get_perl();
538	41	50				200	local *IN;
539	41					180	open (IN, "$perl -c -Mre=debug $tmpf 2>&1 \|")
540							or die "cannot run $perl: ".exit_status_str($?,$!);
541	41	50				751
542							my($inbuf,$nread,$fullstr); $fullstr = '';
543	41	50				2166	while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
544							defined $nread or die "error reading from pipe: $!";
545	41					443
546	41					207	unlink $tmpf or die "cannot unlink $tmpf: $!";
547	41	50				193994	close IN or die "error closing pipe: $!";
548							defined $fullstr or warn "empty result from a pipe";
549
550	41					707	# now parse the -Mre=debug output.
	41					412
551	41					385321	# perl 5.10 format
	41					1537
552	41	50				236	$fullstr =~ s/^.*\nFinal program:\n//gs;
553							# perl 5.6/5.8 format
554	41	50				6859	$fullstr =~ s/^(?:.\n\|)size \d[^\n]\n//gs;
555	41	50				2001	$fullstr =~ s/^(?:.\n\|)first at \d[^\n]\n//gs;
556	41	50				281	# common to all
557							$fullstr =~ s/\nOffsets:.*$//gs;
558
559							# clean up every other line that doesn't start with a space
560	41					2302	$fullstr =~ s/^\S.*$//gm;
561
562	41					275	if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
563	41					347	die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
564							}
565	41					182	my $opsstr = $1;
566
567							# what's left looks like this:
568	41					1037	# 1: EXACTF <v>(3)
569							# 3: ANYOF[1ILil](14)
570	41	50				1128	# 14: EXACTF <a>(16)
571	0					0	# 16: CURLY {2,7}(29)
572							# 18: ANYOF[A-Za-z](0)
573	41					1213	# 29: SPACE(30)
574							# 30: EXACTF <http://>(33)
575							# 33: END(0)
576							#
577							DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
578
579							my @ops;
580							foreach my $op (split(/\n/s, $opsstr)) {
581							next unless $op;
582
583							if ($op =~ /^\s+\d+: (\s)([A-Z]\w+)\b(.?)\s*(?:$\d+$)?$/) {
584							# perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
585	41					259	# perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
586							push @ops, [ $1, $2, $3 ];
587	41					91	}
588	41					2886	elsif ($op =~ /^ (\s)<(.)>\.\.\.\s*$/) {
589	439	50				1808	# 5: TRIE-EXACT[im](44)
590							# <message contained attachments that have been blocked by guin>...
591	439	100				4472	my $spcs = $1;
		100
		50
		0
592							# we could use the entire length here, but it's easier to trim to
593							# the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
594	321					7532	# suite results will match, since the sa-update --list extraction will
595							# be the same for all versions. (The "..." trailer is important btw)
596							my $str = substr ($2, 0, 55);
597							push @ops, [ $spcs, '_moretrie', "<$str...>" ];
598							}
599	4					38	elsif ($op =~ /^ (\s)(<.>)\s*(?:$\d+$)?$/) {
600							# 5: TRIE-EXACT[am](21)
601							# <am> (21)
602							# <might> (12)
603							push @ops, [ $1, '_moretrie', $2 ];
604	4					59	}
605	4					92	elsif ($op =~ /^ at .+ line \d+$/) {
606							next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
607							}
608							else {
609							warn "cannot parse '$op': $opsstr";
610							next;
611	114					1417	}
612							}
613
614	0					0	# unroll the branches; returns a list of versions.
615							# e.g. /foo(bar\|baz)argh/ => [ "foobarargh", "foobazargh" ]
616							my @unrolled;
617	0					0	if ($main->{bases_split_out_alternations}) {
618	0					0	@unrolled = $self->unroll_branches(0, \@ops);
619							} else {
620							@unrolled = ( \@ops );
621							}
622
623							# now find the longest DFA-friendly string in each unrolled version
624	41					370	my @longests;
625	41	100				336	foreach my $opsarray (@unrolled) {
626	35					742	my $longestexact = '';
627							my $buf = '';
628	6					75
629							# use a closure to keep the code succinct
630							my $add_candidate = sub {
631							if (length $buf > length $longestexact) { $longestexact = $buf; }
632	41					137	$buf = '';
633	41					132	};
634	136					561
635	136					400	my $prevop;
636							foreach my $op (@{$opsarray}) {
637							my ($spcs, $item, $args) = @{$op};
638
639	430	100		430		1396	next if ($item eq 'NOTHING');
	159					391
640	430					839
641	136					1331	# EXACT == case-sensitive
642							# EXACTF == case-i
643	136					258	# we can do both, since we canonicalize to lc.
644	136					178	if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
	136					342
645	635					1070	{
	635					2404
646							my $str = $1;
647	635	100				1777	$buf .= $str;
648							if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
649							# a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop
650							$add_candidate->();
651							}
652	626	100	100			10931	if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
		100	66
		50	100
		100	66
			66
			100
			66
			33
			33
			0
			66
			100
			66
			66
			33
			33
653							# perl 5.8.x truncates with a "..." here! cut and stop
654	142					853	$add_candidate->();
655	142					453	}
656	142	100				580	}
657							# _moretrie == a TRIE-EXACT entry
658	1					30	elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
659							{
660	142	50	33			657	$buf .= $1;
661							if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
662	0					0	# perl 5.8.x truncates with a "..." here! cut and stop
663							$add_candidate->();
664							}
665							}
666							# /(?:foo\|bar\|baz){2}/ results in a CURLYX beforehand
667							elsif ($item =~ /^EXACT/ &&
668	190					529	$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
669	190	100	66			754	$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
670							$args =~ /<(.*)>/)
671	4					28	{
672							$buf .= $1;
673							if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
674							# perl 5.8.x truncates with a "..." here! cut and stop
675							$add_candidate->();
676							}
677							}
678							# CURLYX, for perl >= 5.9.5
679							elsif ($item =~ /^_moretrie/ &&
680	0					0	$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
681	0	0	0			0	$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
682							$args =~ /<(.*)>/)
683	0					0	{
684							$buf .= $1;
685							if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
686							# perl 5.8.x truncates with a "..." here! cut and stop
687							$add_candidate->();
688							}
689							}
690							else {
691							# not an /^EXACT/; clear the buffer
692	5					34	$add_candidate->();
693	5	50	33			44	if ($item !~ /^(?:END\|CLOSE\d\|MINMOD)$/)
694							{
695	0					0	$lossy = 1;
696							DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
697							}
698							}
699							$prevop = $op;
700	289					1014	}
701	289	100				1307	$add_candidate->();
702
703	173					606	if (!$longestexact) {
704	173					272	die "no long-enough string found in $rawrule\n";
705							# all unrolled versions must have a long string, otherwise
706							# we cannot reliably match all variants of the rule
707	626					1809	} else {
708							push @longests, ($main->{bases_must_be_casei}) ?
709	136					315	lc $longestexact : $longestexact;
710							}
711	136	100				328	}
712	1					89
713							DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
714							return ($lossy, @longests);
715							}
716	135	100				1553
717							###########################################################################
718
719							my ($self, $depth, $opslist) = @_;
720
721	40					106	die "too deep" if ($depth++ > 5);
722	40					2812
723							my @ops = (@{$opslist}); # copy
724							my @pre_branch_ops;
725							my $branch_spcs;
726							my $trie_spcs;
727							my $open_spcs;
728	187			187	0	591
729							# our input looks something like this 2-level structure:
730	187	50				533	# 1: BOUND(2)
731							# 2: EXACT <Dear >(5)
732	187					283	# 5: BRANCH(9)
	187					673
733	187					609	# 6: EXACT <IT>(8)
734							# 8: NALNUM(24)
735	187					0	# 9: BRANCH(23)
736	187					0	# 10: EXACT <Int>(12)
737							# 12: BRANCH(14)
738							# 13: NOTHING(21)
739							# 14: BRANCH(17)
740							# 15: EXACT <a>(21)
741							# 17: BRANCH(20)
742							# 18: EXACT <er>(21)
743							# 20: TAIL(21)
744							# 21: EXACT <net>(24)
745							# 23: TAIL(24)
746							# 24: EXACT < shop>(27)
747							# 27: END(0)
748							#
749							# or:
750							#
751							# 1: OPEN1(3)
752							# 3: BRANCH(6)
753							# 4: EXACT <v>(9)
754							# 6: BRANCH(9)
755							# 7: EXACT <\\/>(9)
756							# 9: CLOSE1(11)
757							# 11: CURLY {2,5}(14)
758							# 13: REG_ANY(0)
759							# 14: EXACT < g r a >(17)
760							# 17: ANYOF[a-z](28)
761							# 28: END(0)
762							#
763							# or:
764							#
765							# 1: EXACT <i >(3)
766							# 3: OPEN1(5)
767							# 5: TRIE-EXACT[am](21)
768							# <am> (21)
769							# <might> (12)
770							# 12: OPEN2(14)
771							# 14: TRIE-EXACT[ ](19)
772							# < be>
773							# <>
774							# 19: CLOSE2(21)
775							# 21: CLOSE1(23)
776							# 23: EXACT < c>(25)
777
778							DEBUG_RE_PARSING and warn "starting parse";
779
780							# this happens for /foo\|bar/ instead of /(?:foo\|bar)/ ; transform
781							# it into the latter. bit of a kludge to do this before the loop, but hey.
782							# note that it doesn't fix the CLOSE1/END ordering to be correct
783							if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
784							my @newops = ([ "", "OPEN1", "" ]);
785							foreach my $op (@ops) {
786							push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
787	187					273	}
788							push @newops, [ "", "CLOSE1", "" ];
789							@ops = @newops;
790							}
791
792	187	50	66			1317	# iterate until we start a branch set. using
793	0					0	# /dkjfksl(foo\|bar(baz\|argh)boo)gab/ as an example, we're at "dkj..."
794	0					0	# just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
795	0					0	# or a BRANCH, then unroll the most recent OPEN set.
796							while (1) {
797	0					0	my $op = shift @ops;
798	0					0	last unless defined $op;
799
800							my ($spcs, $item, $args) = @{$op};
801							DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
802
803							if ($item =~ /^OPEN/) {
804							$open_spcs = $spcs;
805	187					374	next; # next will be a BRANCH or TRIE
806	968					1431
807	968	100				1901	} elsif ($item =~ /^TRIE/) {
808							$trie_spcs = $spcs;
809	838					1163	last;
	838					2854
810	838					1230
811							} elsif ($item =~ /^BRANCH/) {
812	838	100	100			5744	$branch_spcs = $spcs;
		100
		100
		100
		100
813	54					346	last;
814	54					260
815							} elsif ($item =~ /^EXACT/ && defined $open_spcs) {
816							# perl 5.9.5 does this; f(o\|oish) => OPEN, EXACT, TRIE-EXACT
817	49					181	push @pre_branch_ops, [ $open_spcs, $item, $args ];
818	49					160	next;
819
820							} elsif (defined $open_spcs) {
821	8					74	# OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
822	8					111	# ignore this OPEN block entirely and don't try to unroll it
823							undef $open_spcs;
824
825							} else {
826	7					115	push @pre_branch_ops, $op;
827	7					92	}
828							}
829
830							# no branches found? we're done unrolling on this one!
831							if (scalar @ops == 0) {
832	2					25	return [ @pre_branch_ops ];
833							}
834
835	718					2193	# otherwise we're at the start of a new branch set
836							# /(foo\|bar(baz\|argh)boo)gab/
837							my @alts;
838							my @in_this_branch;
839
840	187	100				433	DEBUG_RE_PARSING and warn "entering branch: ".
841	130					950	"open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
842							"branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
843							"trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
844
845							# indentation level to remove from "normal" ops (using a s///)
846	57					129	my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
847							my $trie_sub_spcs = "";
848							while (1) {
849	57					83	my $op = shift @ops;
850							last unless defined $op;
851							my ($spcs, $item, $args) = @{$op};
852							DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
853
854							if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
855	57	100				491	push @alts, [ @pre_branch_ops, @in_this_branch ];
856	57					268	@in_this_branch = ();
857	57					124	$open_sub_spcs = $branch_spcs." ";
858	387					614	$trie_sub_spcs = "";
859	387	50				796	next;
860	387					565	}
	387					1440
861	387					645	elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
862							push @alts, [ @pre_branch_ops, @in_this_branch ];
863	387	100	100			6135	undef $branch_spcs;
		50	66
		100	66
		100	33
		100	100
			100
			100
			66
864	9					71	$open_sub_spcs = "";
865	9					33	$trie_sub_spcs = "";
866	9					31	last;
867	9					29	}
868	9					45	elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
869							if (scalar @in_this_branch > 0) {
870							push @alts, [ @pre_branch_ops, @in_this_branch ];
871	0					0	}
872	0					0	# use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
873	0					0	@in_this_branch = ( [ $open_spcs, $item, $args ] );
874	0					0	$open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
875	0					0	$trie_sub_spcs = " ";
876							next;
877							}
878	135	100				390	elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
879	86					349	push @alts, [ @pre_branch_ops, @in_this_branch ];
880							undef $branch_spcs;
881							undef $open_spcs;
882	135					874	undef $trie_spcs;
883	135	50				345	$open_sub_spcs = "";
884	135					313	$trie_sub_spcs = "";
885	135					365	last;
886							}
887							elsif ($item eq 'END') { # of string
888	52					272	push @alts, [ @pre_branch_ops, @in_this_branch ];
889	52					133	undef $branch_spcs;
890	52					143	undef $open_spcs;
891	52					105	undef $trie_spcs;
892	52					152	$open_sub_spcs = "";
893	52					116	$trie_sub_spcs = "";
894	52					184	last;
895							}
896							else {
897	5					42	if ($open_sub_spcs) {
898	5					34	# deindent the space-level to match the opening brace
899	5					13	$spcs =~ s/^$open_sub_spcs//;
900	5					13	# tries also add one more indent level in
901	5					13	$spcs =~ s/^$trie_sub_spcs//;
902	5					22	}
903	5					28	push @in_this_branch, [ $spcs, $item, $args ];
904							# note that we ignore ops at a deeper $spcs level entirely (until later!)
905							}
906	186	50				471	}
907
908	186					1418	if (defined $branch_spcs) {
909							die "fell off end of string with a branch open: '$branch_spcs'";
910	186					972	}
911
912	186					1361	# we're now after the branch set: /gab/
913							# @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz\|argh)boo/ ]
914							foreach my $alt (@alts) {
915							push @{$alt}, @ops; # add all remaining ops to each one
916							# note that this could include more (?:...); we don't care, since
917	57	50				190	# those can be handled by recursing
918	0					0	}
919
920							# ok, parsed the entire ops list
921							# @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz\|argh)boogab/ ]
922
923	57					250	if (DEBUG_RE_PARSING) {
924	152					399	print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
	152					510
925							}
926
927							# now recurse, to unroll the remaining branches (if any exist)
928							my @rets;
929							foreach my $alt (@alts) {
930							push @rets, $self->unroll_branches($depth, $alt);
931							}
932	57					95
933							if (DEBUG_RE_PARSING) {
934							print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
935							}
936
937	57					123	return @rets;
938	57					112	}
939	152					597
940							###########################################################################
941
942	57					112	my ($self) = @_;
943
944							$self->test_split_alt("foo", "/foo/");
945							$self->test_split_alt("(foo)", "/foo/");
946	57					468	$self->test_split_alt("foo(bar)baz", "/foobarbaz/");
947							$self->test_split_alt("x(foo\|)", "/xfoo/ /x/");
948							$self->test_split_alt("fo(o\|)", "/foo/ /fo/");
949							$self->test_split_alt("(foo\|bar)", "/foo/ /bar/");
950							$self->test_split_alt("foo\|bar", "/foo/ /bar/");
951							$self->test_split_alt("foo (bar\|baz) argh", "/foo bar argh/ /foo baz argh/");
952	0			0	0	0	$self->test_split_alt("foo (bar\|baz\|bl(arg\|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
953							$self->test_split_alt("(s(otc\|tco)k)", "/sotck/ /stcok/");
954	0					0	$self->test_split_alt("(business partner(s\|ship\|)\|silent partner(s\|ship\|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
955	0					0	}
956	0					0
957	0					0	my ($self, $in, $out) = @_;
958	0					0
959	0					0	my @got = $self->split_alt($in);
960	0					0	$out =~ s/^\///;
961	0					0	$out =~ s/\/$//;
962	0					0	my @want = split(/\/ \//, $out);
963	0					0
964	0					0	my $failed = 0;
965							if (scalar @want != scalar @got) {
966							warn "FAIL: results count don't match";
967							$failed++;
968	0			0	0	0	}
969							else {
970	0					0	my %got = map { $_ => 1 } @got;
971	0					0	foreach my $w (@want) {
972	0					0	if (!$got{$w}) {
973	0					0	warn "FAIL: '$w' not found";
974							$failed++;
975	0					0	}
976	0	0				0	}
977	0					0	}
978	0					0
979							if ($failed) {
980							print "want: /".join('/ /', @want)."/\n" or die "error writing: $!";
981	0					0	print "got: /".join('/ /', @got)."/\n" or die "error writing: $!";
	0					0
982	0					0	return 0;
983	0	0				0	} else {
984	0					0	print "ok\n" or die "error writing: $!";
985	0					0	return 1;
986							}
987							}
988
989							###########################################################################
990	0	0				0
991	0	0				0	my ($self) = @_;
992	0	0				0	my $perl;
993	0					0
994							# allow user override of the perl interpreter to use when
995	0	0				0	# extracting base strings.
996	0					0	# TODO: expose this via sa-compile command-line option
997							my $fromconf = $self->{main}->{conf}->{re_parser_perl};
998
999							if ($fromconf) {
1000							$perl = $fromconf;
1001							} elsif ($^X =~ m\|^/\|) {
1002							$perl = $^X;
1003	41			41	0	178	} else {
1004	41					85	use Config;
1005							$perl = $Config{perlpath};
1006							$perl =~ s\|/[^/]*$\|/$^X\|;
1007							}
1008							untaint_var(\$perl);
1009	41					139	return $perl;
1010							}
1011	41	50				523
		50
1012	0					0	###########################################################################
1013
1014	41					220	my ($self, $cachefile) = @_;
1015							local *IN;
1016	1			1		9	if (open(IN, "<".$cachefile)) {
	1					3
	1					962
1017	0					0	my($inbuf,$nread,$str); $str = '';
1018	0					0	while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }
1019							defined $nread or die "error reading from $cachefile: $!";
1020	41					355	close IN or die "error closing $cachefile: $!";
1021	41					283
1022							untaint_var(\$str);
1023							my $VAR1; # Data::Dumper
1024							if (eval $str) {
1025							return $VAR1; # Data::Dumper's naming
1026							}
1027	0			0	0	0	}
1028	0					0	return { };
1029	0	0				0	}
1030	0					0
	0					0
1031	0					0	my ($self, $cachefile, $cached) = @_;
	0					0
1032	0	0				0
1033	0	0				0	my $dump = Data::Dumper->new ([ $cached ]);
1034							$dump->Deepcopy(1);
1035	0					0	$dump->Purity(1);
1036	0					0	$dump->Indent(1);
1037	0	0				0
1038	0					0	my $cachedir = $self->{main}->{bases_cache_dir};
1039							if (mkdir($cachedir)) {
1040							# successfully created
1041	0					0	} elsif ($! == EEXIST) {
1042							dbg("zoom: ok, cache directory already existed");
1043							} else {
1044							warn "zoom: could not create cache directory: $cachedir ($!)\n";
1045	0			0	0	0	return;
1046							}
1047	0					0	open(CACHE, ">$cachefile") or warn "cannot write to $cachefile";
1048	0					0	print CACHE ($dump->Dump, ";1;") or die "error writing: $!";
1049	0					0	close CACHE or die "error closing $cachefile: $!";
1050	0					0	}
1051
1052	0					0	=over 4
1053	0	0				0
		0
1054							=item my ($cleanregexp) = fixup_re($regexp);
1055
1056	0					0	Converts encoded characters in a regular expression pattern into their
1057							equivalent characters
1058	0					0
1059	0					0	=back
1060
1061	0	0				0	=cut
1062	0	0				0
1063	0	0				0	my $re = shift;
1064
1065							if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" }
1066
1067							my $output = "";
1068							my $TOK = qr([\"\\]);
1069
1070							my $STATE;
1071							local ($1,$2);
1072							while ($re =~ /\G(.*?)($TOK)/gcs) {
1073							my $pre = $1;
1074							my $tok = $2;
1075
1076							if (length($pre)) {
1077							$output .= "\"$pre\"";
1078	135			135	1	478	}
1079
1080	135	0				360	if ($tok eq '"') {
	0	50				0
1081							$output .= '"\\""';
1082	135					411	}
1083	135					1324	elsif ($tok eq '\\') {
1084							$re =~ /\G(x\{[^\}]+\}\|[0-7]{1,3}\|.)/gcs or die "\\ at end of string!";
1085	135					234	my $esc = $1;
1086	135					707	if ($esc eq '"') {
1087	135					1377	$output .= '"\\""';
1088	0					0	} elsif ($esc eq '\\') {
1089	0					0	$output .= '"BACKSLASH"'; # avoid hairy escape-parsing
1090							} elsif ($esc =~ /^x\{(\S+)\}\z/) {
1091	0	0				0	$output .= '"'.chr(hex($1)).'"';
1092	0					0	} elsif ($esc =~ /^[0-7]{1,3}\z/) {
1093							$output .= '"'.chr(oct($esc)).'"';
1094							} else {
1095	0	0				0	$output .= "\"$esc\"";
		0
1096	0					0	}
1097							}
1098							else {
1099	0	0				0	print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!";
1100	0					0	}
1101	0	0				0	}
		0
		0
		0
1102	0					0
1103							if (!defined(pos($re))) {
1104	0					0	# no matches
1105							$output .= "\"$re\"";
1106	0					0	# Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)
1107							$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
1108	0					0	}
1109							elsif (pos($re) <= length($re)) {
1110	0					0	$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
1111							$output .= fixup_re(substr($re, pos($re)));
1112							}
1113
1114	0	0				0	$output =~ s/^""/"/; # protect start and end quotes
1115							$output =~ s/(?<!\\)""\z/"/;
1116							$output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
1117							$output =~ s/\\BACKSLASH\\/\\\\/gs;
1118	135	50				361
		0
1119							if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" }
1120	135					677	return $output;
1121							}
1122	135					574
	0					0
1123							1;