File Coverage

lib/Mail/SpamAssassin/Plugin/BodyRuleBaseExtractor.pm

Criterion	Covered	Total	%
statement	381	502	75.9
branch	160	276	57.9
condition	88	141	62.4
subroutine	22	26	84.6
pod	3	14	21.4
total	654	959	68.2

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Copyright 2006 Apache Software Foundation
3							#
4							# Licensed under the Apache License, Version 2.0 (the "License");
5							# you may not use this file except in compliance with the License.
6							# You may obtain a copy of the License at
7							#
8							# http://www.apache.org/licenses/LICENSE-2.0
9							#
10							# Unless required by applicable law or agreed to in writing, software
11							# distributed under the License is distributed on an "AS IS" BASIS,
12							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13							# See the License for the specific language governing permissions and
14							# limitations under the License.
15							# </@LICENSE>
16
17							=head1 NAME
18
19							Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
20
21							=head1 SYNOPSIS
22
23							This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
24							suitable for use in Rule2XSBody rules or other parallel matching algorithms.
25
26							=cut
27
28							package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;
29
30	1			1		9	use Mail::SpamAssassin::Plugin;
	1					6
	1					29
31	1			1		6	use Mail::SpamAssassin::Logger;
	1					2
	1					66
32	1			1		9	use Mail::SpamAssassin::Util qw(untaint_var);
	1					2
	1					51
33	1			1		354	use Mail::SpamAssassin::Util::Progress;
	1					3
	1					48
34
35	1			1		10	use Errno qw(ENOENT EACCES EEXIST);
	1					2
	1					120
36	1			1		7	use Data::Dumper;
	1					3
	1					70
37
38	1			1		7	use strict;
	1					1
	1					19
39	1			1		5	use warnings;
	1					1
	1					24
40							# use bytes;
41	1			1		5	use re 'taint';
	1					2
	1					55
42
43							our @ISA = qw(Mail::SpamAssassin::Plugin);
44
45	1			1		6	use constant DEBUG_RE_PARSING => 0; # noisy!
	1					2
	1					5410
46
47							# a few settings that control what kind of bases are output.
48
49							# treat all rules as lowercase for purposes of term extraction?
50							# $main->{bases_must_be_casei} = 1;
51							# $main->{bases_can_use_alternations} = 0; # /(foo\|bar\|baz)/
52							# $main->{bases_can_use_quantifiers} = 0; # /foo.bar/ or /foobar/ or /foooo?bar/
53							# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
54							# $main->{bases_split_out_alternations} = 1; # /(foo\|bar\|baz)/ => ["foo", "bar", "baz"]
55							# $main->{base_quiet} = 0; # silences progress output
56
57							# TODO: it would be nice to have a clean API to pass such settings
58							# through to plugins instead of hanging them off $main
59
60							##############################################################################
61
62							# testing purposes only
63							my $fixup_re_test;
64							#$fixup_re_test = 1; fixup_re("fr()\|\\\\\|"); die;
65							#$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die;
66							#$fixup_re_test = 1; fixup_re("\\33\$b"); die;
67							#$fixup_re_test = 1; fixup_re("[link]"); die;
68							#$fixup_re_test = 1; fixup_re("please do not resend your original message."); die;
69
70							###########################################################################
71
72							sub new {
73	15			15	1	115	my $class = shift;
74	15					97	my $mailsaobject = shift;
75	15		33			135	$class = ref($class) \|\| $class;
76	15					89	my $self = $class->SUPER::new($mailsaobject);
77	15					51	bless ($self, $class);
78
79	15					118	$self->{show_progress} = !$mailsaobject->{base_quiet};
80
81							# $self->test(); exit;
82	15					180	return $self;
83							}
84
85							###########################################################################
86
87							sub finish_parsing_end {
88	15			15	1	53	my ($self, $params) = @_;
89	15					40	my $conf = $params->{conf};
90	15					49	$self->extract_bases($conf);
91							}
92
93							sub extract_bases {
94	15			15	0	32	my ($self, $conf) = @_;
95
96	15					31	my $main = $conf->{main};
97	15	50				44	if (!$main->{base_extract}) { return; }
	0					0
98
99							$self->{show_progress} and
100	15	50				50	info("base extraction starting. this can take a while...");
101
102	15					46	$self->extract_set($conf, $conf->{body_tests}, 'body');
103							}
104
105							sub extract_set {
106	15			15	0	41	my ($self, $conf, $test_set, $ruletype) = @_;
107
108	15					24	foreach my $pri (keys %{$test_set}) {
	15					57
109	15					28	my $nicepri = $pri; $nicepri =~ s/-/neg/g;
	15					103
110	15					100	$self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
111							}
112							}
113
114							###########################################################################
115
116							sub extract_set_pri {
117	15			15	0	49	my ($self, $conf, $rules, $ruletype) = @_;
118
119	15					36	my @good_bases;
120							my @failed;
121	15					37	my $yes = 0;
122	15					32	my $no = 0;
123	15					23	my $count = 0;
124	15					60	my $start = time;
125	15					33	$self->{main} = $conf->{main}; # for use in extract_hints()
126	15	50				49	$self->{show_progress} and info ("extracting from rules of type $ruletype");
127	15					37	my $tflags = $conf->{tflags};
128
129							# attempt to find good "base strings" (simplified regexp subsets) for each
130							# regexp. We try looking at the regexp from both ends, since there
131							# may be a good long string of text at the end of the rule.
132
133							# require this many chars in a base string + delimiters for it to be viable
134	15					28	my $min_chars = 5;
135
136	15					19	my $progress;
137							$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
138	15	50	0			44	total => (scalar keys %{$rules} \|\| 1),
139							itemtype => 'rules',
140							});
141
142	15					33	my $cached = { };
143	15					34	my $cachefile;
144
145	15	50				42	if ($self->{main}->{bases_cache_dir}) {
146	0					0	$cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype";
147	0					0	dbg("zoom: reading cache file $cachefile");
148	0					0	$cached = $self->read_cachefile($cachefile);
149							}
150
151							NEXT_RULE:
152	15					36	foreach my $name (keys %{$rules}) {
	15					59
153	42	50	33			320	$self->{show_progress} and $progress and $progress->update(++$count);
154
155	42					181	my $rule = $rules->{$name};
156	42					179	my $cachekey = join "#", $name, $rule;
157
158	42					139	my $cent = $cached->{rule_bases}->{$cachekey};
159	42	50				126	if (defined $cent) {
160	0	0				0	if (defined $cent->{g}) {
161	0					0	dbg("zoom: YES (cached) $rule $name");
162	0					0	foreach my $ent (@{$cent->{g}}) {
	0					0
163							# note: we have to copy these, since otherwise later
164							# modifications corrupt the cached data
165							push @good_bases, {
166							base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name}
167	0					0	};
168							}
169	0					0	$yes++;
170							}
171							else {
172	0					0	dbg("zoom: NO (cached) $rule $name");
173	0					0	push @failed, { orig => $rule }; # no need to cache this
174	0					0	$no++;
175							}
176	0					0	next NEXT_RULE;
177							}
178
179							# ignore ReplaceTags rules
180	42					121	my $is_a_replacetags_rule = $conf->{rules_to_replace}->{$name};
181	42					80	my ($minlen, $lossy, @bases);
182
183	42	50				146	if (!$is_a_replacetags_rule) {
184							eval { # catch die()s
185	42					446	my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
186	41					205	($lossy, @bases) = $self->extract_hints($rule, $qr, $mods);
187							# dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases));
188	40					554	1;
189	42	100				87	} or do {
190	2	50				23	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	2					24
191	2					57	dbg("zoom: giving up on regexp: $eval_stat");
192							};
193
194	42	50	50			903	if ($lossy && ($tflags->{$name}\|\|'') =~ /\bmultiple\b/) {
			66
195	0					0	warn "\nzoom: rule $name will loop on SpamAssassin older than 3.3.2 ".
196							"running under Perl 5.12 or older, Bug 6558\n";
197							}
198
199							# if any of the extracted hints in a set are too short, the entire
200							# set is invalid; this is because each set of N hints represents just
201							# 1 regexp.
202	42					339	foreach my $str (@bases) {
203	135					656	my $len = length fixup_re($str); # bug 6143: count decoded characters
204	135	100	100			766	if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
	3	100				44
	3					34
	3					23
205	62					178	elsif (!defined($minlen) \|\| $len < $minlen) { $minlen = $len; }
206							}
207							}
208
209	42	100	66			1036	if ($is_a_replacetags_rule \|\| !$minlen \|\| !@bases) {
			66
210	5	50				138	dbg("zoom: ignoring rule %s, %s", $name,
		50
211							$is_a_replacetags_rule ? 'is a replace rule'
212							: !@bases ? 'no bases' : 'no minlen');
213	5					80	push @failed, { orig => $rule };
214	5					67	$cached->{rule_bases}->{$cachekey} = { };
215	5					43	$no++;
216							}
217							else {
218							# dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
219
220							# figure out if we have e.g. ["foo", "foob", "foobar"]; in this
221							# case, we only need to track ["foo"].
222	37					120	my %subsumed;
223	37					100	foreach my $base1 (@bases) {
224	132					302	foreach my $base2 (@bases) {
225	2524	100	100			19328	if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
226	4					83	$subsumed{$base1} = 1; # base2 is inside base1; discard the longer
227							}
228							}
229							}
230
231	37					105	my @forcache;
232	37					96	foreach my $base (@bases) {
233	132	100				413	next if $subsumed{$base};
234	128					1326	push @good_bases, {
235							base => $base, orig => $rule, name => "$name,[l=$lossy]"
236							};
237							# separate copies for cache -- we modify the @good_bases entry
238	128					869	push @forcache, {
239							base => $base, orig => $rule, name => "$name,[l=$lossy]"
240							};
241							}
242
243	37					1046	$cached->{rule_bases}->{$cachekey} = { g => \@forcache };
244	37					438	$yes++;
245							}
246							}
247
248	15	50	33			331	$self->{show_progress} and $progress and $progress->final();
249
250	15					862	dbg("zoom: $ruletype: found ".(scalar @good_bases).
251							" usable base strings in $yes rules, skipped $no rules");
252
253							# NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
254							# ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
255							# subsumes other patterns, we need to return hits for all of them. We also
256							# need to take care of the case where multiple regexps wind up sharing the
257							# same base.
258							#
259							# Another gotcha, an exception to the subsumption rule; if one pattern isn't
260							# entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
261							# returned as two hits, correctly. So we only have to be smart about the
262							# full-subsumption case; overlapping is taken care of for us, by re2c.
263							#
264							# TODO: there's a bug here. Since the code in extract_hints() has been
265							# modified to support more complex regexps, we can no longer simply assume
266							# that if pattern A is not contained in pattern B, that means that pattern B
267							# doesn't subsume it. Consider, for example, A="foo*bar" and
268							# B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
269							# that without running the A RE match itself somehow against B.
270							# same issue remains with:
271							#
272							# "foo?bar" / "fobar"
273							# "fo(?:o\|oo\|)bar" / "fobar"
274							# "fo(?:o\|oo)?bar" / "fobar"
275							# "fo(?:o*\|baz)bar" / "fobar"
276							# "(?:fo(?:o*\|baz)bar\|blargh)" / "fobar"
277							#
278							# it's worse with this:
279							#
280							# "fo(?:o\|oo\|)bar" / "foo*bar"
281							#
282							# basically, this is impossible to compute without reimplementing most of
283							# re2c, and it appears the re2c developers don't plan to offer this:
284							# https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
285
286	15					305	$conf->{base_orig}->{$ruletype} = { };
287	15					174	$conf->{base_string}->{$ruletype} = { };
288
289	15					53	$count = 0;
290	15	50	0			56	$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
291							total => (scalar @good_bases \|\| 1),
292							itemtype => 'bases',
293							});
294
295							# this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases
296							# array -- into a more efficient format, using arrays and with a little
297							# bit of precomputation, to go (quite a bit) faster
298
299	15					62	my @rewritten;
300	15					42	foreach my $set1 (@good_bases) {
301	128					376	my $base = $set1->{base};
302	128	50	33			611	next if (!$base \|\| !$set1->{name});
303							push @rewritten, [
304							$base, # 0
305							$set1->{name}, # 1
306							$set1->{orig}, # 2
307	128					2275	length $base, # 3
308							qr/\Q$base\E/, # 4
309							0 # 5, has_multiple flag
310							];
311							}
312	15					226	@good_bases = @rewritten;
313
314	15					82	foreach my $set1 (@good_bases) {
315	128	50	33			501	$self->{show_progress} and $progress and $progress->update(++$count);
316
317	128	100				271	my $base1 = $set1->[0]; next unless $base1;
	128					273
318	104					226	my $name1 = $set1->[1];
319	104					210	my $orig1 = $set1->[2];
320	104					470	$conf->{base_orig}->{$ruletype}->{$name1} = $orig1;
321	104					316	my $len1 = $set1->[3];
322
323	104					233	foreach my $set2 (@good_bases) {
324	3247	100				6224	next if ($set1 == $set2);
325
326	3143	100				4737	my $base2 = $set2->[0]; next unless $base2;
	3143					5372
327	2660					3895	my $name2 = $set2->[1];
328
329							# clobber exact dups; this can happen if a regexp outputs the
330							# same base string multiple times
331	2660	100	100			5090	if ($base1 eq $base2 &&
			66
332							$name1 eq $name2 &&
333							$orig1 eq $set2->[2])
334							{
335	24					82	$set2->[0] = ''; # clobber
336	24					56	next;
337							}
338
339							# skip if it's too short to contain the other base string
340	2636	100				4776	next if ($len1 < $set2->[3]);
341
342							# skip if either already contains the other rule's name
343							# optimize: this can only happen if the base has more than
344							# one rule already attached, ie [5]
345	1510	100	100			3813	next if ($set2->[5] && $name2 =~ /(?: \|^)\Q$name1\E(?: \|$)/);
346
347							# don't use $name1 here, since another base in the set2 loop
348							# may have added $name2 since we set that
349	1423	100	100			3781	next if ($set1->[5] && $set1->[1] =~ /(?: \|^)\Q$name2\E(?: \|$)/);
350
351							# and finally check to see if it does contain the other base string
352	1369	100				4749	next if ($base1 !~ $set2->[4]);
353
354							# base2 is just a subset of base1
355							# dbg("zoom: subsuming '$base2' ($name2) into '$base1': [1]=$set1->[1] [5]=$set1->[5]");
356	27					137	$set1->[1] .= " ".$name2;
357	27					103	$set1->[5] = 1;
358							}
359							}
360
361							# we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
362							# both contain "killed" for example, pointing at different rules, which
363							# the above search hasn't found. Collapse them here with a hash
364	15					57	my %bases;
365	15					67	foreach my $set (@good_bases) {
366	128					330	my $base = $set->[0];
367	128	100				330	next unless $base;
368
369	104	100				270	if (defined $bases{$base}) {
370	12					71	$bases{$base} .= " ".$set->[1];
371							} else {
372	92					783	$bases{$base} = $set->[1];
373							}
374							}
375	15					81	undef @good_bases;
376
377	15					100	foreach my $base (keys %bases) {
378							# uniq the list, since there are probably dup rules listed
379	92					200	my %u;
380	92					338	for my $i (split ' ', $bases{$base}) {
381	137	100				322	next if exists $u{$i}; undef $u{$i};
	120					315
382							}
383	92					661	$conf->{base_string}->{$ruletype}->{$base} = join ' ', sort keys %u;
384							}
385	15	50	33			144	$self->{show_progress} and $progress and $progress->final();
386
387	15	50				90	if ($cachefile) {
388	0					0	$self->write_cachefile ($cachefile, $cached);
389							}
390
391	15					55	my $elapsed = time - $start;
392							$self->{show_progress} and info ("$ruletype: ".
393	15	50				1399	(scalar keys %{$conf->{base_string}->{$ruletype}}).
	0					0
394							" base strings extracted in $elapsed seconds\n");
395							}
396
397							###########################################################################
398
399							# TODO:
400							# NO /no.{1,10}P(?:er\|re)scription.{1,10}(?:needed\|require\|necessary)/i
401							# => should extract 'scription' somehow
402							# /time to refinance\|refinanc\w{1,3}\b.{0,16}\bnow\b/i
403							# => should understand alternations; tricky
404
405							sub simplify_and_qr_regexp {
406	42			42	0	137	my $self = shift;
407	42					311	my $rule = shift;
408
409	42					106	my $main = $self->{main};
410	42					940	$rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
411
412							# remove the regexp modifiers, keep for later
413	42					160	my $mods = '';
414	42					453	while ($rule =~ s/^$\?([a-z]*)$//) { $mods .= $1; }
	20					129
415
416							# modifier removal
417	42					135	while ($rule =~ s/^$\?-([a-z]*)$//) {
418	0					0	foreach my $modchar (split '', $mods) {
419	0					0	$mods =~ s/$modchar//g;
420							}
421							}
422
423	42					96	my $lossy = 0;
424
425							# now: simplify aspects of the regexp. Bear in mind that we can
426							# simplify as long as we cause the regexp to become more general;
427							# more hits is OK, since false positives will be discarded afterwards
428							# anyway. Simplification that causes the regexp to not hit
429							# stuff that the "real" rule would hit, however, is a bad thing.
430
431	42	100				262	if ($main->{bases_must_be_casei}) {
432	37					148	$rule = lc $rule;
433
434	37					66	$lossy = 1;
435	37	100				290	$mods =~ s/i// and $lossy = 0;
436
437							# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
438	37	100				384	$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and $lossy++;
439
440							# always case-i: /A(?-i:ct)/ => /Act/
441	37	50				168	$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs and $lossy++;
442
443							# remove (?i)
444	37					86	$rule =~ s/$\?i$//gs;
445							}
446							else {
447	5	50				65	die "case-i" if $rule =~ /$\?i$/;
448	5	100				43	die "case-i" if $mods =~ /i/;
449
450							# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
451	4	50				13	$rule =~ s/(?<!\\)$\?i\:(.*?)$/$1/gs and die "case-i";
452
453							# we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/
454	4					16	$rule =~ s/(?<!\\)$\?-i\:(.*?)$/$1/gs;
455							}
456
457							# remove /m and /s modifiers
458	41	50				122	$mods =~ s/m// and $lossy++;
459	41	100				135	$mods =~ s/s// and $lossy++;
460
461							# remove (^\|\b)'s
462							# T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS\|(^\|\b)EXTI($\|\b))/is
463	41	50				260	$rule =~ s/$\^\\|\\b$//gs and $lossy++;
464	41	50				142	$rule =~ s/$\$\\|\\b$//gs and $lossy++;
465	41	50				116	$rule =~ s/$\\b\\|\^$//gs and $lossy++;
466	41	50				152	$rule =~ s/$\\b\\|\$$//gs and $lossy++;
467
468							# remove (?!credit)
469	41	100				114	$rule =~ s/$\?\![^$]+\)//gs and $lossy++;
470
471							# remove \b's
472	41	100				187	$rule =~ s/(?<!\\)\\b//gs and $lossy++;
473
474							# remove the "?=" trick
475							# (?=[dehklnswxy])(horny\|nasty\|hot\|wild\|young\|....etc...)
476	41					101	$rule =~ s/$\?\=\[[^\]]+\]$//gs;
477
478	41	100				144	$mods .= "L" if $lossy;
479	41					297	($rule, $mods);
480							}
481
482							sub extract_hints {
483	41			41	0	89	my $self = shift;
484	41					275	my $rawrule = shift;
485	41					150	my $rule = shift;
486	41					170	my $mods = shift;
487
488	41					105	my $main = $self->{main};
489	41					185	my $orig = $rule;
490
491	41					109	my $lossy = 0;
492	41	100				257	$mods =~ s/L// and $lossy++;
493
494							# if there are anchors, give up; we can't get much
495							# faster than these anyway
496	41	50				416	die "anchors" if $rule =~ /^\(?(?:\^\|\\A)/;
497
498							# die "anchors" if $rule =~ /(?:\$\|\\Z)\)?$/;
499							# just remove end-of-string anchors; they're slow so could gain
500							# from our speedup
501	41	50				412	$rule =~ s/(?<!\\)(?:\$\|\\Z)\)?$// and $lossy++;
502
503							# simplify (?:..) to (..)
504							$main->{bases_allow_noncapture_groups} or
505	41	50				280	$rule =~ s/\(\?:/\(/g;
506
507							# simplify some grouping arrangements so they're easier for us to parse
508							# (foo)? => (foo\|)
509	41					228	$rule =~ s/$(.*?)$\?/$$1\\|$/gs;
510							# r? => (r\|)
511	41					185	$rule =~ s/(?<!\\)(\w)\?/$$1\\|$/gs;
512
513	41					508	my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
514	41	50				142	$tmpfh or die "failed to create a temporary file";
515	41					173	untaint_var(\$tmpf);
516
517	41	50				586	print $tmpfh "use bytes; m{" . $rule . "}" . $mods
518							or die "error writing to $tmpf: $!";
519	41	50				1631	close $tmpfh or die "error closing $tmpf: $!";
520
521	41					450	my $perl = $self->get_perl();
522	41					196	local *IN;
523	41	50				155031	open (IN, "$perl -c -Mre=debug $tmpf 2>&1 \|")
524							or die "cannot run $perl: ".exit_status_str($?,$!);
525
526	41					704	my($inbuf,$nread,$fullstr); $fullstr = '';
	41					395
527	41					448656	while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf }
	41					1146
528	41	50				221	defined $nread or die "error reading from pipe: $!";
529
530	41	50				3679	unlink $tmpf or die "cannot unlink $tmpf: $!";
531	41	50				1966	close IN or die "error closing pipe: $!";
532	41	50				201	defined $fullstr or warn "empty result from a pipe";
533
534							# now parse the -Mre=debug output.
535							# perl 5.10 format
536	41					2130	$fullstr =~ s/^.*\nFinal program:\n//gs;
537							# perl 5.6/5.8 format
538	41					301	$fullstr =~ s/^(?:.\n\|)size \d[^\n]\n//gs;
539	41					215	$fullstr =~ s/^(?:.\n\|)first at \d[^\n]\n//gs;
540							# common to all
541	41					130	$fullstr =~ s/\nOffsets:.*$//gs;
542
543							# clean up every other line that doesn't start with a space
544	41					1057	$fullstr =~ s/^\S.*$//gm;
545
546	41	50				936	if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
547	0					0	die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule";
548							}
549	41					1121	my $opsstr = $1;
550
551							# what's left looks like this:
552							# 1: EXACTF <v>(3)
553							# 3: ANYOF[1ILil](14)
554							# 14: EXACTF <a>(16)
555							# 16: CURLY {2,7}(29)
556							# 18: ANYOF[A-Za-z](0)
557							# 29: SPACE(30)
558							# 30: EXACTF <http://>(33)
559							# 33: END(0)
560							#
561	41					133	DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
562
563	41					302	my @ops;
564	41					1470	foreach my $op (split(/\n/s, $opsstr)) {
565	439	50				1306	next unless $op;
566
567	439	100				3767	if ($op =~ /^\s+\d+: (\s)([A-Z]\w+)\b(.?)\s*(?:$\d+$)?$/) {
		100
		50
		0
568							# perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18)
569							# perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18)
570	321					4271	push @ops, [ $1, $2, $3 ];
571							}
572							elsif ($op =~ /^ (\s)<(.)>\.\.\.\s*$/) {
573							# 5: TRIE-EXACT[im](44)
574							# <message contained attachments that have been blocked by guin>...
575	4					35	my $spcs = $1;
576							# we could use the entire length here, but it's easier to trim to
577							# the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
578							# suite results will match, since the sa-update --list extraction will
579							# be the same for all versions. (The "..." trailer is important btw)
580	4					38	my $str = substr ($2, 0, 55);
581	4					67	push @ops, [ $spcs, '_moretrie', "<$str...>" ];
582							}
583							elsif ($op =~ /^ (\s)(<.>)\s*(?:$\d+$)?$/) {
584							# 5: TRIE-EXACT[am](21)
585							# <am> (21)
586							# <might> (12)
587	114					859	push @ops, [ $1, '_moretrie', $2 ];
588							}
589							elsif ($op =~ /^ at .+ line \d+$/) {
590	0					0	next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
591							}
592							else {
593	0					0	warn "cannot parse '$op': $opsstr";
594	0					0	next;
595							}
596							}
597
598							# unroll the branches; returns a list of versions.
599							# e.g. /foo(bar\|baz)argh/ => [ "foobarargh", "foobazargh" ]
600	41					196	my @unrolled;
601	41	100				269	if ($main->{bases_split_out_alternations}) {
602	35					668	@unrolled = $self->unroll_branches(0, \@ops);
603							} else {
604	6					58	@unrolled = ( \@ops );
605							}
606
607							# now find the longest DFA-friendly string in each unrolled version
608	41					178	my @longests;
609	41					136	foreach my $opsarray (@unrolled) {
610	136					353	my $longestexact = '';
611	136					328	my $buf = '';
612
613							# use a closure to keep the code succinct
614							my $add_candidate = sub {
615	430	100		430		1017	if (length $buf > length $longestexact) { $longestexact = $buf; }
	159					335
616	430					746	$buf = '';
617	136					1408	};
618
619	136					274	my $prevop;
620	136					234	foreach my $op (@{$opsarray}) {
	136					357
621	635					1152	my ($spcs, $item, $args) = @{$op};
	635					1962
622
623	635	100				1834	next if ($item eq 'NOTHING');
624
625							# EXACT == case-sensitive
626							# EXACTF == case-i
627							# we can do both, since we canonicalize to lc.
628	626	100	100			7674	if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
		100	66
		50	100
		100	66
			66
			100
			66
			33
			33
			0
			66
			100
			66
			66
			33
			33
629							{
630	142					653	my $str = $1;
631	142					335	$buf .= $str;
632	142	100				509	if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) {
633							# a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop
634	1					97	$add_candidate->();
635							}
636	142	50	33			514	if (length $str >= 55 && $buf =~ s/\.\.\.$//) {
637							# perl 5.8.x truncates with a "..." here! cut and stop
638	0					0	$add_candidate->();
639							}
640							}
641							# _moretrie == a TRIE-EXACT entry
642							elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
643							{
644	190					484	$buf .= $1;
645	190	100	66			531	if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
646							# perl 5.8.x truncates with a "..." here! cut and stop
647	4					28	$add_candidate->();
648							}
649							}
650							# /(?:foo\|bar\|baz){2}/ results in a CURLYX beforehand
651							elsif ($item =~ /^EXACT/ &&
652							$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
653							$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
654							$args =~ /<(.*)>/)
655							{
656	0					0	$buf .= $1;
657	0	0	0			0	if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
658							# perl 5.8.x truncates with a "..." here! cut and stop
659	0					0	$add_candidate->();
660							}
661							}
662							# CURLYX, for perl >= 5.9.5
663							elsif ($item =~ /^_moretrie/ &&
664							$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
665							$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
666							$args =~ /<(.*)>/)
667							{
668	5					26	$buf .= $1;
669	5	50	33			34	if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
670							# perl 5.8.x truncates with a "..." here! cut and stop
671	0					0	$add_candidate->();
672							}
673							}
674							else {
675							# not an /^EXACT/; clear the buffer
676	289					821	$add_candidate->();
677	289	100				1458	if ($item !~ /^(?:END\|CLOSE\d\|MINMOD)$/)
678							{
679	173					312	$lossy = 1;
680	173					288	DEBUG_RE_PARSING and warn "item $item makes regexp lossy";
681							}
682							}
683	626					1318	$prevop = $op;
684							}
685	136					479	$add_candidate->();
686
687	136	100				299	if (!$longestexact) {
688	1					102	die "no long-enough string found in $rawrule";
689							# all unrolled versions must have a long string, otherwise
690							# we cannot reliably match all variants of the rule
691							} else {
692	135	100				1265	push @longests, ($main->{bases_must_be_casei}) ?
693							lc $longestexact : $longestexact;
694							}
695							}
696
697	40					108	DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
698	40					2537	return ($lossy, @longests);
699							}
700
701							###########################################################################
702
703							sub unroll_branches {
704	187			187	0	605	my ($self, $depth, $opslist) = @_;
705
706	187	50				530	die "too deep" if ($depth++ > 5);
707
708	187					302	my @ops = (@{$opslist}); # copy
	187					558
709	187					744	my @pre_branch_ops;
710							my $branch_spcs;
711	187					0	my $trie_spcs;
712	187					0	my $open_spcs;
713
714							# our input looks something like this 2-level structure:
715							# 1: BOUND(2)
716							# 2: EXACT <Dear >(5)
717							# 5: BRANCH(9)
718							# 6: EXACT <IT>(8)
719							# 8: NALNUM(24)
720							# 9: BRANCH(23)
721							# 10: EXACT <Int>(12)
722							# 12: BRANCH(14)
723							# 13: NOTHING(21)
724							# 14: BRANCH(17)
725							# 15: EXACT <a>(21)
726							# 17: BRANCH(20)
727							# 18: EXACT <er>(21)
728							# 20: TAIL(21)
729							# 21: EXACT <net>(24)
730							# 23: TAIL(24)
731							# 24: EXACT < shop>(27)
732							# 27: END(0)
733							#
734							# or:
735							#
736							# 1: OPEN1(3)
737							# 3: BRANCH(6)
738							# 4: EXACT <v>(9)
739							# 6: BRANCH(9)
740							# 7: EXACT <\\/>(9)
741							# 9: CLOSE1(11)
742							# 11: CURLY {2,5}(14)
743							# 13: REG_ANY(0)
744							# 14: EXACT < g r a >(17)
745							# 17: ANYOF[a-z](28)
746							# 28: END(0)
747							#
748							# or:
749							#
750							# 1: EXACT <i >(3)
751							# 3: OPEN1(5)
752							# 5: TRIE-EXACT[am](21)
753							# <am> (21)
754							# <might> (12)
755							# 12: OPEN2(14)
756							# 14: TRIE-EXACT[ ](19)
757							# < be>
758							# <>
759							# 19: CLOSE2(21)
760							# 21: CLOSE1(23)
761							# 23: EXACT < c>(25)
762
763	187					323	DEBUG_RE_PARSING and warn "starting parse";
764
765							# this happens for /foo\|bar/ instead of /(?:foo\|bar)/ ; transform
766							# it into the latter. bit of a kludge to do this before the loop, but hey.
767							# note that it doesn't fix the CLOSE1/END ordering to be correct
768	187	50	66			1317	if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
769	0					0	my @newops = ([ "", "OPEN1", "" ]);
770	0					0	foreach my $op (@ops) {
771	0					0	push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
772							}
773	0					0	push @newops, [ "", "CLOSE1", "" ];
774	0					0	@ops = @newops;
775							}
776
777							# iterate until we start a branch set. using
778							# /dkjfksl(foo\|bar(baz\|argh)boo)gab/ as an example, we're at "dkj..."
779							# just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
780							# or a BRANCH, then unroll the most recent OPEN set.
781	187					390	while (1) {
782	968					1652	my $op = shift @ops;
783	968	100				2061	last unless defined $op;
784
785	838					1227	my ($spcs, $item, $args) = @{$op};
	838					2801
786	838					1360	DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
787
788	838	100	100			4602	if ($item =~ /^OPEN/) {
		100
		100
		100
		100
789	54					158	$open_spcs = $spcs;
790	54					127	next; # next will be a BRANCH or TRIE
791
792							} elsif ($item =~ /^TRIE/) {
793	49					121	$trie_spcs = $spcs;
794	49					120	last;
795
796							} elsif ($item =~ /^BRANCH/) {
797	8					90	$branch_spcs = $spcs;
798	8					53	last;
799
800							} elsif ($item =~ /^EXACT/ && defined $open_spcs) {
801							# perl 5.9.5 does this; f(o\|oish) => OPEN, EXACT, TRIE-EXACT
802	7					75	push @pre_branch_ops, [ $open_spcs, $item, $args ];
803	7					34	next;
804
805							} elsif (defined $open_spcs) {
806							# OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
807							# ignore this OPEN block entirely and don't try to unroll it
808	2					23	undef $open_spcs;
809
810							} else {
811	718					1515	push @pre_branch_ops, $op;
812							}
813							}
814
815							# no branches found? we're done unrolling on this one!
816	187	100				537	if (scalar @ops == 0) {
817	130					733	return [ @pre_branch_ops ];
818							}
819
820							# otherwise we're at the start of a new branch set
821							# /(foo\|bar(baz\|argh)boo)gab/
822	57					151	my @alts;
823							my @in_this_branch;
824
825	57					141	DEBUG_RE_PARSING and warn "entering branch: ".
826							"open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
827							"branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
828							"trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
829
830							# indentation level to remove from "normal" ops (using a s///)
831	57	100				319	my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
832	57					181	my $trie_sub_spcs = "";
833	57					129	while (1) {
834	387					683	my $op = shift @ops;
835	387	50				846	last unless defined $op;
836	387					641	my ($spcs, $item, $args) = @{$op};
	387					1155
837	387					648	DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
838
839	387	100	100			4072	if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
		50	66
		100	66
		100	33
		100	100
			100
			100
			66
840	9					59	push @alts, [ @pre_branch_ops, @in_this_branch ];
841	9					38	@in_this_branch = ();
842	9					65	$open_sub_spcs = $branch_spcs." ";
843	9					38	$trie_sub_spcs = "";
844	9					35	next;
845							}
846							elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
847	0					0	push @alts, [ @pre_branch_ops, @in_this_branch ];
848	0					0	undef $branch_spcs;
849	0					0	$open_sub_spcs = "";
850	0					0	$trie_sub_spcs = "";
851	0					0	last;
852							}
853							elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
854	135	100				387	if (scalar @in_this_branch > 0) {
855	86					355	push @alts, [ @pre_branch_ops, @in_this_branch ];
856							}
857							# use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
858	135					679	@in_this_branch = ( [ $open_spcs, $item, $args ] );
859	135	50				331	$open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
860	135					311	$trie_sub_spcs = " ";
861	135					261	next;
862							}
863							elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
864	52					215	push @alts, [ @pre_branch_ops, @in_this_branch ];
865	52					165	undef $branch_spcs;
866	52					121	undef $open_spcs;
867	52					97	undef $trie_spcs;
868	52					90	$open_sub_spcs = "";
869	52					111	$trie_sub_spcs = "";
870	52					116	last;
871							}
872							elsif ($item eq 'END') { # of string
873	5					29	push @alts, [ @pre_branch_ops, @in_this_branch ];
874	5					24	undef $branch_spcs;
875	5					16	undef $open_spcs;
876	5					16	undef $trie_spcs;
877	5					19	$open_sub_spcs = "";
878	5					29	$trie_sub_spcs = "";
879	5					22	last;
880							}
881							else {
882	186	50				467	if ($open_sub_spcs) {
883							# deindent the space-level to match the opening brace
884	186					1151	$spcs =~ s/^$open_sub_spcs//;
885							# tries also add one more indent level in
886	186					906	$spcs =~ s/^$trie_sub_spcs//;
887							}
888	186					796	push @in_this_branch, [ $spcs, $item, $args ];
889							# note that we ignore ops at a deeper $spcs level entirely (until later!)
890							}
891							}
892
893	57	50				203	if (defined $branch_spcs) {
894	0					0	die "fell off end of string with a branch open: '$branch_spcs'";
895							}
896
897							# we're now after the branch set: /gab/
898							# @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz\|argh)boo/ ]
899	57					257	foreach my $alt (@alts) {
900	152					279	push @{$alt}, @ops; # add all remaining ops to each one
	152					435
901							# note that this could include more (?:...); we don't care, since
902							# those can be handled by recursing
903							}
904
905							# ok, parsed the entire ops list
906							# @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz\|argh)boogab/ ]
907
908	57					114	if (DEBUG_RE_PARSING) {
909							print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
910							}
911
912							# now recurse, to unroll the remaining branches (if any exist)
913	57					134	my @rets;
914	57					155	foreach my $alt (@alts) {
915	152					745	push @rets, $self->unroll_branches($depth, $alt);
916							}
917
918	57					125	if (DEBUG_RE_PARSING) {
919							print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
920							}
921
922	57					413	return @rets;
923							}
924
925							###########################################################################
926
927							sub test {
928	0			0	0	0	my ($self) = @_;
929
930	0					0	$self->test_split_alt("foo", "/foo/");
931	0					0	$self->test_split_alt("(foo)", "/foo/");
932	0					0	$self->test_split_alt("foo(bar)baz", "/foobarbaz/");
933	0					0	$self->test_split_alt("x(foo\|)", "/xfoo/ /x/");
934	0					0	$self->test_split_alt("fo(o\|)", "/foo/ /fo/");
935	0					0	$self->test_split_alt("(foo\|bar)", "/foo/ /bar/");
936	0					0	$self->test_split_alt("foo\|bar", "/foo/ /bar/");
937	0					0	$self->test_split_alt("foo (bar\|baz) argh", "/foo bar argh/ /foo baz argh/");
938	0					0	$self->test_split_alt("foo (bar\|baz\|bl(arg\|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
939	0					0	$self->test_split_alt("(s(otc\|tco)k)", "/sotck/ /stcok/");
940	0					0	$self->test_split_alt("(business partner(s\|ship\|)\|silent partner(s\|ship\|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
941							}
942
943							sub test_split_alt {
944	0			0	0	0	my ($self, $in, $out) = @_;
945
946	0					0	my @got = $self->split_alt($in);
947	0					0	$out =~ s/^\///;
948	0					0	$out =~ s/\/$//;
949	0					0	my @want = split(/\/ \//, $out);
950
951	0					0	my $failed = 0;
952	0	0				0	if (scalar @want != scalar @got) {
953	0					0	warn "FAIL: results count don't match";
954	0					0	$failed++;
955							}
956							else {
957	0					0	my %got = map { $_ => 1 } @got;
	0					0
958	0					0	foreach my $w (@want) {
959	0	0				0	if (!$got{$w}) {
960	0					0	warn "FAIL: '$w' not found";
961	0					0	$failed++;
962							}
963							}
964							}
965
966	0	0				0	if ($failed) {
967	0	0				0	print "want: /".join('/ /', @want)."/\n" or die "error writing: $!";
968	0	0				0	print "got: /".join('/ /', @got)."/\n" or die "error writing: $!";
969	0					0	return 0;
970							} else {
971	0	0				0	print "ok\n" or die "error writing: $!";
972	0					0	return 1;
973							}
974							}
975
976							###########################################################################
977
978							sub get_perl {
979	41			41	0	132	my ($self) = @_;
980	41					85	my $perl;
981
982							# allow user override of the perl interpreter to use when
983							# extracting base strings.
984							# TODO: expose this via sa-compile command-line option
985	41					173	my $fromconf = $self->{main}->{conf}->{re_parser_perl};
986
987	41	50				493	if ($fromconf) {
		50
988	0					0	$perl = $fromconf;
989							} elsif ($^X =~ m\|^/\|) {
990	41					116	$perl = $^X;
991							} else {
992	1			1		12	use Config;
	1					9
	1					1126
993	0					0	$perl = $Config{perlpath};
994	0					0	$perl =~ s\|/[^/]*$\|/$^X\|;
995							}
996	41					266	untaint_var(\$perl);
997	41					116	return $perl;
998							}
999
1000							###########################################################################
1001
1002							sub read_cachefile {
1003	0			0	0	0	my ($self, $cachefile) = @_;
1004	0					0	local *IN;
1005	0	0				0	if (open(IN, "<".$cachefile)) {
1006	0					0	my($inbuf,$nread,$str); $str = '';
	0					0
1007	0					0	while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf }
	0					0
1008	0	0				0	defined $nread or die "error reading from $cachefile: $!";
1009	0	0				0	close IN or die "error closing $cachefile: $!";
1010
1011	0					0	untaint_var(\$str);
1012	0					0	my $VAR1; # Data::Dumper
1013	0	0				0	if (eval $str) {
1014	0					0	return $VAR1; # Data::Dumper's naming
1015							}
1016							}
1017	0					0	return { };
1018							}
1019
1020							sub write_cachefile {
1021	0			0	0	0	my ($self, $cachefile, $cached) = @_;
1022
1023	0					0	my $dump = Data::Dumper->new ([ $cached ]);
1024	0					0	$dump->Deepcopy(1);
1025	0					0	$dump->Purity(1);
1026	0					0	$dump->Indent(1);
1027	0	0				0	if (mkdir($self->{main}->{bases_cache_dir})) {
		0
1028							# successfully created
1029							} elsif ($! == EEXIST) {
1030	0					0	dbg("zoom: ok, cache directory already existed");
1031							} else {
1032	0					0	warn "cannot create a directory: $!";
1033							}
1034	0	0				0	open(CACHE, ">$cachefile") or warn "cannot write to $cachefile";
1035	0	0				0	print CACHE ($dump->Dump, ";1;") or die "error writing: $!";
1036	0	0				0	close CACHE or die "error closing $cachefile: $!";
1037							}
1038
1039							=over 4
1040
1041							=item my ($cleanregexp) = fixup_re($regexp);
1042
1043							Converts encoded characters in a regular expression pattern into their
1044							equivalent characters
1045
1046							=back
1047
1048							=cut
1049
1050							sub fixup_re {
1051	135			135	1	372	my $re = shift;
1052
1053	135	0				298	if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" }
	0	50				0
1054
1055	135					562	my $output = "";
1056	135					1373	my $TOK = qr([\"\\]);
1057
1058	135					396	my $STATE;
1059	135					1029	local ($1,$2);
1060	135					1398	while ($re =~ /\G(.*?)($TOK)/gcs) {
1061	0					0	my $pre = $1;
1062	0					0	my $tok = $2;
1063
1064	0	0				0	if (length($pre)) {
1065	0					0	$output .= "\"$pre\"";
1066							}
1067
1068	0	0				0	if ($tok eq '"') {
		0
1069	0					0	$output .= '"\\""';
1070							}
1071							elsif ($tok eq '\\') {
1072	0	0				0	$re =~ /\G(x\{[^\}]+\}\|[0-7]{1,3}\|.)/gcs or die "\\ at end of string!";
1073	0					0	my $esc = $1;
1074	0	0				0	if ($esc eq '"') {
		0
		0
		0
1075	0					0	$output .= '"\\""';
1076							} elsif ($esc eq '\\') {
1077	0					0	$output .= '"BACKSLASH"'; # avoid hairy escape-parsing
1078							} elsif ($esc =~ /^x\{(\S+)\}\z/) {
1079	0					0	$output .= '"'.chr(hex($1)).'"';
1080							} elsif ($esc =~ /^[0-7]{1,3}\z/) {
1081	0					0	$output .= '"'.chr(oct($esc)).'"';
1082							} else {
1083	0					0	$output .= "\"$esc\"";
1084							}
1085							}
1086							else {
1087	0	0				0	print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!";
1088							}
1089							}
1090
1091	135	50				398	if (!defined(pos($re))) {
		0
1092							# no matches
1093	135					673	$output .= "\"$re\"";
1094							# Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side)
1095	135					529	$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
	0					0
1096							}
1097							elsif (pos($re) <= length($re)) {
1098	0					0	$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse;
	0					0
1099	0					0	$output .= fixup_re(substr($re, pos($re)));
1100							}
1101
1102	135					298	$output =~ s/^""/"/; # protect start and end quotes
1103	135					348	$output =~ s/(?<!\\)""\z/"/;
1104	135					266	$output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef"
1105	135					243	$output =~ s/\\BACKSLASH\\/\\\\/gs;
1106
1107	135	0				314	if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" }
	0	50				0
1108	135					604	return $output;
1109							}
1110
1111							1;