| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# <@LICENSE> |
|
2
|
|
|
|
|
|
|
# Copyright 2006 Apache Software Foundation |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License"); |
|
5
|
|
|
|
|
|
|
# you may not use this file except in compliance with the License. |
|
6
|
|
|
|
|
|
|
# You may obtain a copy of the License at |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
|
11
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
|
12
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
13
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
|
14
|
|
|
|
|
|
|
# limitations under the License. |
|
15
|
|
|
|
|
|
|
# </@LICENSE> |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 NAME |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
This is a plugin to extract "base" strings from SpamAssassin 'body' rules, |
|
24
|
|
|
|
|
|
|
suitable for use in Rule2XSBody rules or other parallel matching algorithms. |
|
25
|
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=cut |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor; |
|
29
|
|
|
|
|
|
|
|
|
30
|
1
|
|
|
1
|
|
8
|
use Mail::SpamAssassin::Plugin; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
31
|
|
|
31
|
1
|
|
|
1
|
|
7
|
use Mail::SpamAssassin::Logger; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
81
|
|
|
32
|
1
|
|
|
1
|
|
7
|
use Mail::SpamAssassin::Util qw(untaint_var qr_to_string); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
83
|
|
|
33
|
1
|
|
|
1
|
|
299
|
use Mail::SpamAssassin::Util::Progress; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
47
|
|
|
34
|
|
|
|
|
|
|
|
|
35
|
1
|
|
|
1
|
|
8
|
use Errno qw(ENOENT EACCES EEXIST); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
104
|
|
|
36
|
1
|
|
|
1
|
|
792
|
use Data::Dumper; |
|
|
1
|
|
|
|
|
6562
|
|
|
|
1
|
|
|
|
|
187
|
|
|
37
|
|
|
|
|
|
|
|
|
38
|
1
|
|
|
1
|
|
12
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
26
|
|
|
39
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
35
|
|
|
40
|
|
|
|
|
|
|
# use bytes; |
|
41
|
1
|
|
|
1
|
|
5
|
use re 'taint'; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
37
|
|
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Not a constant hashref for 5.6 compat |
|
44
|
1
|
|
|
1
|
|
5
|
use constant SLOT_BASE => 0; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
76
|
|
|
45
|
1
|
|
|
1
|
|
7
|
use constant SLOT_NAME => 1; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
40
|
|
|
46
|
1
|
|
|
1
|
|
5
|
use constant SLOT_ORIG => 2; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
36
|
|
|
47
|
1
|
|
|
1
|
|
5
|
use constant SLOT_LEN_BASE => 3; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
36
|
|
|
48
|
1
|
|
|
1
|
|
5
|
use constant SLOT_BASE_INITIAL => 4; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
52
|
|
|
49
|
1
|
|
|
1
|
|
7
|
use constant SLOT_HAS_MULTIPLE => 5; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
42
|
|
|
50
|
|
|
|
|
|
|
|
|
51
|
1
|
|
|
1
|
|
5
|
use constant CLOBBER => ''; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
109
|
|
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
our @ISA = qw(Mail::SpamAssassin::Plugin); |
|
54
|
|
|
|
|
|
|
|
|
55
|
1
|
|
|
1
|
|
7
|
use constant DEBUG_RE_PARSING => 0; # noisy! |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
5428
|
|
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# a few settings that control what kind of bases are output. |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# treat all rules as lowercase for purposes of term extraction? |
|
60
|
|
|
|
|
|
|
# $main->{bases_must_be_casei} = 1; |
|
61
|
|
|
|
|
|
|
# $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/ |
|
62
|
|
|
|
|
|
|
# $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/ |
|
63
|
|
|
|
|
|
|
# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/ |
|
64
|
|
|
|
|
|
|
# $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"] |
|
65
|
|
|
|
|
|
|
# $main->{base_quiet} = 0; # silences progress output |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# TODO: it would be nice to have a clean API to pass such settings |
|
68
|
|
|
|
|
|
|
# through to plugins instead of hanging them off $main |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
############################################################################## |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# testing purposes only |
|
73
|
|
|
|
|
|
|
my $fixup_re_test; |
|
74
|
|
|
|
|
|
|
#$fixup_re_test = 1; fixup_re("fr()|\\\\|"); die; |
|
75
|
|
|
|
|
|
|
#$fixup_re_test = 1; fixup_re("\\x{1b}\$b"); die; |
|
76
|
|
|
|
|
|
|
#$fixup_re_test = 1; fixup_re("\\33\$b"); die; |
|
77
|
|
|
|
|
|
|
#$fixup_re_test = 1; fixup_re("[link]"); die; |
|
78
|
|
|
|
|
|
|
#$fixup_re_test = 1; fixup_re("please do not resend your original message."); die; |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
########################################################################### |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
sub new { |
|
83
|
15
|
|
|
15
|
1
|
100
|
my $class = shift; |
|
84
|
15
|
|
|
|
|
88
|
my $mailsaobject = shift; |
|
85
|
15
|
|
33
|
|
|
204
|
$class = ref($class) || $class; |
|
86
|
15
|
|
|
|
|
91
|
my $self = $class->SUPER::new($mailsaobject); |
|
87
|
15
|
|
|
|
|
31
|
bless ($self, $class); |
|
88
|
|
|
|
|
|
|
|
|
89
|
15
|
|
|
|
|
116
|
$self->{show_progress} = !$mailsaobject->{base_quiet}; |
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# $self->test(); exit; |
|
92
|
15
|
|
|
|
|
224
|
return $self; |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
########################################################################### |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub finish_parsing_end { |
|
98
|
15
|
|
|
15
|
1
|
46
|
my ($self, $params) = @_; |
|
99
|
15
|
|
|
|
|
38
|
my $conf = $params->{conf}; |
|
100
|
15
|
|
|
|
|
60
|
$self->extract_bases($conf); |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
sub extract_bases { |
|
104
|
15
|
|
|
15
|
0
|
44
|
my ($self, $conf) = @_; |
|
105
|
|
|
|
|
|
|
|
|
106
|
15
|
|
|
|
|
40
|
my $main = $conf->{main}; |
|
107
|
15
|
50
|
|
|
|
45
|
if (!$main->{base_extract}) { return; } |
|
|
0
|
|
|
|
|
0
|
|
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
$self->{show_progress} and |
|
110
|
15
|
50
|
|
|
|
41
|
info("base extraction starting. this can take a while..."); |
|
111
|
|
|
|
|
|
|
|
|
112
|
15
|
|
|
|
|
179
|
$self->extract_set($conf, $conf->{body_tests}, 'body'); |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub extract_set { |
|
116
|
15
|
|
|
15
|
0
|
136
|
my ($self, $conf, $test_set, $ruletype) = @_; |
|
117
|
|
|
|
|
|
|
|
|
118
|
15
|
|
|
|
|
49
|
foreach my $pri (keys %{$test_set}) { |
|
|
15
|
|
|
|
|
106
|
|
|
119
|
15
|
|
|
|
|
45
|
my $nicepri = $pri; $nicepri =~ s/-/neg/g; |
|
|
15
|
|
|
|
|
133
|
|
|
120
|
15
|
|
|
|
|
167
|
$self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri); |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
} |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
########################################################################### |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub extract_set_pri { |
|
127
|
15
|
|
|
15
|
0
|
54
|
my ($self, $conf, $rules, $ruletype) = @_; |
|
128
|
|
|
|
|
|
|
|
|
129
|
15
|
|
|
|
|
34
|
my @good_bases; |
|
130
|
|
|
|
|
|
|
my @failed; |
|
131
|
15
|
|
|
|
|
26
|
my $yes = 0; |
|
132
|
15
|
|
|
|
|
23
|
my $no = 0; |
|
133
|
15
|
|
|
|
|
21
|
my $count = 0; |
|
134
|
15
|
|
|
|
|
31
|
my $start = time; |
|
135
|
15
|
|
|
|
|
40
|
$self->{main} = $conf->{main}; # for use in extract_hints() |
|
136
|
15
|
50
|
|
|
|
50
|
$self->{show_progress} and info ("extracting from rules of type $ruletype"); |
|
137
|
15
|
|
|
|
|
29
|
my $tflags = $conf->{tflags}; |
|
138
|
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
# attempt to find good "base strings" (simplified regexp subsets) for each |
|
140
|
|
|
|
|
|
|
# regexp. We try looking at the regexp from both ends, since there |
|
141
|
|
|
|
|
|
|
# may be a good long string of text at the end of the rule. |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# require this many chars in a base string + delimiters for it to be viable |
|
144
|
15
|
|
|
|
|
26
|
my $min_chars = 5; |
|
145
|
|
|
|
|
|
|
|
|
146
|
15
|
|
|
|
|
20
|
my $progress; |
|
147
|
|
|
|
|
|
|
$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({ |
|
148
|
15
|
50
|
0
|
|
|
36
|
total => (scalar keys %{$rules} || 1), |
|
149
|
|
|
|
|
|
|
itemtype => 'rules', |
|
150
|
|
|
|
|
|
|
}); |
|
151
|
|
|
|
|
|
|
|
|
152
|
15
|
|
|
|
|
33
|
my $cached = { }; |
|
153
|
15
|
|
|
|
|
42
|
my $cachefile; |
|
154
|
|
|
|
|
|
|
|
|
155
|
15
|
50
|
|
|
|
51
|
if ($self->{main}->{bases_cache_dir}) { |
|
156
|
0
|
|
|
|
|
0
|
$cachefile = $self->{main}->{bases_cache_dir}."/rules.$ruletype"; |
|
157
|
0
|
|
|
|
|
0
|
dbg("zoom: reading cache file $cachefile"); |
|
158
|
0
|
|
|
|
|
0
|
$cached = $self->read_cachefile($cachefile); |
|
159
|
|
|
|
|
|
|
} |
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
NEXT_RULE: |
|
162
|
15
|
|
|
|
|
30
|
foreach my $name (keys %{$rules}) { |
|
|
15
|
|
|
|
|
134
|
|
|
163
|
42
|
50
|
33
|
|
|
341
|
$self->{show_progress} and $progress and $progress->update(++$count); |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
#my $rule = $rules->{$name}; |
|
166
|
42
|
|
|
|
|
824
|
my $rule = qr_to_string($conf->{test_qrs}->{$name}); |
|
167
|
42
|
50
|
|
|
|
123
|
if (!defined $rule) { |
|
168
|
0
|
|
|
|
|
0
|
die "zoom: error: regexp for $rule not found\n"; |
|
169
|
|
|
|
|
|
|
} |
|
170
|
42
|
|
|
|
|
247
|
my $cachekey = $name.'#'.$rule; |
|
171
|
|
|
|
|
|
|
|
|
172
|
42
|
|
|
|
|
213
|
my $cent = $cached->{rule_bases}->{$cachekey}; |
|
173
|
42
|
50
|
|
|
|
129
|
if (defined $cent) { |
|
174
|
0
|
0
|
|
|
|
0
|
if (defined $cent->{g}) { |
|
175
|
0
|
|
|
|
|
0
|
dbg("zoom: YES (cached) $rule $name"); |
|
176
|
0
|
|
|
|
|
0
|
foreach my $ent (@{$cent->{g}}) { |
|
|
0
|
|
|
|
|
0
|
|
|
177
|
|
|
|
|
|
|
# note: we have to copy these, since otherwise later |
|
178
|
|
|
|
|
|
|
# modifications corrupt the cached data |
|
179
|
|
|
|
|
|
|
push @good_bases, { |
|
180
|
|
|
|
|
|
|
base => $ent->{base}, orig => $ent->{orig}, name => $ent->{name} |
|
181
|
0
|
|
|
|
|
0
|
}; |
|
182
|
|
|
|
|
|
|
} |
|
183
|
0
|
|
|
|
|
0
|
$yes++; |
|
184
|
|
|
|
|
|
|
} |
|
185
|
|
|
|
|
|
|
else { |
|
186
|
0
|
|
|
|
|
0
|
dbg("zoom: NO (cached) $rule $name"); |
|
187
|
0
|
|
|
|
|
0
|
push @failed, { orig => $rule }; # no need to cache this |
|
188
|
0
|
|
|
|
|
0
|
$no++; |
|
189
|
|
|
|
|
|
|
} |
|
190
|
0
|
|
|
|
|
0
|
next NEXT_RULE; |
|
191
|
|
|
|
|
|
|
} |
|
192
|
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
# ignore ReplaceTags rules |
|
194
|
42
|
|
|
|
|
108
|
my $is_a_replacetags_rule = $conf->{replace_rules}->{$name}; |
|
195
|
42
|
|
|
|
|
95
|
my ($minlen, $lossy, @bases); |
|
196
|
|
|
|
|
|
|
|
|
197
|
42
|
50
|
|
|
|
124
|
if (!$is_a_replacetags_rule) { |
|
198
|
|
|
|
|
|
|
eval { # catch die()s |
|
199
|
42
|
|
|
|
|
439
|
my ($qr, $mods) = $self->simplify_and_qr_regexp($rule); |
|
200
|
41
|
|
|
|
|
180
|
($lossy, @bases) = $self->extract_hints($rule, $qr, $mods); |
|
201
|
|
|
|
|
|
|
# dbg("zoom: %s %s -> %s", $name, $rule, join(", ", @bases)); |
|
202
|
40
|
|
|
|
|
496
|
1; |
|
203
|
42
|
100
|
|
|
|
106
|
} or do { |
|
204
|
2
|
50
|
|
|
|
22
|
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; |
|
|
2
|
|
|
|
|
8
|
|
|
205
|
2
|
|
|
|
|
47
|
dbg("zoom: giving up on regexp: $eval_stat"); |
|
206
|
|
|
|
|
|
|
}; |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
#if ($lossy && ($tflags->{$name}||'') =~ /\bmultiple\b/) { |
|
209
|
|
|
|
|
|
|
# warn "\nzoom: $vers rule $name will loop on SpamAssassin older than 3.3.2 ". |
|
210
|
|
|
|
|
|
|
# "running under Perl 5.12 or older, Bug 6558\n"; |
|
211
|
|
|
|
|
|
|
#} |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# if any of the extracted hints in a set are too short, the entire |
|
214
|
|
|
|
|
|
|
# set is invalid; this is because each set of N hints represents just |
|
215
|
|
|
|
|
|
|
# 1 regexp. |
|
216
|
42
|
|
|
|
|
360
|
foreach my $str (@bases) { |
|
217
|
135
|
|
|
|
|
787
|
my $len = length fixup_re($str); # bug 6143: count decoded characters |
|
218
|
135
|
100
|
100
|
|
|
1447
|
if ($len < $min_chars) { $minlen = undef; @bases = (); last; } |
|
|
3
|
100
|
|
|
|
46
|
|
|
|
3
|
|
|
|
|
18
|
|
|
|
3
|
|
|
|
|
21
|
|
|
219
|
62
|
|
|
|
|
311
|
elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; } |
|
220
|
|
|
|
|
|
|
} |
|
221
|
|
|
|
|
|
|
} |
|
222
|
|
|
|
|
|
|
|
|
223
|
42
|
100
|
66
|
|
|
915
|
if ($is_a_replacetags_rule || !$minlen || !@bases) { |
|
|
|
|
66
|
|
|
|
|
|
224
|
5
|
50
|
|
|
|
150
|
dbg("zoom: ignoring rule %s, %s", $name, |
|
|
|
50
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
$is_a_replacetags_rule ? 'is a replace rule' |
|
226
|
|
|
|
|
|
|
: !@bases ? 'no bases' : 'no minlen'); |
|
227
|
5
|
|
|
|
|
55
|
push @failed, { orig => $rule }; |
|
228
|
5
|
|
|
|
|
45
|
$cached->{rule_bases}->{$cachekey} = { }; |
|
229
|
5
|
|
|
|
|
31
|
$no++; |
|
230
|
|
|
|
|
|
|
} |
|
231
|
|
|
|
|
|
|
else { |
|
232
|
|
|
|
|
|
|
# dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>"); |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
# figure out if we have e.g. ["foo", "foob", "foobar"]; in this |
|
235
|
|
|
|
|
|
|
# case, we only need to track ["foo"]. |
|
236
|
37
|
|
|
|
|
96
|
my %subsumed; |
|
237
|
37
|
|
|
|
|
85
|
foreach my $base1 (@bases) { |
|
238
|
132
|
|
|
|
|
431
|
foreach my $base2 (@bases) { |
|
239
|
2524
|
100
|
100
|
|
|
22737
|
if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) { |
|
240
|
4
|
|
|
|
|
77
|
$subsumed{$base1} = 1; # base2 is inside base1; discard the longer |
|
241
|
|
|
|
|
|
|
} |
|
242
|
|
|
|
|
|
|
} |
|
243
|
|
|
|
|
|
|
} |
|
244
|
|
|
|
|
|
|
|
|
245
|
37
|
|
|
|
|
241
|
my @forcache; |
|
246
|
37
|
|
|
|
|
109
|
foreach my $base (@bases) { |
|
247
|
132
|
100
|
|
|
|
580
|
next if $subsumed{$base}; |
|
248
|
128
|
|
|
|
|
1475
|
push @good_bases, { |
|
249
|
|
|
|
|
|
|
base => $base, orig => $rule, name => "$name,[l=$lossy]" |
|
250
|
|
|
|
|
|
|
}; |
|
251
|
|
|
|
|
|
|
# *separate* copies for cache -- we modify the @good_bases entry |
|
252
|
128
|
|
|
|
|
961
|
push @forcache, { |
|
253
|
|
|
|
|
|
|
base => $base, orig => $rule, name => "$name,[l=$lossy]" |
|
254
|
|
|
|
|
|
|
}; |
|
255
|
|
|
|
|
|
|
} |
|
256
|
|
|
|
|
|
|
|
|
257
|
37
|
|
|
|
|
512
|
$cached->{rule_bases}->{$cachekey} = { g => \@forcache }; |
|
258
|
37
|
|
|
|
|
314
|
$yes++; |
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
|
|
262
|
15
|
50
|
33
|
|
|
259
|
$self->{show_progress} and $progress and $progress->final(); |
|
263
|
|
|
|
|
|
|
|
|
264
|
15
|
|
|
|
|
558
|
dbg("zoom: $ruletype: found ".(scalar @good_bases). |
|
265
|
|
|
|
|
|
|
" usable base strings in $yes rules, skipped $no rules"); |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
# NOTE: re2c will attempt to provide the longest pattern that matched; e.g. |
|
268
|
|
|
|
|
|
|
# ("food" =~ "foo" / "food") will return "food". So therefore if a pattern |
|
269
|
|
|
|
|
|
|
# subsumes other patterns, we need to return hits for all of them. We also |
|
270
|
|
|
|
|
|
|
# need to take care of the case where multiple regexps wind up sharing the |
|
271
|
|
|
|
|
|
|
# same base. |
|
272
|
|
|
|
|
|
|
# |
|
273
|
|
|
|
|
|
|
# Another gotcha, an exception to the subsumption rule; if one pattern isn't |
|
274
|
|
|
|
|
|
|
# entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be |
|
275
|
|
|
|
|
|
|
# returned as two hits, correctly. So we only have to be smart about the |
|
276
|
|
|
|
|
|
|
# full-subsumption case; overlapping is taken care of for us, by re2c. |
|
277
|
|
|
|
|
|
|
# |
|
278
|
|
|
|
|
|
|
# TODO: there's a bug here. Since the code in extract_hints() has been |
|
279
|
|
|
|
|
|
|
# modified to support more complex regexps, we can no longer simply assume |
|
280
|
|
|
|
|
|
|
# that if pattern A is not contained in pattern B, that means that pattern B |
|
281
|
|
|
|
|
|
|
# doesn't subsume it. Consider, for example, A="foo*bar" and |
|
282
|
|
|
|
|
|
|
# B="morefobarry"; A is indeed subsumed by B, but we won't be able to test |
|
283
|
|
|
|
|
|
|
# that without running the A RE match itself somehow against B. |
|
284
|
|
|
|
|
|
|
# same issue remains with: |
|
285
|
|
|
|
|
|
|
# |
|
286
|
|
|
|
|
|
|
# "foo?bar" / "fobar" |
|
287
|
|
|
|
|
|
|
# "fo(?:o|oo|)bar" / "fobar" |
|
288
|
|
|
|
|
|
|
# "fo(?:o|oo)?bar" / "fobar" |
|
289
|
|
|
|
|
|
|
# "fo(?:o*|baz)bar" / "fobar" |
|
290
|
|
|
|
|
|
|
# "(?:fo(?:o*|baz)bar|blargh)" / "fobar" |
|
291
|
|
|
|
|
|
|
# |
|
292
|
|
|
|
|
|
|
# it's worse with this: |
|
293
|
|
|
|
|
|
|
# |
|
294
|
|
|
|
|
|
|
# "fo(?:o|oo|)bar" / "foo*bar" |
|
295
|
|
|
|
|
|
|
# |
|
296
|
|
|
|
|
|
|
# basically, this is impossible to compute without reimplementing most of |
|
297
|
|
|
|
|
|
|
# re2c, and it appears the re2c developers don't plan to offer this: |
|
298
|
|
|
|
|
|
|
# https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203 |
|
299
|
|
|
|
|
|
|
|
|
300
|
15
|
|
|
|
|
196
|
$conf->{base_orig}->{$ruletype} = { }; |
|
301
|
15
|
|
|
|
|
121
|
$conf->{base_string}->{$ruletype} = { }; |
|
302
|
|
|
|
|
|
|
|
|
303
|
15
|
|
|
|
|
38
|
$count = 0; |
|
304
|
15
|
50
|
0
|
|
|
51
|
$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({ |
|
305
|
|
|
|
|
|
|
total => (scalar @good_bases || 1), |
|
306
|
|
|
|
|
|
|
itemtype => 'bases', |
|
307
|
|
|
|
|
|
|
}); |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# this bit is annoyingly O(N^2). Rewrite the data -- the @good_bases |
|
310
|
|
|
|
|
|
|
# array -- into a more efficient format, using arrays and with a little |
|
311
|
|
|
|
|
|
|
# bit of precomputation, to go (quite a bit) faster |
|
312
|
15
|
|
|
|
|
26
|
my @rewritten; |
|
313
|
15
|
|
|
|
|
37
|
foreach my $set1 (@good_bases) { |
|
314
|
128
|
|
|
|
|
395
|
my $base = $set1->{base}; |
|
315
|
128
|
50
|
33
|
|
|
588
|
next if (!$base || !$set1->{name}); |
|
316
|
|
|
|
|
|
|
push @rewritten, [ |
|
317
|
|
|
|
|
|
|
$base, # 0 - SLOT_BASE |
|
318
|
|
|
|
|
|
|
$set1->{name}, # 1 - SLOT_NAME |
|
319
|
|
|
|
|
|
|
$set1->{orig}, # 2 - SLOT_ORIG |
|
320
|
128
|
|
|
|
|
772
|
length $base, # 3 - SLOT_LEN_BASE |
|
321
|
|
|
|
|
|
|
$base, # 4 - SLOT_BASE_INITIAL |
|
322
|
|
|
|
|
|
|
0 # 5 - SLOT_HAS_MULTIPLE, has_multiple flag |
|
323
|
|
|
|
|
|
|
]; |
|
324
|
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
@good_bases = sort { |
|
327
|
15
|
50
|
100
|
|
|
194
|
$b->[SLOT_LEN_BASE] <=> $a->[SLOT_LEN_BASE] || |
|
|
414
|
|
100
|
|
|
1826
|
|
|
328
|
|
|
|
|
|
|
$a->[SLOT_BASE] cmp $b->[SLOT_BASE] || |
|
329
|
|
|
|
|
|
|
$a->[SLOT_NAME] cmp $b->[SLOT_NAME] || |
|
330
|
|
|
|
|
|
|
$a->[SLOT_ORIG] cmp $b->[SLOT_ORIG] |
|
331
|
|
|
|
|
|
|
} @rewritten; |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
|
|
334
|
15
|
|
|
|
|
63
|
my $base_orig = $conf->{base_orig}->{$ruletype}; |
|
335
|
15
|
|
|
|
|
32
|
my $next_base_position = 0; |
|
336
|
15
|
|
|
|
|
50
|
for my $set1 (@good_bases) { |
|
337
|
128
|
|
|
|
|
259
|
$next_base_position++; |
|
338
|
128
|
50
|
33
|
|
|
273
|
$self->{show_progress} and $progress and $progress->update(++$count); |
|
339
|
128
|
100
|
|
|
|
427
|
my $base1 = $set1->[SLOT_BASE] or next; # got clobbered |
|
340
|
104
|
|
|
|
|
194
|
my $name1 = $set1->[SLOT_NAME]; |
|
341
|
104
|
|
|
|
|
140
|
my $orig1 = $set1->[SLOT_ORIG]; |
|
342
|
104
|
|
|
|
|
162
|
my $len1 = $set1->[SLOT_LEN_BASE]; |
|
343
|
104
|
|
|
|
|
305
|
$base_orig->{$name1} = $orig1; |
|
344
|
|
|
|
|
|
|
|
|
345
|
104
|
|
|
|
|
277
|
foreach my $set2 (@good_bases[$next_base_position .. $#good_bases]) { # order from smallest to largest |
|
346
|
|
|
|
|
|
|
# clobber false and exact dups; this can happen if a regexp outputs the |
|
347
|
|
|
|
|
|
|
# same base string multiple times |
|
348
|
1685
|
100
|
100
|
|
|
6016
|
if (!$set2->[SLOT_BASE] || |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
349
|
|
|
|
|
|
|
( |
|
350
|
|
|
|
|
|
|
$base1 eq $set2->[SLOT_BASE] && |
|
351
|
|
|
|
|
|
|
$name1 eq $set2->[SLOT_NAME] && |
|
352
|
|
|
|
|
|
|
$orig1 eq $set2->[SLOT_ORIG] |
|
353
|
|
|
|
|
|
|
) |
|
354
|
|
|
|
|
|
|
) |
|
355
|
|
|
|
|
|
|
{ |
|
356
|
|
|
|
|
|
|
#dbg("clobbering: [base2][$set2->[SLOT_BASE]][name2][$set2->[SLOT_NAME]][orig][$set2->[SLOT_ORIG]]"); |
|
357
|
24
|
|
|
|
|
174
|
$set2->[SLOT_BASE] = CLOBBER; # clobber |
|
358
|
24
|
|
|
|
|
102
|
next; |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# Cannot be a subset if it does not contain the other base string |
|
362
|
1661
|
100
|
|
|
|
6060
|
next if index($base1,$set2->[SLOT_BASE_INITIAL]) == -1; |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# skip if either already contains the other rule's name |
|
365
|
|
|
|
|
|
|
# optimize: this can only happen if the base has more than |
|
366
|
|
|
|
|
|
|
# one rule already attached, ie [5] |
|
367
|
33
|
0
|
33
|
|
|
100
|
next if ($set2->[SLOT_HAS_MULTIPLE] && index($set2->[SLOT_NAME],$name1) > -1 && $set2->[SLOT_NAME] =~ /(?: |^)\Q$name1\E(?: |$)/); |
|
|
|
|
33
|
|
|
|
|
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# don't use $name1 here, since another base in the set2 loop |
|
370
|
|
|
|
|
|
|
# may have added $name2 since we set that |
|
371
|
33
|
50
|
66
|
|
|
95
|
next if ($set1->[SLOT_HAS_MULTIPLE] && index($set1->[SLOT_NAME],$set2->[SLOT_NAME]) > -1 && $set1->[SLOT_NAME] =~ /(?: |^)\Q$set2->[SLOT_NAME]\E(?: |$)/); |
|
|
|
|
33
|
|
|
|
|
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# $set2->[SLOT_BASE] is just a subset of base1 |
|
374
|
|
|
|
|
|
|
#dbg("zoom: subsuming '$set2->[SLOT_BASE]' ($set2->[SLOT_NAME]) into '$base1': [SLOT_BASE]=$set1->[SLOT_BASE] [SLOT_HAS_MULTIPLE]=$set1->[SLOT_HAS_MULTIPLE]"); |
|
375
|
33
|
|
|
|
|
113
|
$set1->[SLOT_NAME] .= " ".$set2->[SLOT_NAME]; |
|
376
|
33
|
|
|
|
|
82
|
$set1->[SLOT_HAS_MULTIPLE] = 1; |
|
377
|
|
|
|
|
|
|
} |
|
378
|
|
|
|
|
|
|
} |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
# we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS |
|
381
|
|
|
|
|
|
|
# both contain "killed" for example, pointing at different rules, which |
|
382
|
|
|
|
|
|
|
# the above search hasn't found. Collapse them here with a hash |
|
383
|
15
|
|
|
|
|
68
|
my %bases; |
|
384
|
15
|
|
|
|
|
67
|
foreach my $set (@good_bases) { |
|
385
|
128
|
|
|
|
|
373
|
my $base = $set->[0]; |
|
386
|
128
|
100
|
|
|
|
305
|
next unless $base; |
|
387
|
|
|
|
|
|
|
|
|
388
|
104
|
100
|
|
|
|
233
|
if (defined $bases{$base}) { |
|
389
|
12
|
|
|
|
|
62
|
$bases{$base} .= " ".$set->[1]; |
|
390
|
|
|
|
|
|
|
} else { |
|
391
|
92
|
|
|
|
|
630
|
$bases{$base} = $set->[1]; |
|
392
|
|
|
|
|
|
|
} |
|
393
|
|
|
|
|
|
|
} |
|
394
|
15
|
|
|
|
|
55
|
undef @good_bases; |
|
395
|
|
|
|
|
|
|
|
|
396
|
15
|
|
|
|
|
41
|
my $base_string = $conf->{base_string}->{$ruletype}; |
|
397
|
15
|
|
|
|
|
152
|
foreach my $base (keys %bases) { |
|
398
|
|
|
|
|
|
|
# uniq the list, since there are probably dup rules listed |
|
399
|
92
|
|
|
|
|
123
|
my %u; |
|
400
|
92
|
|
|
|
|
247
|
for my $i (split ' ', $bases{$base}) { |
|
401
|
137
|
100
|
|
|
|
297
|
next if exists $u{$i}; undef $u{$i}; |
|
|
120
|
|
|
|
|
241
|
|
|
402
|
|
|
|
|
|
|
} |
|
403
|
92
|
|
|
|
|
409
|
$base_string->{$base} = join ' ', sort keys %u; |
|
404
|
|
|
|
|
|
|
} |
|
405
|
|
|
|
|
|
|
|
|
406
|
15
|
50
|
33
|
|
|
73
|
$self->{show_progress} and $progress and $progress->final(); |
|
407
|
|
|
|
|
|
|
|
|
408
|
15
|
50
|
|
|
|
43
|
if ($cachefile) { |
|
409
|
0
|
|
|
|
|
0
|
$self->write_cachefile ($cachefile, $cached); |
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
|
|
412
|
15
|
|
|
|
|
34
|
my $elapsed = time - $start; |
|
413
|
|
|
|
|
|
|
$self->{show_progress} and info ("$ruletype: ". |
|
414
|
15
|
50
|
|
|
|
1144
|
(scalar keys %{$conf->{base_string}->{$ruletype}}). |
|
|
0
|
|
|
|
|
0
|
|
|
415
|
|
|
|
|
|
|
" base strings extracted in $elapsed seconds\n"); |
|
416
|
|
|
|
|
|
|
} |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
########################################################################### |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
# TODO: |
|
421
|
|
|
|
|
|
|
# NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i |
|
422
|
|
|
|
|
|
|
# => should extract 'scription' somehow |
|
423
|
|
|
|
|
|
|
# /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i |
|
424
|
|
|
|
|
|
|
# => should understand alternations; tricky |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub simplify_and_qr_regexp { |
|
427
|
42
|
|
|
42
|
0
|
221
|
my $self = shift; |
|
428
|
42
|
|
|
|
|
111
|
my $rule = shift; |
|
429
|
|
|
|
|
|
|
|
|
430
|
42
|
|
|
|
|
93
|
my $main = $self->{main}; |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
|
|
433
|
42
|
|
|
|
|
173
|
my $mods = ''; |
|
434
|
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
# remove the regexp modifiers, keep for later |
|
436
|
42
|
|
|
|
|
526
|
while ($rule =~ s/^\(\?([a-z]*)\)//) { |
|
437
|
20
|
|
|
|
|
105
|
$mods .= $1; |
|
438
|
|
|
|
|
|
|
} |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
# modifier removal |
|
441
|
42
|
|
|
|
|
215
|
while ($rule =~ s/^\(\?-([a-z]*)\)//) { |
|
442
|
0
|
|
|
|
|
0
|
foreach my $modchar (split '', $mods) { |
|
443
|
0
|
|
|
|
|
0
|
$mods =~ s/$modchar//g; |
|
444
|
|
|
|
|
|
|
} |
|
445
|
|
|
|
|
|
|
} |
|
446
|
|
|
|
|
|
|
|
|
447
|
42
|
|
|
|
|
78
|
my $lossy = 0; |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# now: simplify aspects of the regexp. Bear in mind that we can |
|
450
|
|
|
|
|
|
|
# simplify as long as we cause the regexp to become more general; |
|
451
|
|
|
|
|
|
|
# more hits is OK, since false positives will be discarded afterwards |
|
452
|
|
|
|
|
|
|
# anyway. Simplification that causes the regexp to *not* hit |
|
453
|
|
|
|
|
|
|
# stuff that the "real" rule would hit, however, is a bad thing. |
|
454
|
|
|
|
|
|
|
|
|
455
|
42
|
100
|
|
|
|
243
|
if ($main->{bases_must_be_casei}) { |
|
456
|
37
|
|
|
|
|
180
|
$rule = lc $rule; |
|
457
|
|
|
|
|
|
|
|
|
458
|
37
|
|
|
|
|
80
|
$lossy = 1; |
|
459
|
37
|
100
|
|
|
|
335
|
$mods =~ s/i// and $lossy = 0; |
|
460
|
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/ |
|
462
|
37
|
100
|
|
|
|
353
|
$rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and $lossy++; |
|
463
|
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
# always case-i: /A(?-i:ct)/ => /Act/ |
|
465
|
37
|
50
|
|
|
|
158
|
$rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs and $lossy++; |
|
466
|
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
# remove (?i) |
|
468
|
37
|
|
|
|
|
88
|
$rule =~ s/\(\?i\)//gs; |
|
469
|
|
|
|
|
|
|
} |
|
470
|
|
|
|
|
|
|
else { |
|
471
|
5
|
50
|
|
|
|
49
|
die "case-i" if $rule =~ /\(\?i\)/; |
|
472
|
5
|
100
|
|
|
|
28
|
die "case-i" if $mods =~ /i/; |
|
473
|
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/ |
|
475
|
4
|
50
|
|
|
|
13
|
$rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs and die "case-i"; |
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
# we're already non-case-i so this is a no-op: /A(?-i:ct)/ => /Act/ |
|
478
|
4
|
|
|
|
|
11
|
$rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs; |
|
479
|
|
|
|
|
|
|
} |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
# remove /m and /s modifiers |
|
482
|
41
|
50
|
|
|
|
173
|
$mods =~ s/m// and $lossy++; |
|
483
|
41
|
100
|
|
|
|
129
|
$mods =~ s/s// and $lossy++; |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
# remove (^|\b)'s |
|
486
|
|
|
|
|
|
|
# T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is |
|
487
|
41
|
50
|
|
|
|
110
|
$rule =~ s/\(\^\|\\b\)//gs and $lossy++; |
|
488
|
41
|
50
|
|
|
|
97
|
$rule =~ s/\(\$\|\\b\)//gs and $lossy++; |
|
489
|
41
|
50
|
|
|
|
126
|
$rule =~ s/\(\\b\|\^\)//gs and $lossy++; |
|
490
|
41
|
50
|
|
|
|
100
|
$rule =~ s/\(\\b\|\$\)//gs and $lossy++; |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
# remove (?!credit) |
|
493
|
41
|
100
|
|
|
|
198
|
$rule =~ s/\(\?\![^\)]+\)//gs and $lossy++; |
|
494
|
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
# remove \b's |
|
496
|
41
|
100
|
|
|
|
311
|
$rule =~ s/(?<!\\)\\b//gs and $lossy++; |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# remove the "?=" trick |
|
499
|
|
|
|
|
|
|
# (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...) |
|
500
|
41
|
|
|
|
|
90
|
$rule =~ s/\(\?\=\[[^\]]+\]\)//gs; |
|
501
|
|
|
|
|
|
|
|
|
502
|
41
|
100
|
|
|
|
245
|
$mods .= "L" if $lossy; |
|
503
|
41
|
|
|
|
|
311
|
($rule, $mods); |
|
504
|
|
|
|
|
|
|
} |
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
sub extract_hints { |
|
507
|
41
|
|
|
41
|
0
|
134
|
my $self = shift; |
|
508
|
41
|
|
|
|
|
83
|
my $rawrule = shift; |
|
509
|
41
|
|
|
|
|
139
|
my $rule = shift; |
|
510
|
41
|
|
|
|
|
136
|
my $mods = shift; |
|
511
|
|
|
|
|
|
|
|
|
512
|
41
|
|
|
|
|
89
|
my $main = $self->{main}; |
|
513
|
41
|
|
|
|
|
187
|
my $orig = $rule; |
|
514
|
|
|
|
|
|
|
|
|
515
|
41
|
|
|
|
|
95
|
my $lossy = 0; |
|
516
|
41
|
100
|
|
|
|
280
|
$mods =~ s/L// and $lossy++; |
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
# if there are anchors, give up; we can't get much |
|
519
|
|
|
|
|
|
|
# faster than these anyway |
|
520
|
41
|
50
|
|
|
|
239
|
die "anchors" if $rule =~ /^\(?(?:\^|\\A)/; |
|
521
|
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/; |
|
523
|
|
|
|
|
|
|
# just remove end-of-string anchors; they're slow so could gain |
|
524
|
|
|
|
|
|
|
# from our speedup |
|
525
|
41
|
50
|
|
|
|
237
|
$rule =~ s/(?<!\\)(?:\$|\\Z)\)?$// and $lossy++; |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# simplify (?:..) to (..) |
|
528
|
|
|
|
|
|
|
$main->{bases_allow_noncapture_groups} or |
|
529
|
41
|
50
|
|
|
|
377
|
$rule =~ s/\(\?:/\(/g; |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
# simplify some grouping arrangements so they're easier for us to parse |
|
532
|
|
|
|
|
|
|
# (foo)? => (foo|) |
|
533
|
41
|
|
|
|
|
258
|
$rule =~ s/\((.*?)\)\?/\($1\|\)/gs; |
|
534
|
|
|
|
|
|
|
# r? => (r|) |
|
535
|
41
|
|
|
|
|
153
|
$rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs; |
|
536
|
|
|
|
|
|
|
|
|
537
|
41
|
|
|
|
|
509
|
my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile(); |
|
538
|
41
|
50
|
|
|
|
230
|
$tmpfh or die "failed to create a temporary file"; |
|
539
|
41
|
|
|
|
|
178
|
untaint_var(\$tmpf); |
|
540
|
|
|
|
|
|
|
|
|
541
|
41
|
50
|
|
|
|
432
|
print $tmpfh "use bytes; m{" . $rule . "}" . $mods |
|
542
|
|
|
|
|
|
|
or die "error writing to $tmpf: $!"; |
|
543
|
41
|
50
|
|
|
|
1874
|
close $tmpfh or die "error closing $tmpf: $!"; |
|
544
|
|
|
|
|
|
|
|
|
545
|
41
|
|
|
|
|
410
|
my $perl = $self->get_perl(); |
|
546
|
41
|
|
|
|
|
148
|
local *IN; |
|
547
|
41
|
50
|
|
|
|
159795
|
open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |") |
|
548
|
|
|
|
|
|
|
or die "cannot run $perl: ".exit_status_str($?,$!); |
|
549
|
|
|
|
|
|
|
|
|
550
|
41
|
|
|
|
|
597
|
my($inbuf,$nread,$fullstr); $fullstr = ''; |
|
|
41
|
|
|
|
|
368
|
|
|
551
|
41
|
|
|
|
|
348142
|
while ( $nread=read(IN,$inbuf,16384) ) { $fullstr .= $inbuf } |
|
|
41
|
|
|
|
|
1260
|
|
|
552
|
41
|
50
|
|
|
|
236
|
defined $nread or die "error reading from pipe: $!"; |
|
553
|
|
|
|
|
|
|
|
|
554
|
41
|
50
|
|
|
|
3865
|
unlink $tmpf or die "cannot unlink $tmpf: $!"; |
|
555
|
41
|
50
|
|
|
|
1690
|
close IN or die "error closing pipe: $!"; |
|
556
|
41
|
50
|
|
|
|
426
|
defined $fullstr or warn "empty result from a pipe"; |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# now parse the -Mre=debug output. |
|
559
|
|
|
|
|
|
|
# perl 5.10 format |
|
560
|
41
|
|
|
|
|
2161
|
$fullstr =~ s/^.*\nFinal program:\n//gs; |
|
561
|
|
|
|
|
|
|
# perl 5.6/5.8 format |
|
562
|
41
|
|
|
|
|
247
|
$fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs; |
|
563
|
41
|
|
|
|
|
173
|
$fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs; |
|
564
|
|
|
|
|
|
|
# common to all |
|
565
|
41
|
|
|
|
|
107
|
$fullstr =~ s/\nOffsets:.*$//gs; |
|
566
|
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
# clean up every other line that doesn't start with a space |
|
568
|
41
|
|
|
|
|
1305
|
$fullstr =~ s/^\S.*$//gm; |
|
569
|
|
|
|
|
|
|
|
|
570
|
41
|
50
|
|
|
|
1019
|
if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) { |
|
571
|
0
|
|
|
|
|
0
|
die "failed to parse Mre=debug output: $fullstr m{".$rule."}".$mods." $rawrule"; |
|
572
|
|
|
|
|
|
|
} |
|
573
|
41
|
|
|
|
|
949
|
my $opsstr = $1; |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
# what's left looks like this: |
|
576
|
|
|
|
|
|
|
# 1: EXACTF <v>(3) |
|
577
|
|
|
|
|
|
|
# 3: ANYOF[1ILil](14) |
|
578
|
|
|
|
|
|
|
# 14: EXACTF <a>(16) |
|
579
|
|
|
|
|
|
|
# 16: CURLY {2,7}(29) |
|
580
|
|
|
|
|
|
|
# 18: ANYOF[A-Za-z](0) |
|
581
|
|
|
|
|
|
|
# 29: SPACE(30) |
|
582
|
|
|
|
|
|
|
# 30: EXACTF <http://>(33) |
|
583
|
|
|
|
|
|
|
# 33: END(0) |
|
584
|
|
|
|
|
|
|
# |
|
585
|
41
|
|
|
|
|
136
|
DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr"; |
|
586
|
|
|
|
|
|
|
|
|
587
|
41
|
|
|
|
|
95
|
my @ops; |
|
588
|
41
|
|
|
|
|
2171
|
foreach my $op (split(/\n/s, $opsstr)) { |
|
589
|
439
|
50
|
|
|
|
1239
|
next unless $op; |
|
590
|
|
|
|
|
|
|
|
|
591
|
439
|
100
|
|
|
|
3899
|
if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*?)\s*(?:\(\d+\))?$/) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
# perl 5.8: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx...>(18) |
|
593
|
|
|
|
|
|
|
# perl 5.10, 5.12, 5.14: <xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>... (18) |
|
594
|
321
|
|
|
|
|
6180
|
push @ops, [ $1, $2, $3 ]; |
|
595
|
|
|
|
|
|
|
} |
|
596
|
|
|
|
|
|
|
elsif ($op =~ /^ (\s*)<(.*)>\.\.\.\s*$/) { |
|
597
|
|
|
|
|
|
|
# 5: TRIE-EXACT[im](44) |
|
598
|
|
|
|
|
|
|
# <message contained attachments that have been blocked by guin>... |
|
599
|
4
|
|
|
|
|
49
|
my $spcs = $1; |
|
600
|
|
|
|
|
|
|
# we could use the entire length here, but it's easier to trim to |
|
601
|
|
|
|
|
|
|
# the length of a perl 5.8.x/5.6.x EXACT* string; that way our test |
|
602
|
|
|
|
|
|
|
# suite results will match, since the sa-update --list extraction will |
|
603
|
|
|
|
|
|
|
# be the same for all versions. (The "..." trailer is important btw) |
|
604
|
4
|
|
|
|
|
83
|
my $str = substr ($2, 0, 55); |
|
605
|
4
|
|
|
|
|
97
|
push @ops, [ $spcs, '_moretrie', "<$str...>" ]; |
|
606
|
|
|
|
|
|
|
} |
|
607
|
|
|
|
|
|
|
elsif ($op =~ /^ (\s*)(<.*>)\s*(?:\(\d+\))?$/) { |
|
608
|
|
|
|
|
|
|
# 5: TRIE-EXACT[am](21) |
|
609
|
|
|
|
|
|
|
# <am> (21) |
|
610
|
|
|
|
|
|
|
# <might> (12) |
|
611
|
114
|
|
|
|
|
1411
|
push @ops, [ $1, '_moretrie', $2 ]; |
|
612
|
|
|
|
|
|
|
} |
|
613
|
|
|
|
|
|
|
elsif ($op =~ /^ at .+ line \d+$/) { |
|
614
|
0
|
|
|
|
|
0
|
next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109': |
|
615
|
|
|
|
|
|
|
} |
|
616
|
|
|
|
|
|
|
else { |
|
617
|
0
|
|
|
|
|
0
|
warn "cannot parse '$op': $opsstr"; |
|
618
|
0
|
|
|
|
|
0
|
next; |
|
619
|
|
|
|
|
|
|
} |
|
620
|
|
|
|
|
|
|
} |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# unroll the branches; returns a list of versions. |
|
623
|
|
|
|
|
|
|
# e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ] |
|
624
|
41
|
|
|
|
|
336
|
my @unrolled; |
|
625
|
41
|
100
|
|
|
|
275
|
if ($main->{bases_split_out_alternations}) { |
|
626
|
35
|
|
|
|
|
764
|
@unrolled = $self->unroll_branches(0, \@ops); |
|
627
|
|
|
|
|
|
|
} else { |
|
628
|
6
|
|
|
|
|
61
|
@unrolled = ( \@ops ); |
|
629
|
|
|
|
|
|
|
} |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
# now find the longest DFA-friendly string in each unrolled version |
|
632
|
41
|
|
|
|
|
116
|
my @longests; |
|
633
|
41
|
|
|
|
|
153
|
foreach my $opsarray (@unrolled) { |
|
634
|
136
|
|
|
|
|
537
|
my $longestexact = ''; |
|
635
|
136
|
|
|
|
|
335
|
my $buf = ''; |
|
636
|
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
# use a closure to keep the code succinct |
|
638
|
|
|
|
|
|
|
my $add_candidate = sub { |
|
639
|
430
|
100
|
|
430
|
|
1249
|
if (length $buf > length $longestexact) { $longestexact = $buf; } |
|
|
159
|
|
|
|
|
366
|
|
|
640
|
430
|
|
|
|
|
699
|
$buf = ''; |
|
641
|
136
|
|
|
|
|
1146
|
}; |
|
642
|
|
|
|
|
|
|
|
|
643
|
136
|
|
|
|
|
396
|
my $prevop; |
|
644
|
136
|
|
|
|
|
180
|
foreach my $op (@{$opsarray}) { |
|
|
136
|
|
|
|
|
277
|
|
|
645
|
635
|
|
|
|
|
966
|
my ($spcs, $item, $args) = @{$op}; |
|
|
635
|
|
|
|
|
1844
|
|
|
646
|
|
|
|
|
|
|
|
|
647
|
635
|
100
|
|
|
|
1559
|
next if ($item eq 'NOTHING'); |
|
648
|
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
# EXACT == case-sensitive |
|
650
|
|
|
|
|
|
|
# EXACTF == case-i |
|
651
|
|
|
|
|
|
|
# we can do both, since we canonicalize to lc. |
|
652
|
626
|
100
|
100
|
|
|
8942
|
if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/) |
|
|
|
100
|
66
|
|
|
|
|
|
|
|
50
|
100
|
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
653
|
|
|
|
|
|
|
{ |
|
654
|
142
|
|
|
|
|
589
|
my $str = $1; |
|
655
|
142
|
|
|
|
|
395
|
$buf .= $str; |
|
656
|
142
|
100
|
|
|
|
659
|
if ($buf =~ s/\\x\{[0-9a-fA-F]{4,}\}.*$//) { |
|
657
|
|
|
|
|
|
|
# a high Unicode codepoint, interpreted by perl 5.8.x. cut and stop |
|
658
|
1
|
|
|
|
|
28
|
$add_candidate->(); |
|
659
|
|
|
|
|
|
|
} |
|
660
|
142
|
50
|
33
|
|
|
617
|
if (length $str >= 55 && $buf =~ s/\.\.\.$//) { |
|
661
|
|
|
|
|
|
|
# perl 5.8.x truncates with a "..." here! cut and stop |
|
662
|
0
|
|
|
|
|
0
|
$add_candidate->(); |
|
663
|
|
|
|
|
|
|
} |
|
664
|
|
|
|
|
|
|
} |
|
665
|
|
|
|
|
|
|
# _moretrie == a TRIE-EXACT entry |
|
666
|
|
|
|
|
|
|
elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/) |
|
667
|
|
|
|
|
|
|
{ |
|
668
|
190
|
|
|
|
|
494
|
$buf .= $1; |
|
669
|
190
|
100
|
66
|
|
|
670
|
if (length $1 >= 55 && $buf =~ s/\.\.\.$//) { |
|
670
|
|
|
|
|
|
|
# perl 5.8.x truncates with a "..." here! cut and stop |
|
671
|
4
|
|
|
|
|
37
|
$add_candidate->(); |
|
672
|
|
|
|
|
|
|
} |
|
673
|
|
|
|
|
|
|
} |
|
674
|
|
|
|
|
|
|
# /(?:foo|bar|baz){2}/ results in a CURLYX beforehand |
|
675
|
|
|
|
|
|
|
elsif ($item =~ /^EXACT/ && |
|
676
|
|
|
|
|
|
|
$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ && |
|
677
|
|
|
|
|
|
|
$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 && |
|
678
|
|
|
|
|
|
|
$args =~ /<(.*)>/) |
|
679
|
|
|
|
|
|
|
{ |
|
680
|
0
|
|
|
|
|
0
|
$buf .= $1; |
|
681
|
0
|
0
|
0
|
|
|
0
|
if (length $1 >= 55 && $buf =~ s/\.\.\.$//) { |
|
682
|
|
|
|
|
|
|
# perl 5.8.x truncates with a "..." here! cut and stop |
|
683
|
0
|
|
|
|
|
0
|
$add_candidate->(); |
|
684
|
|
|
|
|
|
|
} |
|
685
|
|
|
|
|
|
|
} |
|
686
|
|
|
|
|
|
|
# CURLYX, for perl >= 5.9.5 |
|
687
|
|
|
|
|
|
|
elsif ($item =~ /^_moretrie/ && |
|
688
|
|
|
|
|
|
|
$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ && |
|
689
|
|
|
|
|
|
|
$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 && |
|
690
|
|
|
|
|
|
|
$args =~ /<(.*)>/) |
|
691
|
|
|
|
|
|
|
{ |
|
692
|
5
|
|
|
|
|
44
|
$buf .= $1; |
|
693
|
5
|
50
|
33
|
|
|
38
|
if (length $1 >= 60 && $buf =~ s/\.\.\.$//) { |
|
694
|
|
|
|
|
|
|
# perl 5.8.x truncates with a "..." here! cut and stop |
|
695
|
0
|
|
|
|
|
0
|
$add_candidate->(); |
|
696
|
|
|
|
|
|
|
} |
|
697
|
|
|
|
|
|
|
} |
|
698
|
|
|
|
|
|
|
else { |
|
699
|
|
|
|
|
|
|
# not an /^EXACT/; clear the buffer |
|
700
|
289
|
|
|
|
|
682
|
$add_candidate->(); |
|
701
|
289
|
100
|
|
|
|
1280
|
if ($item !~ /^(?:END|CLOSE\d|MINMOD)$/) |
|
702
|
|
|
|
|
|
|
{ |
|
703
|
173
|
|
|
|
|
310
|
$lossy = 1; |
|
704
|
173
|
|
|
|
|
241
|
DEBUG_RE_PARSING and warn "item $item makes regexp lossy"; |
|
705
|
|
|
|
|
|
|
} |
|
706
|
|
|
|
|
|
|
} |
|
707
|
626
|
|
|
|
|
1550
|
$prevop = $op; |
|
708
|
|
|
|
|
|
|
} |
|
709
|
136
|
|
|
|
|
348
|
$add_candidate->(); |
|
710
|
|
|
|
|
|
|
|
|
711
|
136
|
100
|
|
|
|
296
|
if (!$longestexact) { |
|
712
|
1
|
|
|
|
|
90
|
die "no long-enough string found in $rawrule\n"; |
|
713
|
|
|
|
|
|
|
# all unrolled versions must have a long string, otherwise |
|
714
|
|
|
|
|
|
|
# we cannot reliably match all variants of the rule |
|
715
|
|
|
|
|
|
|
} else { |
|
716
|
135
|
100
|
|
|
|
1263
|
push @longests, ($main->{bases_must_be_casei}) ? |
|
717
|
|
|
|
|
|
|
lc $longestexact : $longestexact; |
|
718
|
|
|
|
|
|
|
} |
|
719
|
|
|
|
|
|
|
} |
|
720
|
|
|
|
|
|
|
|
|
721
|
40
|
|
|
|
|
74
|
DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/"; |
|
722
|
40
|
|
|
|
|
2451
|
return ($lossy, @longests); |
|
723
|
|
|
|
|
|
|
} |
|
724
|
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
########################################################################### |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
sub unroll_branches { |
|
728
|
187
|
|
|
187
|
0
|
492
|
my ($self, $depth, $opslist) = @_; |
|
729
|
|
|
|
|
|
|
|
|
730
|
187
|
50
|
|
|
|
384
|
die "too deep" if ($depth++ > 5); |
|
731
|
|
|
|
|
|
|
|
|
732
|
187
|
|
|
|
|
289
|
my @ops = (@{$opslist}); # copy |
|
|
187
|
|
|
|
|
659
|
|
|
733
|
187
|
|
|
|
|
520
|
my @pre_branch_ops; |
|
734
|
|
|
|
|
|
|
my $branch_spcs; |
|
735
|
187
|
|
|
|
|
0
|
my $trie_spcs; |
|
736
|
187
|
|
|
|
|
0
|
my $open_spcs; |
|
737
|
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
# our input looks something like this 2-level structure: |
|
739
|
|
|
|
|
|
|
# 1: BOUND(2) |
|
740
|
|
|
|
|
|
|
# 2: EXACT <Dear >(5) |
|
741
|
|
|
|
|
|
|
# 5: BRANCH(9) |
|
742
|
|
|
|
|
|
|
# 6: EXACT <IT>(8) |
|
743
|
|
|
|
|
|
|
# 8: NALNUM(24) |
|
744
|
|
|
|
|
|
|
# 9: BRANCH(23) |
|
745
|
|
|
|
|
|
|
# 10: EXACT <Int>(12) |
|
746
|
|
|
|
|
|
|
# 12: BRANCH(14) |
|
747
|
|
|
|
|
|
|
# 13: NOTHING(21) |
|
748
|
|
|
|
|
|
|
# 14: BRANCH(17) |
|
749
|
|
|
|
|
|
|
# 15: EXACT <a>(21) |
|
750
|
|
|
|
|
|
|
# 17: BRANCH(20) |
|
751
|
|
|
|
|
|
|
# 18: EXACT <er>(21) |
|
752
|
|
|
|
|
|
|
# 20: TAIL(21) |
|
753
|
|
|
|
|
|
|
# 21: EXACT <net>(24) |
|
754
|
|
|
|
|
|
|
# 23: TAIL(24) |
|
755
|
|
|
|
|
|
|
# 24: EXACT < shop>(27) |
|
756
|
|
|
|
|
|
|
# 27: END(0) |
|
757
|
|
|
|
|
|
|
# |
|
758
|
|
|
|
|
|
|
# or: |
|
759
|
|
|
|
|
|
|
# |
|
760
|
|
|
|
|
|
|
# 1: OPEN1(3) |
|
761
|
|
|
|
|
|
|
# 3: BRANCH(6) |
|
762
|
|
|
|
|
|
|
# 4: EXACT <v>(9) |
|
763
|
|
|
|
|
|
|
# 6: BRANCH(9) |
|
764
|
|
|
|
|
|
|
# 7: EXACT <\\/>(9) |
|
765
|
|
|
|
|
|
|
# 9: CLOSE1(11) |
|
766
|
|
|
|
|
|
|
# 11: CURLY {2,5}(14) |
|
767
|
|
|
|
|
|
|
# 13: REG_ANY(0) |
|
768
|
|
|
|
|
|
|
# 14: EXACT < g r a >(17) |
|
769
|
|
|
|
|
|
|
# 17: ANYOF[a-z](28) |
|
770
|
|
|
|
|
|
|
# 28: END(0) |
|
771
|
|
|
|
|
|
|
# |
|
772
|
|
|
|
|
|
|
# or: |
|
773
|
|
|
|
|
|
|
# |
|
774
|
|
|
|
|
|
|
# 1: EXACT <i >(3) |
|
775
|
|
|
|
|
|
|
# 3: OPEN1(5) |
|
776
|
|
|
|
|
|
|
# 5: TRIE-EXACT[am](21) |
|
777
|
|
|
|
|
|
|
# <am> (21) |
|
778
|
|
|
|
|
|
|
# <might> (12) |
|
779
|
|
|
|
|
|
|
# 12: OPEN2(14) |
|
780
|
|
|
|
|
|
|
# 14: TRIE-EXACT[ ](19) |
|
781
|
|
|
|
|
|
|
# < be> |
|
782
|
|
|
|
|
|
|
# <> |
|
783
|
|
|
|
|
|
|
# 19: CLOSE2(21) |
|
784
|
|
|
|
|
|
|
# 21: CLOSE1(23) |
|
785
|
|
|
|
|
|
|
# 23: EXACT < c>(25) |
|
786
|
|
|
|
|
|
|
|
|
787
|
187
|
|
|
|
|
238
|
DEBUG_RE_PARSING and warn "starting parse"; |
|
788
|
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
# this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform |
|
790
|
|
|
|
|
|
|
# it into the latter. bit of a kludge to do this before the loop, but hey. |
|
791
|
|
|
|
|
|
|
# note that it doesn't fix the CLOSE1/END ordering to be correct |
|
792
|
187
|
50
|
66
|
|
|
1093
|
if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) { |
|
793
|
0
|
|
|
|
|
0
|
my @newops = ([ "", "OPEN1", "" ]); |
|
794
|
0
|
|
|
|
|
0
|
foreach my $op (@ops) { |
|
795
|
0
|
|
|
|
|
0
|
push @newops, [ " ".$op->[0], $op->[1], $op->[2] ]; |
|
796
|
|
|
|
|
|
|
} |
|
797
|
0
|
|
|
|
|
0
|
push @newops, [ "", "CLOSE1", "" ]; |
|
798
|
0
|
|
|
|
|
0
|
@ops = @newops; |
|
799
|
|
|
|
|
|
|
} |
|
800
|
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
# iterate until we start a branch set. using |
|
802
|
|
|
|
|
|
|
# /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..." |
|
803
|
|
|
|
|
|
|
# just hitting an OPEN is not enough; wait until we see a TRIE-EXACT |
|
804
|
|
|
|
|
|
|
# or a BRANCH, *then* unroll the most recent OPEN set. |
|
805
|
187
|
|
|
|
|
267
|
while (1) { |
|
806
|
968
|
|
|
|
|
1189
|
my $op = shift @ops; |
|
807
|
968
|
100
|
|
|
|
1550
|
last unless defined $op; |
|
808
|
|
|
|
|
|
|
|
|
809
|
838
|
|
|
|
|
926
|
my ($spcs, $item, $args) = @{$op}; |
|
|
838
|
|
|
|
|
2314
|
|
|
810
|
838
|
|
|
|
|
966
|
DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args"; |
|
811
|
|
|
|
|
|
|
|
|
812
|
838
|
100
|
100
|
|
|
4907
|
if ($item =~ /^OPEN/) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
813
|
54
|
|
|
|
|
202
|
$open_spcs = $spcs; |
|
814
|
54
|
|
|
|
|
204
|
next; # next will be a BRANCH or TRIE |
|
815
|
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
} elsif ($item =~ /^TRIE/) { |
|
817
|
49
|
|
|
|
|
157
|
$trie_spcs = $spcs; |
|
818
|
49
|
|
|
|
|
153
|
last; |
|
819
|
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
} elsif ($item =~ /^BRANCH/) { |
|
821
|
8
|
|
|
|
|
63
|
$branch_spcs = $spcs; |
|
822
|
8
|
|
|
|
|
58
|
last; |
|
823
|
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
} elsif ($item =~ /^EXACT/ && defined $open_spcs) { |
|
825
|
|
|
|
|
|
|
# perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT |
|
826
|
7
|
|
|
|
|
96
|
push @pre_branch_ops, [ $open_spcs, $item, $args ]; |
|
827
|
7
|
|
|
|
|
105
|
next; |
|
828
|
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
} elsif (defined $open_spcs) { |
|
830
|
|
|
|
|
|
|
# OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT: |
|
831
|
|
|
|
|
|
|
# ignore this OPEN block entirely and don't try to unroll it |
|
832
|
2
|
|
|
|
|
11
|
undef $open_spcs; |
|
833
|
|
|
|
|
|
|
|
|
834
|
|
|
|
|
|
|
} else { |
|
835
|
718
|
|
|
|
|
1696
|
push @pre_branch_ops, $op; |
|
836
|
|
|
|
|
|
|
} |
|
837
|
|
|
|
|
|
|
} |
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
# no branches found? we're done unrolling on this one! |
|
840
|
187
|
100
|
|
|
|
441
|
if (scalar @ops == 0) { |
|
841
|
130
|
|
|
|
|
657
|
return [ @pre_branch_ops ]; |
|
842
|
|
|
|
|
|
|
} |
|
843
|
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
# otherwise we're at the start of a new branch set |
|
845
|
|
|
|
|
|
|
# /(foo|bar(baz|argh)boo)gab/ |
|
846
|
57
|
|
|
|
|
109
|
my @alts; |
|
847
|
|
|
|
|
|
|
my @in_this_branch; |
|
848
|
|
|
|
|
|
|
|
|
849
|
57
|
|
|
|
|
87
|
DEBUG_RE_PARSING and warn "entering branch: ". |
|
850
|
|
|
|
|
|
|
"open='".(defined $open_spcs ? $open_spcs : 'undef')."' ". |
|
851
|
|
|
|
|
|
|
"branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ". |
|
852
|
|
|
|
|
|
|
"trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'"; |
|
853
|
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
# indentation level to remove from "normal" ops (using a s///) |
|
855
|
57
|
100
|
|
|
|
450
|
my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." "; |
|
856
|
57
|
|
|
|
|
314
|
my $trie_sub_spcs = ""; |
|
857
|
57
|
|
|
|
|
99
|
while (1) { |
|
858
|
387
|
|
|
|
|
573
|
my $op = shift @ops; |
|
859
|
387
|
50
|
|
|
|
789
|
last unless defined $op; |
|
860
|
387
|
|
|
|
|
495
|
my ($spcs, $item, $args) = @{$op}; |
|
|
387
|
|
|
|
|
1330
|
|
|
861
|
387
|
|
|
|
|
464
|
DEBUG_RE_PARSING and warn "in: [$spcs] $item $args"; |
|
862
|
|
|
|
|
|
|
|
|
863
|
387
|
100
|
100
|
|
|
5484
|
if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt |
|
|
|
50
|
66
|
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
|
100
|
33
|
|
|
|
|
|
|
|
100
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
864
|
9
|
|
|
|
|
42
|
push @alts, [ @pre_branch_ops, @in_this_branch ]; |
|
865
|
9
|
|
|
|
|
30
|
@in_this_branch = (); |
|
866
|
9
|
|
|
|
|
30
|
$open_sub_spcs = $branch_spcs." "; |
|
867
|
9
|
|
|
|
|
32
|
$trie_sub_spcs = ""; |
|
868
|
9
|
|
|
|
|
33
|
next; |
|
869
|
|
|
|
|
|
|
} |
|
870
|
|
|
|
|
|
|
elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end |
|
871
|
0
|
|
|
|
|
0
|
push @alts, [ @pre_branch_ops, @in_this_branch ]; |
|
872
|
0
|
|
|
|
|
0
|
undef $branch_spcs; |
|
873
|
0
|
|
|
|
|
0
|
$open_sub_spcs = ""; |
|
874
|
0
|
|
|
|
|
0
|
$trie_sub_spcs = ""; |
|
875
|
0
|
|
|
|
|
0
|
last; |
|
876
|
|
|
|
|
|
|
} |
|
877
|
|
|
|
|
|
|
elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') { |
|
878
|
135
|
100
|
|
|
|
308
|
if (scalar @in_this_branch > 0) { |
|
879
|
86
|
|
|
|
|
238
|
push @alts, [ @pre_branch_ops, @in_this_branch ]; |
|
880
|
|
|
|
|
|
|
} |
|
881
|
|
|
|
|
|
|
# use $open_spcs instead of $trie_spcs (which is 2 spcs further indented) |
|
882
|
135
|
|
|
|
|
711
|
@in_this_branch = ( [ $open_spcs, $item, $args ] ); |
|
883
|
135
|
50
|
|
|
|
318
|
$open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." "; |
|
884
|
135
|
|
|
|
|
253
|
$trie_sub_spcs = " "; |
|
885
|
135
|
|
|
|
|
382
|
next; |
|
886
|
|
|
|
|
|
|
} |
|
887
|
|
|
|
|
|
|
elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end |
|
888
|
52
|
|
|
|
|
230
|
push @alts, [ @pre_branch_ops, @in_this_branch ]; |
|
889
|
52
|
|
|
|
|
101
|
undef $branch_spcs; |
|
890
|
52
|
|
|
|
|
110
|
undef $open_spcs; |
|
891
|
52
|
|
|
|
|
106
|
undef $trie_spcs; |
|
892
|
52
|
|
|
|
|
132
|
$open_sub_spcs = ""; |
|
893
|
52
|
|
|
|
|
102
|
$trie_sub_spcs = ""; |
|
894
|
52
|
|
|
|
|
160
|
last; |
|
895
|
|
|
|
|
|
|
} |
|
896
|
|
|
|
|
|
|
elsif ($item eq 'END') { # of string |
|
897
|
5
|
|
|
|
|
32
|
push @alts, [ @pre_branch_ops, @in_this_branch ]; |
|
898
|
5
|
|
|
|
|
27
|
undef $branch_spcs; |
|
899
|
5
|
|
|
|
|
18
|
undef $open_spcs; |
|
900
|
5
|
|
|
|
|
16
|
undef $trie_spcs; |
|
901
|
5
|
|
|
|
|
17
|
$open_sub_spcs = ""; |
|
902
|
5
|
|
|
|
|
15
|
$trie_sub_spcs = ""; |
|
903
|
5
|
|
|
|
|
26
|
last; |
|
904
|
|
|
|
|
|
|
} |
|
905
|
|
|
|
|
|
|
else { |
|
906
|
186
|
50
|
|
|
|
440
|
if ($open_sub_spcs) { |
|
907
|
|
|
|
|
|
|
# deindent the space-level to match the opening brace |
|
908
|
186
|
|
|
|
|
1321
|
$spcs =~ s/^$open_sub_spcs//; |
|
909
|
|
|
|
|
|
|
# tries also add one more indent level in |
|
910
|
186
|
|
|
|
|
1016
|
$spcs =~ s/^$trie_sub_spcs//; |
|
911
|
|
|
|
|
|
|
} |
|
912
|
186
|
|
|
|
|
1389
|
push @in_this_branch, [ $spcs, $item, $args ]; |
|
913
|
|
|
|
|
|
|
# note that we ignore ops at a deeper $spcs level entirely (until later!) |
|
914
|
|
|
|
|
|
|
} |
|
915
|
|
|
|
|
|
|
} |
|
916
|
|
|
|
|
|
|
|
|
917
|
57
|
50
|
|
|
|
130
|
if (defined $branch_spcs) { |
|
918
|
0
|
|
|
|
|
0
|
die "fell off end of string with a branch open: '$branch_spcs'"; |
|
919
|
|
|
|
|
|
|
} |
|
920
|
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
# we're now after the branch set: /gab/ |
|
922
|
|
|
|
|
|
|
# @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ] |
|
923
|
57
|
|
|
|
|
188
|
foreach my $alt (@alts) { |
|
924
|
152
|
|
|
|
|
231
|
push @{$alt}, @ops; # add all remaining ops to each one |
|
|
152
|
|
|
|
|
568
|
|
|
925
|
|
|
|
|
|
|
# note that this could include more (?:...); we don't care, since |
|
926
|
|
|
|
|
|
|
# those can be handled by recursing |
|
927
|
|
|
|
|
|
|
} |
|
928
|
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
# ok, parsed the entire ops list |
|
930
|
|
|
|
|
|
|
# @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ] |
|
931
|
|
|
|
|
|
|
|
|
932
|
57
|
|
|
|
|
96
|
if (DEBUG_RE_PARSING) { |
|
933
|
|
|
|
|
|
|
print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; } |
|
934
|
|
|
|
|
|
|
} |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
# now recurse, to unroll the remaining branches (if any exist) |
|
937
|
57
|
|
|
|
|
107
|
my @rets; |
|
938
|
57
|
|
|
|
|
103
|
foreach my $alt (@alts) { |
|
939
|
152
|
|
|
|
|
672
|
push @rets, $self->unroll_branches($depth, $alt); |
|
940
|
|
|
|
|
|
|
} |
|
941
|
|
|
|
|
|
|
|
|
942
|
57
|
|
|
|
|
79
|
if (DEBUG_RE_PARSING) { |
|
943
|
|
|
|
|
|
|
print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; } |
|
944
|
|
|
|
|
|
|
} |
|
945
|
|
|
|
|
|
|
|
|
946
|
57
|
|
|
|
|
420
|
return @rets; |
|
947
|
|
|
|
|
|
|
} |
|
948
|
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
########################################################################### |
|
950
|
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
sub test { |
|
952
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
|
953
|
|
|
|
|
|
|
|
|
954
|
0
|
|
|
|
|
0
|
$self->test_split_alt("foo", "/foo/"); |
|
955
|
0
|
|
|
|
|
0
|
$self->test_split_alt("(foo)", "/foo/"); |
|
956
|
0
|
|
|
|
|
0
|
$self->test_split_alt("foo(bar)baz", "/foobarbaz/"); |
|
957
|
0
|
|
|
|
|
0
|
$self->test_split_alt("x(foo|)", "/xfoo/ /x/"); |
|
958
|
0
|
|
|
|
|
0
|
$self->test_split_alt("fo(o|)", "/foo/ /fo/"); |
|
959
|
0
|
|
|
|
|
0
|
$self->test_split_alt("(foo|bar)", "/foo/ /bar/"); |
|
960
|
0
|
|
|
|
|
0
|
$self->test_split_alt("foo|bar", "/foo/ /bar/"); |
|
961
|
0
|
|
|
|
|
0
|
$self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/"); |
|
962
|
0
|
|
|
|
|
0
|
$self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/"); |
|
963
|
0
|
|
|
|
|
0
|
$self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/"); |
|
964
|
0
|
|
|
|
|
0
|
$self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/"); |
|
965
|
|
|
|
|
|
|
} |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
sub test_split_alt { |
|
968
|
0
|
|
|
0
|
0
|
0
|
my ($self, $in, $out) = @_; |
|
969
|
|
|
|
|
|
|
|
|
970
|
0
|
|
|
|
|
0
|
my @got = $self->split_alt($in); |
|
971
|
0
|
|
|
|
|
0
|
$out =~ s/^\///; |
|
972
|
0
|
|
|
|
|
0
|
$out =~ s/\/$//; |
|
973
|
0
|
|
|
|
|
0
|
my @want = split(/\/ \//, $out); |
|
974
|
|
|
|
|
|
|
|
|
975
|
0
|
|
|
|
|
0
|
my $failed = 0; |
|
976
|
0
|
0
|
|
|
|
0
|
if (scalar @want != scalar @got) { |
|
977
|
0
|
|
|
|
|
0
|
warn "FAIL: results count don't match"; |
|
978
|
0
|
|
|
|
|
0
|
$failed++; |
|
979
|
|
|
|
|
|
|
} |
|
980
|
|
|
|
|
|
|
else { |
|
981
|
0
|
|
|
|
|
0
|
my %got = map { $_ => 1 } @got; |
|
|
0
|
|
|
|
|
0
|
|
|
982
|
0
|
|
|
|
|
0
|
foreach my $w (@want) { |
|
983
|
0
|
0
|
|
|
|
0
|
if (!$got{$w}) { |
|
984
|
0
|
|
|
|
|
0
|
warn "FAIL: '$w' not found"; |
|
985
|
0
|
|
|
|
|
0
|
$failed++; |
|
986
|
|
|
|
|
|
|
} |
|
987
|
|
|
|
|
|
|
} |
|
988
|
|
|
|
|
|
|
} |
|
989
|
|
|
|
|
|
|
|
|
990
|
0
|
0
|
|
|
|
0
|
if ($failed) { |
|
991
|
0
|
0
|
|
|
|
0
|
print "want: /".join('/ /', @want)."/\n" or die "error writing: $!"; |
|
992
|
0
|
0
|
|
|
|
0
|
print "got: /".join('/ /', @got)."/\n" or die "error writing: $!"; |
|
993
|
0
|
|
|
|
|
0
|
return 0; |
|
994
|
|
|
|
|
|
|
} else { |
|
995
|
0
|
0
|
|
|
|
0
|
print "ok\n" or die "error writing: $!"; |
|
996
|
0
|
|
|
|
|
0
|
return 1; |
|
997
|
|
|
|
|
|
|
} |
|
998
|
|
|
|
|
|
|
} |
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
########################################################################### |
|
1001
|
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
sub get_perl { |
|
1003
|
41
|
|
|
41
|
0
|
120
|
my ($self) = @_; |
|
1004
|
41
|
|
|
|
|
81
|
my $perl; |
|
1005
|
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
# allow user override of the perl interpreter to use when |
|
1007
|
|
|
|
|
|
|
# extracting base strings. |
|
1008
|
|
|
|
|
|
|
# TODO: expose this via sa-compile command-line option |
|
1009
|
41
|
|
|
|
|
120
|
my $fromconf = $self->{main}->{conf}->{re_parser_perl}; |
|
1010
|
|
|
|
|
|
|
|
|
1011
|
41
|
50
|
|
|
|
396
|
if ($fromconf) { |
|
|
|
50
|
|
|
|
|
|
|
1012
|
0
|
|
|
|
|
0
|
$perl = $fromconf; |
|
1013
|
|
|
|
|
|
|
} elsif ($^X =~ m|^/|) { |
|
1014
|
41
|
|
|
|
|
174
|
$perl = $^X; |
|
1015
|
|
|
|
|
|
|
} else { |
|
1016
|
1
|
|
|
1
|
|
24
|
use Config; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
1210
|
|
|
1017
|
0
|
|
|
|
|
0
|
$perl = $Config{perlpath}; |
|
1018
|
0
|
|
|
|
|
0
|
$perl =~ s|/[^/]*$|/$^X|; |
|
1019
|
|
|
|
|
|
|
} |
|
1020
|
41
|
|
|
|
|
166
|
untaint_var(\$perl); |
|
1021
|
41
|
|
|
|
|
243
|
return $perl; |
|
1022
|
|
|
|
|
|
|
} |
|
1023
|
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
########################################################################### |
|
1025
|
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
sub read_cachefile { |
|
1027
|
0
|
|
|
0
|
0
|
0
|
my ($self, $cachefile) = @_; |
|
1028
|
0
|
|
|
|
|
0
|
local *IN; |
|
1029
|
0
|
0
|
|
|
|
0
|
if (open(IN, "<".$cachefile)) { |
|
1030
|
0
|
|
|
|
|
0
|
my($inbuf,$nread,$str); $str = ''; |
|
|
0
|
|
|
|
|
0
|
|
|
1031
|
0
|
|
|
|
|
0
|
while ( $nread=read(IN,$inbuf,16384) ) { $str .= $inbuf } |
|
|
0
|
|
|
|
|
0
|
|
|
1032
|
0
|
0
|
|
|
|
0
|
defined $nread or die "error reading from $cachefile: $!"; |
|
1033
|
0
|
0
|
|
|
|
0
|
close IN or die "error closing $cachefile: $!"; |
|
1034
|
|
|
|
|
|
|
|
|
1035
|
0
|
|
|
|
|
0
|
untaint_var(\$str); |
|
1036
|
0
|
|
|
|
|
0
|
my $VAR1; # Data::Dumper |
|
1037
|
0
|
0
|
|
|
|
0
|
if (eval $str) { |
|
1038
|
0
|
|
|
|
|
0
|
return $VAR1; # Data::Dumper's naming |
|
1039
|
|
|
|
|
|
|
} |
|
1040
|
|
|
|
|
|
|
} |
|
1041
|
0
|
|
|
|
|
0
|
return { }; |
|
1042
|
|
|
|
|
|
|
} |
|
1043
|
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
sub write_cachefile { |
|
1045
|
0
|
|
|
0
|
0
|
0
|
my ($self, $cachefile, $cached) = @_; |
|
1046
|
|
|
|
|
|
|
|
|
1047
|
0
|
|
|
|
|
0
|
my $dump = Data::Dumper->new ([ $cached ]); |
|
1048
|
0
|
|
|
|
|
0
|
$dump->Deepcopy(1); |
|
1049
|
0
|
|
|
|
|
0
|
$dump->Purity(1); |
|
1050
|
0
|
|
|
|
|
0
|
$dump->Indent(1); |
|
1051
|
|
|
|
|
|
|
|
|
1052
|
0
|
|
|
|
|
0
|
my $cachedir = $self->{main}->{bases_cache_dir}; |
|
1053
|
0
|
0
|
|
|
|
0
|
if (mkdir($cachedir)) { |
|
|
|
0
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
# successfully created |
|
1055
|
|
|
|
|
|
|
} elsif ($! == EEXIST) { |
|
1056
|
0
|
|
|
|
|
0
|
dbg("zoom: ok, cache directory already existed"); |
|
1057
|
|
|
|
|
|
|
} else { |
|
1058
|
0
|
|
|
|
|
0
|
warn "zoom: could not create cache directory: $cachedir ($!)\n"; |
|
1059
|
0
|
|
|
|
|
0
|
return; |
|
1060
|
|
|
|
|
|
|
} |
|
1061
|
0
|
0
|
|
|
|
0
|
open(CACHE, ">$cachefile") or warn "cannot write to $cachefile"; |
|
1062
|
0
|
0
|
|
|
|
0
|
print CACHE ($dump->Dump, ";1;") or die "error writing: $!"; |
|
1063
|
0
|
0
|
|
|
|
0
|
close CACHE or die "error closing $cachefile: $!"; |
|
1064
|
|
|
|
|
|
|
} |
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=over 4 |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
=item my ($cleanregexp) = fixup_re($regexp); |
|
1069
|
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
Converts encoded characters in a regular expression pattern into their |
|
1071
|
|
|
|
|
|
|
equivalent characters |
|
1072
|
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
=back |
|
1074
|
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
=cut |
|
1076
|
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
sub fixup_re { |
|
1078
|
135
|
|
|
135
|
1
|
331
|
my $re = shift; |
|
1079
|
|
|
|
|
|
|
|
|
1080
|
135
|
0
|
|
|
|
413
|
if ($fixup_re_test) { print "INPUT: /$re/\n" or die "error writing: $!" } |
|
|
0
|
50
|
|
|
|
0
|
|
|
1081
|
|
|
|
|
|
|
|
|
1082
|
135
|
|
|
|
|
592
|
my $output = ""; |
|
1083
|
135
|
|
|
|
|
1147
|
my $TOK = qr([\"\\]); |
|
1084
|
|
|
|
|
|
|
|
|
1085
|
135
|
|
|
|
|
220
|
my $STATE; |
|
1086
|
135
|
|
|
|
|
656
|
local ($1,$2); |
|
1087
|
135
|
|
|
|
|
1207
|
while ($re =~ /\G(.*?)($TOK)/gcs) { |
|
1088
|
0
|
|
|
|
|
0
|
my $pre = $1; |
|
1089
|
0
|
|
|
|
|
0
|
my $tok = $2; |
|
1090
|
|
|
|
|
|
|
|
|
1091
|
0
|
0
|
|
|
|
0
|
if (length($pre)) { |
|
1092
|
0
|
|
|
|
|
0
|
$output .= "\"$pre\""; |
|
1093
|
|
|
|
|
|
|
} |
|
1094
|
|
|
|
|
|
|
|
|
1095
|
0
|
0
|
|
|
|
0
|
if ($tok eq '"') { |
|
|
|
0
|
|
|
|
|
|
|
1096
|
0
|
|
|
|
|
0
|
$output .= '"\\""'; |
|
1097
|
|
|
|
|
|
|
} |
|
1098
|
|
|
|
|
|
|
elsif ($tok eq '\\') { |
|
1099
|
0
|
0
|
|
|
|
0
|
$re =~ /\G(x\{[^\}]+\}|[0-7]{1,3}|.)/gcs or die "\\ at end of string!"; |
|
1100
|
0
|
|
|
|
|
0
|
my $esc = $1; |
|
1101
|
0
|
0
|
|
|
|
0
|
if ($esc eq '"') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1102
|
0
|
|
|
|
|
0
|
$output .= '"\\""'; |
|
1103
|
|
|
|
|
|
|
} elsif ($esc eq '\\') { |
|
1104
|
0
|
|
|
|
|
0
|
$output .= '"**BACKSLASH**"'; # avoid hairy escape-parsing |
|
1105
|
|
|
|
|
|
|
} elsif ($esc =~ /^x\{(\S+)\}\z/) { |
|
1106
|
0
|
|
|
|
|
0
|
$output .= '"'.chr(hex($1)).'"'; |
|
1107
|
|
|
|
|
|
|
} elsif ($esc =~ /^[0-7]{1,3}\z/) { |
|
1108
|
0
|
|
|
|
|
0
|
$output .= '"'.chr(oct($esc)).'"'; |
|
1109
|
|
|
|
|
|
|
} else { |
|
1110
|
0
|
|
|
|
|
0
|
$output .= "\"$esc\""; |
|
1111
|
|
|
|
|
|
|
} |
|
1112
|
|
|
|
|
|
|
} |
|
1113
|
|
|
|
|
|
|
else { |
|
1114
|
0
|
0
|
|
|
|
0
|
print "PRE: $pre\nTOK: $tok\n" or die "error writing: $!"; |
|
1115
|
|
|
|
|
|
|
} |
|
1116
|
|
|
|
|
|
|
} |
|
1117
|
|
|
|
|
|
|
|
|
1118
|
135
|
50
|
|
|
|
375
|
if (!defined(pos($re))) { |
|
|
|
0
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
# no matches |
|
1120
|
135
|
|
|
|
|
661
|
$output .= "\"$re\""; |
|
1121
|
|
|
|
|
|
|
# Bug 6649: protect NL, NULL, ^Z, (and controls to stay on the safe side) |
|
1122
|
135
|
|
|
|
|
491
|
$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse; |
|
|
0
|
|
|
|
|
0
|
|
|
1123
|
|
|
|
|
|
|
} |
|
1124
|
|
|
|
|
|
|
elsif (pos($re) <= length($re)) { |
|
1125
|
0
|
|
|
|
|
0
|
$output =~ s{([\000-\037\177\200\377])}{sprintf("\\%03o",ord($1))}gse; |
|
|
0
|
|
|
|
|
0
|
|
|
1126
|
0
|
|
|
|
|
0
|
$output .= fixup_re(substr($re, pos($re))); |
|
1127
|
|
|
|
|
|
|
} |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
135
|
|
|
|
|
265
|
$output =~ s/^""/"/; # protect start and end quotes |
|
1130
|
135
|
|
|
|
|
195
|
$output =~ s/(?<!\\)""\z/"/; |
|
1131
|
135
|
|
|
|
|
191
|
$output =~ s/(?<!\\)""//g; # strip empty strings, or turn "abc""def" -> "abcdef" |
|
1132
|
135
|
|
|
|
|
171
|
$output =~ s/\*\*BACKSLASH\*\*/\\\\/gs; |
|
1133
|
|
|
|
|
|
|
|
|
1134
|
135
|
0
|
|
|
|
235
|
if ($fixup_re_test) { print "OUTPUT: $output\n" or die "error writing: $!" } |
|
|
0
|
50
|
|
|
|
0
|
|
|
1135
|
135
|
|
|
|
|
725
|
return $output; |
|
1136
|
|
|
|
|
|
|
} |
|
1137
|
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
1; |