line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# <@LICENSE> |
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
15
|
|
|
|
|
|
|
# limitations under the License. |
16
|
|
|
|
|
|
|
# </@LICENSE> |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 SYNOPSIS |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 DESCRIPTION |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This plugin implements the threshold-based auto-learning discriminator |
29
|
|
|
|
|
|
|
for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism |
30
|
|
|
|
|
|
|
whereby high-scoring mails (or low-scoring mails, for non-spam) are fed |
31
|
|
|
|
|
|
|
into its learning systems without user intervention, during scanning. |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Note that certain tests are ignored when determining whether a message |
34
|
|
|
|
|
|
|
should be trained upon: |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=over 4 |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=item * rules with tflags set to 'learn' (the Bayesian rules) |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item * rules with tflags set to 'userconf' (user configuration) |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item * rules with tflags set to 'noautolearn' |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=back |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
Also note that auto-learning occurs using scores from either scoreset 0 |
47
|
|
|
|
|
|
|
or 1, depending on what scoreset is used during message check. It is |
48
|
|
|
|
|
|
|
likely that the message check and auto-learn scores will be different. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
package Mail::SpamAssassin::Plugin::AutoLearnThreshold; |
53
|
|
|
|
|
|
|
|
54
|
21
|
|
|
21
|
|
172
|
use Mail::SpamAssassin::Plugin; |
|
21
|
|
|
|
|
53
|
|
|
21
|
|
|
|
|
706
|
|
55
|
21
|
|
|
21
|
|
153
|
use Mail::SpamAssassin::Logger; |
|
21
|
|
|
|
|
68
|
|
|
21
|
|
|
|
|
1355
|
|
56
|
21
|
|
|
21
|
|
145
|
use strict; |
|
21
|
|
|
|
|
99
|
|
|
21
|
|
|
|
|
609
|
|
57
|
21
|
|
|
21
|
|
146
|
use warnings; |
|
21
|
|
|
|
|
46
|
|
|
21
|
|
|
|
|
919
|
|
58
|
|
|
|
|
|
|
# use bytes; |
59
|
21
|
|
|
21
|
|
179
|
use re 'taint'; |
|
21
|
|
|
|
|
45
|
|
|
21
|
|
|
|
|
17977
|
|
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
our @ISA = qw(Mail::SpamAssassin::Plugin); |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub new { |
64
|
62
|
|
|
62
|
1
|
235
|
my $class = shift; |
65
|
62
|
|
|
|
|
177
|
my $mailsaobject = shift; |
66
|
|
|
|
|
|
|
|
67
|
62
|
|
33
|
|
|
475
|
$class = ref($class) || $class; |
68
|
62
|
|
|
|
|
402
|
my $self = $class->SUPER::new($mailsaobject); |
69
|
62
|
|
|
|
|
218
|
bless ($self, $class); |
70
|
|
|
|
|
|
|
|
71
|
62
|
|
|
|
|
361
|
$self->set_config($mailsaobject->{conf}); |
72
|
|
|
|
|
|
|
|
73
|
62
|
|
|
|
|
644
|
return $self; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
sub set_config { |
77
|
62
|
|
|
62
|
0
|
206
|
my($self, $conf) = @_; |
78
|
62
|
|
|
|
|
142
|
my @cmds; |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=head1 USER OPTIONS |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
The following configuration settings are used to control auto-learning: |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=over 4 |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1) |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
The score threshold below which a mail has to score, to be fed into |
89
|
|
|
|
|
|
|
SpamAssassin's learning systems automatically as a non-spam message. |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=cut |
92
|
|
|
|
|
|
|
|
93
|
62
|
|
|
|
|
375
|
push (@cmds, { |
94
|
|
|
|
|
|
|
setting => 'bayes_auto_learn_threshold_nonspam', |
95
|
|
|
|
|
|
|
default => 0.1, |
96
|
|
|
|
|
|
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC |
97
|
|
|
|
|
|
|
}); |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=item bayes_auto_learn_threshold_spam n.nn (default: 12.0) |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
The score threshold above which a mail has to score, to be fed into |
102
|
|
|
|
|
|
|
SpamAssassin's learning systems automatically as a spam message. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
Note: SpamAssassin requires at least 3 points from the header, and 3 |
105
|
|
|
|
|
|
|
points from the body to auto-learn as spam. Therefore, the minimum |
106
|
|
|
|
|
|
|
working value for this option is 6. |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
If the test option autolearn_force is set, the minimum value will |
109
|
|
|
|
|
|
|
remain at 6 points but there is no requirement that the points come |
110
|
|
|
|
|
|
|
from body and header rules. This option is useful for autolearning |
111
|
|
|
|
|
|
|
with rules that are considered to be extremely safe indicators of |
112
|
|
|
|
|
|
|
the spaminess of a message. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |
115
|
|
|
|
|
|
|
|
116
|
62
|
|
|
|
|
307
|
push (@cmds, { |
117
|
|
|
|
|
|
|
setting => 'bayes_auto_learn_threshold_spam', |
118
|
|
|
|
|
|
|
default => 12.0, |
119
|
|
|
|
|
|
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC |
120
|
|
|
|
|
|
|
}); |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
=item bayes_auto_learn_on_error (0 | 1) (default: 0) |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
With C<bayes_auto_learn_on_error> off, autolearning will be performed |
125
|
|
|
|
|
|
|
even if the bayes classifier already agrees with the new classification (i.e. |
126
|
|
|
|
|
|
|
yielded BAYES_00 for what we are now trying to teach it as ham, or yielded |
127
|
|
|
|
|
|
|
BAYES_99 for spam). This is the traditional setting; the default was chosen |
128
|
|
|
|
|
|
|
to retain backward compatibility. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
With C<bayes_auto_learn_on_error> turned on, autolearning will be performed |
131
|
|
|
|
|
|
|
only when a bayes classifier had a different opinion from what the autolearner |
132
|
|
|
|
|
|
|
is now trying to teach it (i.e. it made an error in judgement). This strategy |
133
|
|
|
|
|
|
|
may or may not produce better future classifications, but usually works |
134
|
|
|
|
|
|
|
very well, while also preventing unnecessary overlearning and slowing down |
135
|
|
|
|
|
|
|
database growth. |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=cut |
138
|
|
|
|
|
|
|
|
139
|
62
|
|
|
|
|
251
|
push (@cmds, { |
140
|
|
|
|
|
|
|
setting => 'bayes_auto_learn_on_error', |
141
|
|
|
|
|
|
|
default => 0, |
142
|
|
|
|
|
|
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL |
143
|
|
|
|
|
|
|
}); |
144
|
|
|
|
|
|
|
|
145
|
62
|
|
|
|
|
316
|
$conf->{parser}->register_commands(\@cmds); |
146
|
|
|
|
|
|
|
} |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub autolearn_discriminator { |
149
|
12
|
|
|
12
|
1
|
36
|
my ($self, $params) = @_; |
150
|
|
|
|
|
|
|
|
151
|
12
|
|
|
|
|
30
|
my $scan = $params->{permsgstatus}; |
152
|
12
|
|
|
|
|
29
|
my $conf = $scan->{conf}; |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Figure out min/max for autolearning. |
155
|
|
|
|
|
|
|
# Default to specified auto_learn_threshold settings |
156
|
12
|
|
|
|
|
34
|
my $min = $conf->{bayes_auto_learn_threshold_nonspam}; |
157
|
12
|
|
|
|
|
38
|
my $max = $conf->{bayes_auto_learn_threshold_spam}; |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Find out what score we should consider this message to have ... |
160
|
12
|
|
|
|
|
60
|
my $score = $scan->get_autolearn_points(); |
161
|
12
|
|
|
|
|
67
|
my $body_only_points = $scan->get_body_only_points(); |
162
|
12
|
|
|
|
|
59
|
my $head_only_points = $scan->get_head_only_points(); |
163
|
12
|
|
|
|
|
55
|
my $learned_points = $scan->get_learned_points(); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# find out if any of the tests added an autolearn_force status |
166
|
12
|
|
|
|
|
54
|
my $force_autolearn = $scan->get_autolearn_force_status(); |
167
|
12
|
|
|
|
|
55
|
my $force_autolearn_names = $scan->get_autolearn_force_names(); |
168
|
|
|
|
|
|
|
|
169
|
12
|
|
|
|
|
176
|
dbg("learn: auto-learn? ham=$min, spam=$max, ". |
170
|
|
|
|
|
|
|
"body-points=".$body_only_points.", ". |
171
|
|
|
|
|
|
|
"head-points=".$head_only_points.", ". |
172
|
|
|
|
|
|
|
"learned-points=".$learned_points); |
173
|
|
|
|
|
|
|
|
174
|
12
|
|
|
|
|
25
|
my $isspam; |
175
|
12
|
100
|
|
|
|
62
|
if ($score < $min) { |
|
|
50
|
|
|
|
|
|
176
|
3
|
|
|
|
|
9
|
$isspam = 0; |
177
|
|
|
|
|
|
|
} elsif ($score >= $max) { |
178
|
0
|
|
|
|
|
0
|
$isspam = 1; |
179
|
|
|
|
|
|
|
} else { |
180
|
9
|
|
|
|
|
34
|
dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam"); |
181
|
9
|
|
|
|
|
81
|
return; |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
|
184
|
3
|
|
|
|
|
5
|
my $learner_said_ham_points = -1.0; |
185
|
3
|
|
|
|
|
6
|
my $learner_said_spam_points = 1.0; |
186
|
|
|
|
|
|
|
|
187
|
3
|
50
|
|
|
|
9
|
if ($isspam) { |
188
|
0
|
|
|
|
|
0
|
my $required_body_points = 3; |
189
|
0
|
|
|
|
|
0
|
my $required_head_points = 3; |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
#Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule |
192
|
0
|
0
|
|
|
|
0
|
if ($force_autolearn) { |
193
|
0
|
|
|
|
|
0
|
$required_body_points = -99; |
194
|
0
|
|
|
|
|
0
|
$required_head_points = -99; |
195
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn: autolearn_force flagged for a rule. Removing separate body and head point threshold. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)"); |
196
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names"); |
197
|
|
|
|
|
|
|
} else { |
198
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)"); |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
|
201
|
0
|
0
|
|
|
|
0
|
if ($body_only_points < $required_body_points) { |
202
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as spam but too few body points (". |
203
|
|
|
|
|
|
|
$body_only_points." < ".$required_body_points.")"); |
204
|
0
|
|
|
|
|
0
|
return; |
205
|
|
|
|
|
|
|
} |
206
|
0
|
0
|
|
|
|
0
|
if ($head_only_points < $required_head_points) { |
207
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as spam but too few head points (". |
208
|
|
|
|
|
|
|
$head_only_points." < ".$required_head_points.")"); |
209
|
0
|
|
|
|
|
0
|
return; |
210
|
|
|
|
|
|
|
} |
211
|
0
|
0
|
|
|
|
0
|
if ($learned_points < $learner_said_ham_points) { |
212
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as spam but learner indicated ham (". |
213
|
|
|
|
|
|
|
$learned_points." < ".$learner_said_ham_points.")"); |
214
|
0
|
|
|
|
|
0
|
return; |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
0
|
0
|
|
|
|
0
|
if (!$scan->is_spam()) { |
218
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam"); |
219
|
0
|
|
|
|
|
0
|
return; |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
} else { |
223
|
3
|
50
|
|
|
|
12
|
if ($learned_points > $learner_said_spam_points) { |
224
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as ham but learner indicated spam (". |
225
|
|
|
|
|
|
|
$learned_points." > ".$learner_said_spam_points.")"); |
226
|
0
|
|
|
|
|
0
|
return; |
227
|
|
|
|
|
|
|
} |
228
|
|
|
|
|
|
|
|
229
|
3
|
100
|
|
|
|
11
|
if ($scan->is_spam()) { |
230
|
1
|
|
|
|
|
6
|
dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham"); |
231
|
1
|
|
|
|
|
4
|
return; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
2
|
50
|
|
|
|
9
|
if ($conf->{bayes_auto_learn_on_error}) { |
236
|
|
|
|
|
|
|
# learn-on-error strategy chosen: |
237
|
|
|
|
|
|
|
# only allow learning if the autolearning classifier was unsure or |
238
|
|
|
|
|
|
|
# had a different opinion from what we are trying to make it learn |
239
|
|
|
|
|
|
|
# |
240
|
0
|
|
|
|
|
0
|
my $tests = $scan->get_tag('TESTS'); |
241
|
0
|
0
|
0
|
|
|
0
|
if (defined $tests && $tests ne 'none') { |
242
|
0
|
|
|
|
|
0
|
my %t = map { ($_,1) } split(/,/, $tests); |
|
0
|
|
|
|
|
0
|
|
243
|
0
|
0
|
0
|
|
|
0
|
if ($isspam && $t{'BAYES_99'} || !$isspam && $t{'BAYES_00'}) { |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
244
|
0
|
0
|
|
|
|
0
|
dbg("learn: auto-learn? no: learn-on-error, %s, already classified ". |
245
|
|
|
|
|
|
|
"as such", $isspam ? 'spam' : 'ham'); |
246
|
0
|
|
|
|
|
0
|
return; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
2
|
50
|
|
|
|
24
|
dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no")); |
|
|
50
|
|
|
|
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
#Return an array reference because call_plugins only carries one return value |
254
|
2
|
|
|
|
|
10
|
return [$isspam, $force_autolearn, $force_autolearn_names]; |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
1; |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=back |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=cut |