line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# <@LICENSE> |
2
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
3
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
4
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
5
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
6
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
12
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
13
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
15
|
|
|
|
|
|
|
# limitations under the License. |
16
|
|
|
|
|
|
|
# </@LICENSE> |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 NAME |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 SYNOPSIS |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 DESCRIPTION |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
This plugin implements the threshold-based auto-learning discriminator |
29
|
|
|
|
|
|
|
for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism |
30
|
|
|
|
|
|
|
whereby high-scoring mails (or low-scoring mails, for non-spam) are fed |
31
|
|
|
|
|
|
|
into its learning systems without user intervention, during scanning. |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
Note that certain tests are ignored when determining whether a message |
34
|
|
|
|
|
|
|
should be trained upon: |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=over 4 |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=item * rules with tflags set to 'learn' (the Bayesian rules) |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item * rules with tflags set to 'userconf' (user configuration) |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item * rules with tflags set to 'noautolearn' |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=back |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
Also note that auto-learning occurs using scores from either scoreset 0 |
47
|
|
|
|
|
|
|
or 1, depending on what scoreset is used during message check. It is |
48
|
|
|
|
|
|
|
likely that the message check and auto-learn scores will be different. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
use Mail::SpamAssassin::Plugin; |
54
|
22
|
|
|
22
|
|
159
|
use Mail::SpamAssassin::Logger; |
|
22
|
|
|
|
|
57
|
|
|
22
|
|
|
|
|
757
|
|
55
|
22
|
|
|
22
|
|
126
|
use strict; |
|
22
|
|
|
|
|
56
|
|
|
22
|
|
|
|
|
1400
|
|
56
|
22
|
|
|
22
|
|
172
|
use warnings; |
|
22
|
|
|
|
|
66
|
|
|
22
|
|
|
|
|
585
|
|
57
|
22
|
|
|
22
|
|
118
|
# use bytes; |
|
22
|
|
|
|
|
65
|
|
|
22
|
|
|
|
|
882
|
|
58
|
|
|
|
|
|
|
use re 'taint'; |
59
|
22
|
|
|
22
|
|
132
|
|
|
22
|
|
|
|
|
54
|
|
|
22
|
|
|
|
|
16741
|
|
60
|
|
|
|
|
|
|
our @ISA = qw(Mail::SpamAssassin::Plugin); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
my $class = shift; |
63
|
|
|
|
|
|
|
my $mailsaobject = shift; |
64
|
63
|
|
|
63
|
1
|
195
|
|
65
|
63
|
|
|
|
|
139
|
$class = ref($class) || $class; |
66
|
|
|
|
|
|
|
my $self = $class->SUPER::new($mailsaobject); |
67
|
63
|
|
33
|
|
|
389
|
bless ($self, $class); |
68
|
63
|
|
|
|
|
327
|
|
69
|
63
|
|
|
|
|
168
|
$self->set_config($mailsaobject->{conf}); |
70
|
|
|
|
|
|
|
|
71
|
63
|
|
|
|
|
281
|
return $self; |
72
|
|
|
|
|
|
|
} |
73
|
63
|
|
|
|
|
532
|
|
74
|
|
|
|
|
|
|
my($self, $conf) = @_; |
75
|
|
|
|
|
|
|
my @cmds; |
76
|
|
|
|
|
|
|
|
77
|
63
|
|
|
63
|
0
|
182
|
=head1 USER OPTIONS |
78
|
63
|
|
|
|
|
117
|
|
79
|
|
|
|
|
|
|
The following configuration settings are used to control auto-learning: |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
=over 4 |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1) |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
The score threshold below which a mail has to score, to be fed into |
86
|
|
|
|
|
|
|
SpamAssassin's learning systems automatically as a non-spam message. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=cut |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
push (@cmds, { |
91
|
|
|
|
|
|
|
setting => 'bayes_auto_learn_threshold_nonspam', |
92
|
|
|
|
|
|
|
default => 0.1, |
93
|
63
|
|
|
|
|
330
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC |
94
|
|
|
|
|
|
|
}); |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=item bayes_auto_learn_threshold_spam n.nn (default: 12.0) |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
The score threshold at or above which a mail has to score, to be fed into |
99
|
|
|
|
|
|
|
SpamAssassin's learning systems automatically as a spam message. |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Note: SpamAssassin requires at least 3 points from the header, and 3 |
102
|
|
|
|
|
|
|
points from the body to auto-learn as spam. Therefore, the minimum |
103
|
|
|
|
|
|
|
working value for this option is 6. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
If the test option autolearn_force is set, the minimum value will |
106
|
|
|
|
|
|
|
remain at 6 points but there is no requirement that the points come |
107
|
|
|
|
|
|
|
from body and header rules. This option is useful for autolearning |
108
|
|
|
|
|
|
|
with rules that are considered to be extremely safe indicators of |
109
|
|
|
|
|
|
|
the spamminess of a message. |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=cut |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
push (@cmds, { |
114
|
|
|
|
|
|
|
setting => 'bayes_auto_learn_threshold_spam', |
115
|
|
|
|
|
|
|
default => 12.0, |
116
|
63
|
|
|
|
|
274
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC |
117
|
|
|
|
|
|
|
}); |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=item bayes_auto_learn_on_error (0 | 1) (default: 0) |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
With C<bayes_auto_learn_on_error> off, autolearning will be performed |
122
|
|
|
|
|
|
|
even if the Bayes classifier already agrees with the new classification (i.e. |
123
|
|
|
|
|
|
|
yielded BAYES_00 for what we are now trying to teach it as ham, or yielded |
124
|
|
|
|
|
|
|
BAYES_99 for spam). This is the traditional setting; the default was chosen |
125
|
|
|
|
|
|
|
to retain backward compatibility. |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
With C<bayes_auto_learn_on_error> turned on, autolearning will be performed |
128
|
|
|
|
|
|
|
only when the Bayes classifier had a different opinion from what the autolearner |
129
|
|
|
|
|
|
|
is now trying to teach it (i.e. it made an error in judgement). This strategy |
130
|
|
|
|
|
|
|
may or may not produce better future classifications, but usually works |
131
|
|
|
|
|
|
|
very well, while also preventing unnecessary overlearning and slowing down |
132
|
|
|
|
|
|
|
database growth. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=cut |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
push (@cmds, { |
137
|
|
|
|
|
|
|
setting => 'bayes_auto_learn_on_error', |
138
|
|
|
|
|
|
|
default => 0, |
139
|
63
|
|
|
|
|
243
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL |
140
|
|
|
|
|
|
|
}); |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
$conf->{parser}->register_commands(\@cmds); |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
63
|
|
|
|
|
286
|
my ($self, $params) = @_; |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
my $scan = $params->{permsgstatus}; |
148
|
|
|
|
|
|
|
my $conf = $scan->{conf}; |
149
|
12
|
|
|
12
|
1
|
37
|
|
150
|
|
|
|
|
|
|
# Figure out min/max for autolearning. |
151
|
12
|
|
|
|
|
28
|
# Default to specified auto_learn_threshold settings |
152
|
12
|
|
|
|
|
26
|
my $min = $conf->{bayes_auto_learn_threshold_nonspam}; |
153
|
|
|
|
|
|
|
my $max = $conf->{bayes_auto_learn_threshold_spam}; |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Find out what score we should consider this message to have ... |
156
|
12
|
|
|
|
|
43
|
my $score = $scan->get_autolearn_points(); |
157
|
12
|
|
|
|
|
26
|
my $body_only_points = $scan->get_body_only_points(); |
158
|
|
|
|
|
|
|
my $head_only_points = $scan->get_head_only_points(); |
159
|
|
|
|
|
|
|
my $learned_points = $scan->get_learned_points(); |
160
|
12
|
|
|
|
|
62
|
|
161
|
12
|
|
|
|
|
51
|
# find out if any of the tests added an autolearn_force status |
162
|
12
|
|
|
|
|
57
|
my $force_autolearn = $scan->get_autolearn_force_status(); |
163
|
12
|
|
|
|
|
58
|
my $force_autolearn_names = $scan->get_autolearn_force_names(); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
dbg("learn: auto-learn? ham=$min, spam=$max, ". |
166
|
12
|
|
|
|
|
47
|
"body-points=".$body_only_points.", ". |
167
|
12
|
|
|
|
|
44
|
"head-points=".$head_only_points.", ". |
168
|
|
|
|
|
|
|
"learned-points=".$learned_points); |
169
|
12
|
|
|
|
|
164
|
|
170
|
|
|
|
|
|
|
my $isspam; |
171
|
|
|
|
|
|
|
if ($score < $min) { |
172
|
|
|
|
|
|
|
$isspam = 0; |
173
|
|
|
|
|
|
|
} elsif ($score >= $max) { |
174
|
12
|
|
|
|
|
21
|
$isspam = 1; |
175
|
12
|
100
|
|
|
|
63
|
} else { |
|
|
50
|
|
|
|
|
|
176
|
3
|
|
|
|
|
11
|
dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam"); |
177
|
|
|
|
|
|
|
return; |
178
|
0
|
|
|
|
|
0
|
} |
179
|
|
|
|
|
|
|
|
180
|
9
|
|
|
|
|
38
|
my $learner_said_ham_points = -1.0; |
181
|
9
|
|
|
|
|
32
|
my $learner_said_spam_points = 1.0; |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
if ($isspam) { |
184
|
3
|
|
|
|
|
7
|
my $required_body_points = 3; |
185
|
3
|
|
|
|
|
6
|
my $required_head_points = 3; |
186
|
|
|
|
|
|
|
|
187
|
3
|
50
|
|
|
|
11
|
#Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule |
188
|
0
|
|
|
|
|
0
|
if ($force_autolearn) { |
189
|
0
|
|
|
|
|
0
|
$required_body_points = -99; |
190
|
|
|
|
|
|
|
$required_head_points = -99; |
191
|
|
|
|
|
|
|
dbg("learn: auto-learn: autolearn_force flagged for a rule. Removing separate body and head point threshold. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)"); |
192
|
0
|
0
|
|
|
|
0
|
dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names"); |
193
|
0
|
|
|
|
|
0
|
} else { |
194
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)"); |
195
|
0
|
|
|
|
|
0
|
} |
196
|
0
|
|
|
|
|
0
|
|
197
|
|
|
|
|
|
|
if ($body_only_points < $required_body_points) { |
198
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as spam but too few body points (". |
199
|
|
|
|
|
|
|
$body_only_points." < ".$required_body_points.")"); |
200
|
|
|
|
|
|
|
return; |
201
|
0
|
0
|
|
|
|
0
|
} |
202
|
0
|
|
|
|
|
0
|
if ($head_only_points < $required_head_points) { |
203
|
|
|
|
|
|
|
dbg("learn: auto-learn? no: scored as spam but too few head points (". |
204
|
0
|
|
|
|
|
0
|
$head_only_points." < ".$required_head_points.")"); |
205
|
|
|
|
|
|
|
return; |
206
|
0
|
0
|
|
|
|
0
|
} |
207
|
0
|
|
|
|
|
0
|
if ($learned_points < $learner_said_ham_points) { |
208
|
|
|
|
|
|
|
dbg("learn: auto-learn? no: scored as spam but learner indicated ham (". |
209
|
0
|
|
|
|
|
0
|
$learned_points." < ".$learner_said_ham_points.")"); |
210
|
|
|
|
|
|
|
return; |
211
|
0
|
0
|
|
|
|
0
|
} |
212
|
0
|
|
|
|
|
0
|
|
213
|
|
|
|
|
|
|
if (!$scan->is_spam()) { |
214
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam"); |
215
|
|
|
|
|
|
|
return; |
216
|
|
|
|
|
|
|
} |
217
|
0
|
0
|
|
|
|
0
|
|
218
|
0
|
|
|
|
|
0
|
} else { |
219
|
0
|
|
|
|
|
0
|
if ($learned_points > $learner_said_spam_points) { |
220
|
|
|
|
|
|
|
dbg("learn: auto-learn? no: scored as ham but learner indicated spam (". |
221
|
|
|
|
|
|
|
$learned_points." > ".$learner_said_spam_points.")"); |
222
|
|
|
|
|
|
|
return; |
223
|
3
|
50
|
|
|
|
12
|
} |
224
|
0
|
|
|
|
|
0
|
|
225
|
|
|
|
|
|
|
if ($scan->is_spam()) { |
226
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham"); |
227
|
|
|
|
|
|
|
return; |
228
|
|
|
|
|
|
|
} |
229
|
3
|
100
|
|
|
|
10
|
} |
230
|
1
|
|
|
|
|
5
|
|
231
|
1
|
|
|
|
|
4
|
if ($conf->{bayes_auto_learn_on_error}) { |
232
|
|
|
|
|
|
|
# learn-on-error strategy chosen: |
233
|
|
|
|
|
|
|
# only allow learning if the autolearning classifier was unsure or |
234
|
|
|
|
|
|
|
# had a different opinion from what we are trying to make it learn |
235
|
2
|
50
|
|
|
|
8
|
# |
236
|
|
|
|
|
|
|
my $tests = $scan->get_tag('TESTS'); |
237
|
|
|
|
|
|
|
if (defined $tests && $tests ne 'none') { |
238
|
|
|
|
|
|
|
my %t = map { ($_,1) } split(/,/, $tests); |
239
|
|
|
|
|
|
|
if ($isspam && $t{'BAYES_99'} || !$isspam && $t{'BAYES_00'}) { |
240
|
0
|
|
|
|
|
0
|
dbg("learn: auto-learn? no: learn-on-error, %s, already classified ". |
241
|
0
|
0
|
0
|
|
|
0
|
"as such", $isspam ? 'spam' : 'ham'); |
242
|
0
|
|
|
|
|
0
|
return; |
|
0
|
|
|
|
|
0
|
|
243
|
0
|
0
|
0
|
|
|
0
|
} |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
244
|
0
|
0
|
|
|
|
0
|
} |
245
|
|
|
|
|
|
|
} |
246
|
0
|
|
|
|
|
0
|
|
247
|
|
|
|
|
|
|
dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no")); |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
#Return an array reference because call_plugins only carries one return value |
250
|
|
|
|
|
|
|
return [$isspam, $force_autolearn, $force_autolearn_names]; |
251
|
2
|
50
|
|
|
|
20
|
} |
|
|
50
|
|
|
|
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
1; |
254
|
2
|
|
|
|
|
9
|
|
255
|
|
|
|
|
|
|
=back |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=cut |