File Coverage

blib/lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm

Criterion	Covered	Total	%
statement	55	85	64.7
branch	10	32	31.2
condition	1	15	6.6
subroutine	8	8	100.0
pod	2	3	66.6
total	76	143	53.1

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning
21
22							=head1 SYNOPSIS
23
24							loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold
25
26							=head1 DESCRIPTION
27
28							This plugin implements the threshold-based auto-learning discriminator
29							for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism
30							whereby high-scoring mails (or low-scoring mails, for non-spam) are fed
31							into its learning systems without user intervention, during scanning.
32
33							Note that certain tests are ignored when determining whether a message
34							should be trained upon:
35
36							=over 4
37
38							=item * rules with tflags set to 'learn' (the Bayesian rules)
39
40							=item * rules with tflags set to 'userconf' (user configuration)
41
42							=item * rules with tflags set to 'noautolearn'
43
44							=back
45
46							Also note that auto-learning occurs using scores from either scoreset 0
47							or 1, depending on what scoreset is used during message check. It is
48							likely that the message check and auto-learn scores will be different.
49
50							=cut
51
52							package Mail::SpamAssassin::Plugin::AutoLearnThreshold;
53
54	22			22		166	use Mail::SpamAssassin::Plugin;
	22					56
	22					725
55	22			22		126	use Mail::SpamAssassin::Logger;
	22					70
	22					1475
56	22			22		180	use strict;
	22					56
	22					569
57	22			22		122	use warnings;
	22					76
	22					865
58							# use bytes;
59	22			22		161	use re 'taint';
	22					47
	22					18062
60
61							our @ISA = qw(Mail::SpamAssassin::Plugin);
62
63							sub new {
64	63			63	1	222	my $class = shift;
65	63					153	my $mailsaobject = shift;
66
67	63		33			405	$class = ref($class) \|\| $class;
68	63					358	my $self = $class->SUPER::new($mailsaobject);
69	63					167	bless ($self, $class);
70
71	63					344	$self->set_config($mailsaobject->{conf});
72
73	63					574	return $self;
74							}
75
76							sub set_config {
77	63			63	0	189	my($self, $conf) = @_;
78	63					159	my @cmds;
79
80							=head1 USER OPTIONS
81
82							The following configuration settings are used to control auto-learning:
83
84							=over 4
85
86							=item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1)
87
88							The score threshold below which a mail has to score, to be fed into
89							SpamAssassin's learning systems automatically as a non-spam message.
90
91							=cut
92
93	63					374	push (@cmds, {
94							setting => 'bayes_auto_learn_threshold_nonspam',
95							default => 0.1,
96							type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
97							});
98
99							=item bayes_auto_learn_threshold_spam n.nn (default: 12.0)
100
101							The score threshold above which a mail has to score, to be fed into
102							SpamAssassin's learning systems automatically as a spam message.
103
104							Note: SpamAssassin requires at least 3 points from the header, and 3
105							points from the body to auto-learn as spam. Therefore, the minimum
106							working value for this option is 6.
107
108							If the test option autolearn_force is set, the minimum value will
109							remain at 6 points but there is no requirement that the points come
110							from body and header rules. This option is useful for autolearning
111							with rules that are considered to be extremely safe indicators of
112							the spaminess of a message.
113
114							=cut
115
116	63					358	push (@cmds, {
117							setting => 'bayes_auto_learn_threshold_spam',
118							default => 12.0,
119							type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
120							});
121
122							=item bayes_auto_learn_on_error (0 | 1) (default: 0)
123
124							With C<bayes_auto_learn_on_error> off, autolearning will be performed
125							even if the bayes classifier already agrees with the new classification (i.e.
126							yielded BAYES_00 for what we are now trying to teach it as ham, or yielded
127							BAYES_99 for spam). This is a traditional setting; the default was chosen
128							to retain backward compatibility.
129
130							With C<bayes_auto_learn_on_error> turned on, autolearning will be performed
131							only when a bayes classifier had a different opinion from what the autolearner
132							is now trying to teach it (i.e. it made an error in judgement). This strategy
133							may or may not produce better future classifications, but usually works
134							very well, while also preventing unnecessary overlearning and slowing down
135							database growth.
136
137							=cut
138
139	63					279	push (@cmds, {
140							setting => 'bayes_auto_learn_on_error',
141							default => 0,
142							type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
143							});
144
145	63					322	$conf->{parser}->register_commands(\@cmds);
146							}
147
148							sub autolearn_discriminator {
149	12			12	1	35	my ($self, $params) = @_;
150
151	12					37	my $scan = $params->{permsgstatus};
152	12					36	my $conf = $scan->{conf};
153
154							# Figure out min/max for autolearning.
155							# Default to specified auto_learn_threshold settings
156	12					32	my $min = $conf->{bayes_auto_learn_threshold_nonspam};
157	12					27	my $max = $conf->{bayes_auto_learn_threshold_spam};
158
159							# Find out what score we should consider this message to have ...
160	12					49	my $score = $scan->get_autolearn_points();
161	12					73	my $body_only_points = $scan->get_body_only_points();
162	12					55	my $head_only_points = $scan->get_head_only_points();
163	12					62	my $learned_points = $scan->get_learned_points();
164
165							# find out if any of the tests added an autolearn_force status
166	12					46	my $force_autolearn = $scan->get_autolearn_force_status();
167	12					44	my $force_autolearn_names = $scan->get_autolearn_force_names();
168
169	12					139	dbg("learn: auto-learn? ham=$min, spam=$max, ".
170							"body-points=".$body_only_points.", ".
171							"head-points=".$head_only_points.", ".
172							"learned-points=".$learned_points);
173
174	12					20	my $isspam;
175	12	100				64	if ($score < $min) {
		50
176	3					8	$isspam = 0;
177							} elsif ($score >= $max) {
178	0					0	$isspam = 1;
179							} else {
180	9					57	dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam");
181	9					32	return;
182							}
183
184	3					5	my $learner_said_ham_points = -1.0;
185	3					4	my $learner_said_spam_points = 1.0;
186
187	3	50				8	if ($isspam) {
188	0					0	my $required_body_points = 3;
189	0					0	my $required_head_points = 3;
190
191							#Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule
192	0	0				0	if ($force_autolearn) {
193	0					0	$required_body_points = -99;
194	0					0	$required_head_points = -99;
195	0					0	dbg("learn: auto-learn: autolearn_force flagged for a rule. Removing seperate body and head point threshold. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
196	0					0	dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names");
197							} else {
198	0					0	dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
199							}
200
201	0	0				0	if ($body_only_points < $required_body_points) {
202	0					0	dbg("learn: auto-learn? no: scored as spam but too few body points (".
203							$body_only_points." < ".$required_body_points.")");
204	0					0	return;
205							}
206	0	0				0	if ($head_only_points < $required_head_points) {
207	0					0	dbg("learn: auto-learn? no: scored as spam but too few head points (".
208							$head_only_points." < ".$required_head_points.")");
209	0					0	return;
210							}
211	0	0				0	if ($learned_points < $learner_said_ham_points) {
212	0					0	dbg("learn: auto-learn? no: scored as spam but learner indicated ham (".
213							$learned_points." < ".$learner_said_ham_points.")");
214	0					0	return;
215							}
216
217	0	0				0	if (!$scan->is_spam()) {
218	0					0	dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam");
219	0					0	return;
220							}
221
222							} else {
223	3	50				8	if ($learned_points > $learner_said_spam_points) {
224	0					0	dbg("learn: auto-learn? no: scored as ham but learner indicated spam (".
225							$learned_points." > ".$learner_said_spam_points.")");
226	0					0	return;
227							}
228
229	3	100				8	if ($scan->is_spam()) {
230	1					3	dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham");
231	1					2	return;
232							}
233							}
234
235	2	50				9	if ($conf->{bayes_auto_learn_on_error}) {
236							# learn-on-error strategy chosen:
237							# only allow learning if the autolearning classifier was unsure or
238							# had a different opinion from what we are trying to make it learn
239							#
240	0					0	my $tests = $scan->get_tag('TESTS');
241	0	0	0			0	if (defined $tests && $tests ne 'none') {
242	0					0	my %t = map { ($_,1) } split(/,/, $tests);
	0					0
243	0	0	0			0	if ($isspam && $t{'BAYES_99'} \|\| !$isspam && $t{'BAYES_00'}) {
			0
			0
244	0	0				0	dbg("learn: auto-learn? no: learn-on-error, %s, already classified ".
245							"as such", $isspam ? 'spam' : 'ham');
246	0					0	return;
247							}
248							}
249							}
250
251	2	50				22	dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no"));
		50
252
253							#Return an array reference because call_plugins only carries one return value
254	2					8	return [$isspam, $force_autolearn, $force_autolearn_names];
255							}
256
257							1;
258
259							=back
260
261							=cut