File Coverage

blib/lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm

Criterion	Covered	Total	%
statement	55	85	64.7
branch	10	32	31.2
condition	1	15	6.6
subroutine	8	8	100.0
pod	2	3	66.6
total	76	143	53.1

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning
21
22							=head1 SYNOPSIS
23
24							loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold
25
26							=head1 DESCRIPTION
27
28							This plugin implements the threshold-based auto-learning discriminator
29							for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism
30							whereby high-scoring mails (or low-scoring mails, for non-spam) are fed
31							into its learning systems without user intervention, during scanning.
32
33							Note that certain tests are ignored when determining whether a message
34							should be trained upon:
35
36							=over 4
37
38							=item * rules with tflags set to 'learn' (the Bayesian rules)
39
40							=item * rules with tflags set to 'userconf' (user configuration)
41
42							=item * rules with tflags set to 'noautolearn'
43
44							=back
45
46							Also note that auto-learning occurs using scores from either scoreset 0
47							or 1, depending on what scoreset is used during message check. It is
48							likely that the message check and auto-learn scores will be different.
49
50							=cut
51
52
53							use Mail::SpamAssassin::Plugin;
54	22			22		159	use Mail::SpamAssassin::Logger;
	22					57
	22					757
55	22			22		126	use strict;
	22					56
	22					1400
56	22			22		172	use warnings;
	22					66
	22					585
57	22			22		118	# use bytes;
	22					65
	22					882
58							use re 'taint';
59	22			22		132
	22					54
	22					16741
60							our @ISA = qw(Mail::SpamAssassin::Plugin);
61
62							my $class = shift;
63							my $mailsaobject = shift;
64	63			63	1	195
65	63					139	$class = ref($class) \|\| $class;
66							my $self = $class->SUPER::new($mailsaobject);
67	63		33			389	bless ($self, $class);
68	63					327
69	63					168	$self->set_config($mailsaobject->{conf});
70
71	63					281	return $self;
72							}
73	63					532
74							my($self, $conf) = @_;
75							my @cmds;
76
77	63			63	0	182	=head1 USER OPTIONS
78	63					117
79							The following configuration settings are used to control auto-learning:
80
81							=over 4
82
83							=item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1)
84
85							The score threshold below which a mail has to score, to be fed into
86							SpamAssassin's learning systems automatically as a non-spam message.
87
88							=cut
89
90							push (@cmds, {
91							setting => 'bayes_auto_learn_threshold_nonspam',
92							default => 0.1,
93	63					330	type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
94							});
95
96							=item bayes_auto_learn_threshold_spam n.nn (default: 12.0)
97
98							The score threshold above which a mail has to score, to be fed into
99							SpamAssassin's learning systems automatically as a spam message.
100
101							Note: SpamAssassin requires at least 3 points from the header, and 3
102							points from the body to auto-learn as spam. Therefore, the minimum
103							working value for this option is 6.
104
105							If the test option autolearn_force is set, the minimum value will
106							remain at 6 points but there is no requirement that the points come
107							from body and header rules. This option is useful for autolearning
108							with rules that are considered to be extremely safe indicators of
109							the spaminess of a message.
110
111							=cut
112
113							push (@cmds, {
114							setting => 'bayes_auto_learn_threshold_spam',
115							default => 12.0,
116	63					274	type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
117							});
118
119							=item bayes_auto_learn_on_error (0 | 1) (default: 0)
120
121							With C<bayes_auto_learn_on_error> off, autolearning will be performed
122							even if the Bayes classifier already agrees with the new classification (i.e.
123							yielded BAYES_00 for what we are now trying to teach it as ham, or yielded
124							BAYES_99 for spam). This is the traditional setting; the default was chosen
125							to retain backward compatibility.
126
127							With C<bayes_auto_learn_on_error> turned on, autolearning will be performed
128							only when a bayes classifier had a different opinion from what the autolearner
129							is now trying to teach it (i.e. it made an error in judgement). This strategy
130							may or may not produce better future classifications, but usually works
131							very well, while also preventing unnecessary overlearning and slowing down
132							database growth.
133
134							=cut
135
136							push (@cmds, {
137							setting => 'bayes_auto_learn_on_error',
138							default => 0,
139	63					243	type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
140							});
141
142							$conf->{parser}->register_commands(\@cmds);
143							}
144
145	63					286	my ($self, $params) = @_;
146
147							my $scan = $params->{permsgstatus};
148							my $conf = $scan->{conf};
149	12			12	1	37
150							# Figure out min/max for autolearning.
151	12					28	# Default to specified auto_learn_threshold settings
152	12					26	my $min = $conf->{bayes_auto_learn_threshold_nonspam};
153							my $max = $conf->{bayes_auto_learn_threshold_spam};
154
155							# Find out what score we should consider this message to have ...
156	12					43	my $score = $scan->get_autolearn_points();
157	12					26	my $body_only_points = $scan->get_body_only_points();
158							my $head_only_points = $scan->get_head_only_points();
159							my $learned_points = $scan->get_learned_points();
160	12					62
161	12					51	# find out if any of the tests added an autolearn_force status
162	12					57	my $force_autolearn = $scan->get_autolearn_force_status();
163	12					58	my $force_autolearn_names = $scan->get_autolearn_force_names();
164
165							dbg("learn: auto-learn? ham=$min, spam=$max, ".
166	12					47	"body-points=".$body_only_points.", ".
167	12					44	"head-points=".$head_only_points.", ".
168							"learned-points=".$learned_points);
169	12					164
170							my $isspam;
171							if ($score < $min) {
172							$isspam = 0;
173							} elsif ($score >= $max) {
174	12					21	$isspam = 1;
175	12	100				63	} else {
		50
176	3					11	dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam");
177							return;
178	0					0	}
179
180	9					38	my $learner_said_ham_points = -1.0;
181	9					32	my $learner_said_spam_points = 1.0;
182
183							if ($isspam) {
184	3					7	my $required_body_points = 3;
185	3					6	my $required_head_points = 3;
186
187	3	50				11	#Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule
188	0					0	if ($force_autolearn) {
189	0					0	$required_body_points = -99;
190							$required_head_points = -99;
191							dbg("learn: auto-learn: autolearn_force flagged for a rule. Removing separate body and head point threshold. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
192	0	0				0	dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names");
193	0					0	} else {
194	0					0	dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
195	0					0	}
196	0					0
197							if ($body_only_points < $required_body_points) {
198	0					0	dbg("learn: auto-learn? no: scored as spam but too few body points (".
199							$body_only_points." < ".$required_body_points.")");
200							return;
201	0	0				0	}
202	0					0	if ($head_only_points < $required_head_points) {
203							dbg("learn: auto-learn? no: scored as spam but too few head points (".
204	0					0	$head_only_points." < ".$required_head_points.")");
205							return;
206	0	0				0	}
207	0					0	if ($learned_points < $learner_said_ham_points) {
208							dbg("learn: auto-learn? no: scored as spam but learner indicated ham (".
209	0					0	$learned_points." < ".$learner_said_ham_points.")");
210							return;
211	0	0				0	}
212	0					0
213							if (!$scan->is_spam()) {
214	0					0	dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam");
215							return;
216							}
217	0	0				0
218	0					0	} else {
219	0					0	if ($learned_points > $learner_said_spam_points) {
220							dbg("learn: auto-learn? no: scored as ham but learner indicated spam (".
221							$learned_points." > ".$learner_said_spam_points.")");
222							return;
223	3	50				12	}
224	0					0
225							if ($scan->is_spam()) {
226	0					0	dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham");
227							return;
228							}
229	3	100				10	}
230	1					5
231	1					4	if ($conf->{bayes_auto_learn_on_error}) {
232							# learn-on-error strategy chosen:
233							# only allow learning if the autolearning classifier was unsure or
234							# had a different opinion from what we are trying to make it learn
235	2	50				8	#
236							my $tests = $scan->get_tag('TESTS');
237							if (defined $tests && $tests ne 'none') {
238							my %t = map { ($_,1) } split(/,/, $tests);
239							if ($isspam && $t{'BAYES_99'} \|\| !$isspam && $t{'BAYES_00'}) {
240	0					0	dbg("learn: auto-learn? no: learn-on-error, %s, already classified ".
241	0	0	0			0	"as such", $isspam ? 'spam' : 'ham');
242	0					0	return;
	0					0
243	0	0	0			0	}
			0
			0
244	0	0				0	}
245							}
246	0					0
247							dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no"));
248
249							#Return an array reference because call_plugins only carries one return value
250							return [$isspam, $force_autolearn, $force_autolearn_names];
251	2	50				20	}
		50
252
253							1;
254	2					9
255							=back
256
257							=cut