File Coverage

blib/lib/Mail/SpamAssassin/Plugin/AutoLearnThreshold.pm
Criterion Covered Total %
statement 55 85 64.7
branch 10 32 31.2
condition 1 15 6.6
subroutine 8 8 100.0
pod 2 3 66.6
total 76 143 53.1


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning
21              
22             =head1 SYNOPSIS
23              
24             loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold
25              
26             =head1 DESCRIPTION
27              
28             This plugin implements the threshold-based auto-learning discriminator
29             for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism
30             whereby high-scoring mails (or low-scoring mails, for non-spam) are fed
31             into its learning systems without user intervention, during scanning.
32              
33             Note that certain tests are ignored when determining whether a message
34             should be trained upon:
35              
36             =over 4
37              
38             =item * rules with tflags set to 'learn' (the Bayesian rules)
39              
40             =item * rules with tflags set to 'userconf' (user configuration)
41              
42             =item * rules with tflags set to 'noautolearn'
43              
44             =back
45              
46             Also note that auto-learning occurs using scores from either scoreset 0
47             or 1, depending on what scoreset is used during message check. It is
48             likely that the message check and auto-learn scores will be different.
49              
50             =cut
51              
52             package Mail::SpamAssassin::Plugin::AutoLearnThreshold;
53              
54 21     21   172 use Mail::SpamAssassin::Plugin;
  21         53  
  21         706  
55 21     21   153 use Mail::SpamAssassin::Logger;
  21         68  
  21         1355  
56 21     21   145 use strict;
  21         99  
  21         609  
57 21     21   146 use warnings;
  21         46  
  21         919  
58             # use bytes;
59 21     21   179 use re 'taint';
  21         45  
  21         17977  
60              
61             our @ISA = qw(Mail::SpamAssassin::Plugin);
62              
63             sub new {
               # Constructor. Receives the class (or an existing instance) and the
               # main SpamAssassin object, builds the plugin through the parent
               # class listed in @ISA, registers this plugin's settings, and
               # returns the new plugin object.
64 62     62 1 235 my $class = shift;
65 62         177 my $mailsaobject = shift;
66              
               # Accept either a class name or an object as the invocant.
67 62   33     475 $class = ref($class) || $class;
68 62         402 my $self = $class->SUPER::new($mailsaobject);
69 62         218 bless ($self, $class);
70              
               # Register the configuration commands on the conf object carried by
               # the main object.
71 62         361 $self->set_config($mailsaobject->{conf});
72              
73 62         644 return $self;
74             }
75              
76             sub set_config {
               # Builds the list of configuration commands this plugin understands
               # (two numeric thresholds and one boolean switch) and registers them
               # with the parser found in $conf->{parser}. The POD interleaved with
               # the code below documents each setting next to its definition.
77 62     62 0 206 my($self, $conf) = @_;
78 62         142 my @cmds;
79              
80             =head1 USER OPTIONS
81              
82             The following configuration settings are used to control auto-learning:
83              
84             =over 4
85              
86             =item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1)
87              
88             The score threshold below which a mail has to score, to be fed into
89             SpamAssassin's learning systems automatically as a non-spam message.
90              
91             =cut
92              
93 62         375 push (@cmds, {
94             setting => 'bayes_auto_learn_threshold_nonspam',
95             default => 0.1,
96             type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
97             });
98              
99             =item bayes_auto_learn_threshold_spam n.nn (default: 12.0)
100              
101             The score threshold above which a mail has to score, to be fed into
102             SpamAssassin's learning systems automatically as a spam message.
103              
104             Note: SpamAssassin requires at least 3 points from the header, and 3
105             points from the body to auto-learn as spam. Therefore, the minimum
106             working value for this option is 6.
107              
108             If the test option autolearn_force is set, the minimum value will
109             remain at 6 points but there is no requirement that the points come
110             from body and header rules. This option is useful for autolearning
111             with rules that are considered to be extremely safe indicators of
112             the spaminess of a message.
113              
114             =cut
115              
116 62         307 push (@cmds, {
117             setting => 'bayes_auto_learn_threshold_spam',
118             default => 12.0,
119             type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
120             });
121              
122             =item bayes_auto_learn_on_error (0 | 1) (default: 0)
123              
124             With C<bayes_auto_learn_on_error> off, autolearning will be performed
125             even if bayes classifier already agrees with the new classification (i.e.
126             yielded BAYES_00 for what we are now trying to teach it as ham, or yielded
127             BAYES_99 for spam). This is a traditional setting, the default was chosen
128             to retain backward compatibility.
129              
130             With C<bayes_auto_learn_on_error> turned on, autolearning will be performed
131             only when a bayes classifier had a different opinion from what the autolearner
132             is now trying to teach it (i.e. it made an error in judgement). This strategy
133             may or may not produce better future classifications, but usually works
134             very well, while also preventing unnecessary overlearning and slows down
135             database growth.
136              
137             =cut
138              
139 62         251 push (@cmds, {
140             setting => 'bayes_auto_learn_on_error',
141             default => 0,
142             type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
143             });
144              
                # Hand all three command definitions to the configuration parser.
145 62         316 $conf->{parser}->register_commands(\@cmds);
146             }
147              
148             sub autolearn_discriminator {
                # Decides whether the scanned message should be auto-learned, and
                # if so whether as spam or as ham.
                #
                # Parameters:
                #   $params->{permsgstatus} - the per-message scan object; its
                #     {conf} member supplies the thresholds and the
                #     bayes_auto_learn_on_error switch.
                #
                # Returns:
                #   nothing (bare return) when the message must not be learned;
                #   otherwise an array reference
                #   [$isspam, $force_autolearn, $force_autolearn_names].
149 12     12 1 36 my ($self, $params) = @_;
150              
151 12         30 my $scan = $params->{permsgstatus};
152 12         29 my $conf = $scan->{conf};
153              
154             # Figure out min/max for autolearning.
155             # Default to specified auto_learn_threshold settings
156 12         34 my $min = $conf->{bayes_auto_learn_threshold_nonspam};
157 12         38 my $max = $conf->{bayes_auto_learn_threshold_spam};
158              
159             # Find out what score we should consider this message to have ...
160 12         60 my $score = $scan->get_autolearn_points();
161 12         67 my $body_only_points = $scan->get_body_only_points();
162 12         59 my $head_only_points = $scan->get_head_only_points();
163 12         55 my $learned_points = $scan->get_learned_points();
164              
165             # find out if any of the tests added an autolearn_force status
166 12         54 my $force_autolearn = $scan->get_autolearn_force_status();
167 12         55 my $force_autolearn_names = $scan->get_autolearn_force_names();
168              
169 12         176 dbg("learn: auto-learn? ham=$min, spam=$max, ".
170             "body-points=".$body_only_points.", ".
171             "head-points=".$head_only_points.", ".
172             "learned-points=".$learned_points);
173              
                # Classify by threshold: below $min is a ham candidate, at or above
                # $max is a spam candidate, anything in between is never learned.
174 12         25 my $isspam;
175 12 100       62 if ($score < $min) {
    50          
176 3         9 $isspam = 0;
177             } elsif ($score >= $max) {
178 0         0 $isspam = 1;
179             } else {
180 9         34 dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam");
181 9         81 return;
182             }
183              
                # Limits on the learned-points value used below: a spam candidate is
                # rejected when learned points fall below -1.0, a ham candidate when
                # they exceed 1.0.
184 3         5 my $learner_said_ham_points = -1.0;
185 3         6 my $learner_said_spam_points = 1.0;
186              
187 3 50       9 if ($isspam) {
188 0         0 my $required_body_points = 3;
189 0         0 my $required_head_points = 3;
190              
191             #Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule
192 0 0       0 if ($force_autolearn) {
                # -99 makes the separate body/head minimums below effectively
                # always satisfied.
193 0         0 $required_body_points = -99;
194 0         0 $required_head_points = -99;
195 0         0 dbg("learn: auto-learn: autolearn_force flagged for a rule. Removing separate body and head point threshold. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
196 0         0 dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names");
197             } else {
198 0         0 dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)");
199             }
200              
201 0 0       0 if ($body_only_points < $required_body_points) {
202 0         0 dbg("learn: auto-learn? no: scored as spam but too few body points (".
203             $body_only_points." < ".$required_body_points.")");
204 0         0 return;
205             }
206 0 0       0 if ($head_only_points < $required_head_points) {
207 0         0 dbg("learn: auto-learn? no: scored as spam but too few head points (".
208             $head_only_points." < ".$required_head_points.")");
209 0         0 return;
210             }
211 0 0       0 if ($learned_points < $learner_said_ham_points) {
212 0         0 dbg("learn: auto-learn? no: scored as spam but learner indicated ham (".
213             $learned_points." < ".$learner_said_ham_points.")");
214 0         0 return;
215             }
216              
                # The scan's own verdict must agree with the spam candidate status.
217 0 0       0 if (!$scan->is_spam()) {
218 0         0 dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam");
219 0         0 return;
220             }
221              
222             } else {
223 3 50       12 if ($learned_points > $learner_said_spam_points) {
224 0         0 dbg("learn: auto-learn? no: scored as ham but learner indicated spam (".
225             $learned_points." > ".$learner_said_spam_points.")");
226 0         0 return;
227             }
228              
                # The scan's own verdict must agree with the ham candidate status.
229 3 100       11 if ($scan->is_spam()) {
230 1         6 dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham");
231 1         4 return;
232             }
233             }
234              
235 2 50       9 if ($conf->{bayes_auto_learn_on_error}) {
236             # learn-on-error strategy chosen:
237             # only allow learning if the autolearning classifier was unsure or
238             # had a different opinion from what we are trying to make it learn
239             #
240 0         0 my $tests = $scan->get_tag('TESTS');
241 0 0 0     0 if (defined $tests && $tests ne 'none') {
                # Turn the comma-separated TESTS tag into a lookup set of names.
242 0         0 my %t = map { ($_,1) } split(/,/, $tests);
  0         0  
                # Skip learning when the message already hit the matching extreme
                # rule: BAYES_99 for a spam candidate, BAYES_00 for a ham candidate.
243 0 0 0     0 if ($isspam && $t{'BAYES_99'} || !$isspam && $t{'BAYES_00'}) {
      0        
      0        
                # NOTE(review): assumes dbg() substitutes the %s from its extra
                # argument (sprintf-style) -- confirm against the Logger module.
244 0 0       0 dbg("learn: auto-learn? no: learn-on-error, %s, already classified ".
245             "as such", $isspam ? 'spam' : 'ham');
246 0         0 return;
247             }
248             }
249             }
250              
                # NOTE(review): the message below prints "$score > $max" for spam,
                # while the threshold test above is ">=".
251 2 50       24 dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no"));
    50          
252            
253             #Return an array reference because call_plugins only carries one return value
254 2         10 return [$isspam, $force_autolearn, $force_autolearn_names];
255             }
256              
257             1;
258              
259             =back
260              
261             =cut