| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | # <@LICENSE> | 
| 2 |  |  |  |  |  |  | # Licensed to the Apache Software Foundation (ASF) under one or more | 
| 3 |  |  |  |  |  |  | # contributor license agreements.  See the NOTICE file distributed with | 
| 4 |  |  |  |  |  |  | # this work for additional information regarding copyright ownership. | 
| 5 |  |  |  |  |  |  | # The ASF licenses this file to you under the Apache License, Version 2.0 | 
| 6 |  |  |  |  |  |  | # (the "License"); you may not use this file except in compliance with | 
| 7 |  |  |  |  |  |  | # the License.  You may obtain a copy of the License at: | 
| 8 |  |  |  |  |  |  | # | 
| 9 |  |  |  |  |  |  | #     http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  |  |  |  |  |  | # | 
| 11 |  |  |  |  |  |  | # Unless required by applicable law or agreed to in writing, software | 
| 12 |  |  |  |  |  |  | # distributed under the License is distributed on an "AS IS" BASIS, | 
| 13 |  |  |  |  |  |  | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
| 14 |  |  |  |  |  |  | # See the License for the specific language governing permissions and | 
| 15 |  |  |  |  |  |  | # limitations under the License. | 
| 16 |  |  |  |  |  |  | # </@LICENSE> | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | =head1 NAME | 
| 19 |  |  |  |  |  |  |  | 
| 20 |  |  |  |  |  |  | Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | loadplugin     Mail::SpamAssassin::Plugin::AutoLearnThreshold | 
| 25 |  |  |  |  |  |  |  | 
| 26 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | This plugin implements the threshold-based auto-learning discriminator | 
| 29 |  |  |  |  |  |  | for SpamAssassin's Bayes subsystem.  Auto-learning is a mechanism | 
| 30 |  |  |  |  |  |  | whereby high-scoring mails (or low-scoring mails, for non-spam) are fed | 
| 31 |  |  |  |  |  |  | into its learning systems without user intervention, during scanning. | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | Note that certain tests are ignored when determining whether a message | 
| 34 |  |  |  |  |  |  | should be trained upon: | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | =over 4 | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | =item * rules with tflags set to 'learn' (the Bayesian rules) | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | =item * rules with tflags set to 'userconf' (user configuration) | 
| 41 |  |  |  |  |  |  |  | 
| 42 |  |  |  |  |  |  | =item * rules with tflags set to 'noautolearn' | 
| 43 |  |  |  |  |  |  |  | 
| 44 |  |  |  |  |  |  | =back | 
| 45 |  |  |  |  |  |  |  | 
| 46 |  |  |  |  |  |  | Also note that auto-learning occurs using scores from either scoreset 0 | 
| 47 |  |  |  |  |  |  | or 1, depending on what scoreset is used during message check.  It is | 
| 48 |  |  |  |  |  |  | likely that the message check and auto-learn scores will be different. | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | =cut | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | package Mail::SpamAssassin::Plugin::AutoLearnThreshold; | 
| 53 |  |  |  |  |  |  |  | 
| 54 | 21 |  |  | 21 |  | 172 | use Mail::SpamAssassin::Plugin; | 
|  | 21 |  |  |  |  | 53 |  | 
|  | 21 |  |  |  |  | 706 |  | 
| 55 | 21 |  |  | 21 |  | 153 | use Mail::SpamAssassin::Logger; | 
|  | 21 |  |  |  |  | 68 |  | 
|  | 21 |  |  |  |  | 1355 |  | 
| 56 | 21 |  |  | 21 |  | 145 | use strict; | 
|  | 21 |  |  |  |  | 99 |  | 
|  | 21 |  |  |  |  | 609 |  | 
| 57 | 21 |  |  | 21 |  | 146 | use warnings; | 
|  | 21 |  |  |  |  | 46 |  | 
|  | 21 |  |  |  |  | 919 |  | 
| 58 |  |  |  |  |  |  | # use bytes; | 
| 59 | 21 |  |  | 21 |  | 179 | use re 'taint'; | 
|  | 21 |  |  |  |  | 45 |  | 
|  | 21 |  |  |  |  | 17977 |  | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | our @ISA = qw(Mail::SpamAssassin::Plugin); | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | sub new { | 
| 64 | 62 |  |  | 62 | 1 | 235 | my $class = shift; | 
| 65 | 62 |  |  |  |  | 177 | my $mailsaobject = shift; | 
| 66 |  |  |  |  |  |  |  | 
| 67 | 62 |  | 33 |  |  | 475 | $class = ref($class) || $class; | 
| 68 | 62 |  |  |  |  | 402 | my $self = $class->SUPER::new($mailsaobject); | 
| 69 | 62 |  |  |  |  | 218 | bless ($self, $class); | 
| 70 |  |  |  |  |  |  |  | 
| 71 | 62 |  |  |  |  | 361 | $self->set_config($mailsaobject->{conf}); | 
| 72 |  |  |  |  |  |  |  | 
| 73 | 62 |  |  |  |  | 644 | return $self; | 
| 74 |  |  |  |  |  |  | } | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | sub set_config { | 
| 77 | 62 |  |  | 62 | 0 | 206 | my($self, $conf) = @_; | 
| 78 | 62 |  |  |  |  | 142 | my @cmds; | 
| 79 |  |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | =head1 USER OPTIONS | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | The following configuration settings are used to control auto-learning: | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | =over 4 | 
| 85 |  |  |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | =item bayes_auto_learn_threshold_nonspam n.nn   (default: 0.1) | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | The score threshold below which a mail has to score, to be fed into | 
| 89 |  |  |  |  |  |  | SpamAssassin's learning systems automatically as a non-spam message. | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | =cut | 
| 92 |  |  |  |  |  |  |  | 
| 93 | 62 |  |  |  |  | 375 | push (@cmds, { | 
| 94 |  |  |  |  |  |  | setting => 'bayes_auto_learn_threshold_nonspam', | 
| 95 |  |  |  |  |  |  | default => 0.1, | 
| 96 |  |  |  |  |  |  | type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC | 
| 97 |  |  |  |  |  |  | }); | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | =item bayes_auto_learn_threshold_spam n.nn      (default: 12.0) | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | The score threshold above which a mail has to score, to be fed into | 
| 102 |  |  |  |  |  |  | SpamAssassin's learning systems automatically as a spam message. | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | Note: SpamAssassin requires at least 3 points from the header, and 3 | 
| 105 |  |  |  |  |  |  | points from the body to auto-learn as spam.  Therefore, the minimum | 
| 106 |  |  |  |  |  |  | working value for this option is 6. | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | If the test option autolearn_force is set, the minimum value will | 
| 109 |  |  |  |  |  |  | remain at 6 points but there is no requirement that the points come | 
| 110 |  |  |  |  |  |  | from body and header rules.  This option is useful for autolearning | 
| 111 |  |  |  |  |  |  | with rules that are considered to be extremely safe indicators of | 
| 112 |  |  |  |  |  |  | the spaminess of a message. | 
| 113 |  |  |  |  |  |  |  | 
| 114 |  |  |  |  |  |  | =cut | 
| 115 |  |  |  |  |  |  |  | 
| 116 | 62 |  |  |  |  | 307 | push (@cmds, { | 
| 117 |  |  |  |  |  |  | setting => 'bayes_auto_learn_threshold_spam', | 
| 118 |  |  |  |  |  |  | default => 12.0, | 
| 119 |  |  |  |  |  |  | type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC | 
| 120 |  |  |  |  |  |  | }); | 
| 121 |  |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | =item bayes_auto_learn_on_error (0 | 1)        (default: 0) | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | With C<bayes_auto_learn_on_error> off, autolearning will be performed | 
| 125 |  |  |  |  |  |  | even if the bayes classifier already agrees with the new classification (i.e. | 
| 126 |  |  |  |  |  |  | it yielded BAYES_00 for what we are now trying to teach it as ham, or yielded | 
| 127 |  |  |  |  |  |  | BAYES_99 for spam). This is the traditional behavior; the default was chosen | 
| 128 |  |  |  |  |  |  | to retain backward compatibility. | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | With C<bayes_auto_learn_on_error> turned on, autolearning will be performed | 
| 131 |  |  |  |  |  |  | only when the bayes classifier had a different opinion from what the autolearner | 
| 132 |  |  |  |  |  |  | is now trying to teach it (i.e. it made an error in judgement). This strategy | 
| 133 |  |  |  |  |  |  | may or may not produce better future classifications, but usually works | 
| 134 |  |  |  |  |  |  | very well, while also preventing unnecessary overlearning and slowing down | 
| 135 |  |  |  |  |  |  | database growth. | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | =cut | 
| 138 |  |  |  |  |  |  |  | 
| 139 | 62 |  |  |  |  | 251 | push (@cmds, { | 
| 140 |  |  |  |  |  |  | setting => 'bayes_auto_learn_on_error', | 
| 141 |  |  |  |  |  |  | default => 0, | 
| 142 |  |  |  |  |  |  | type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL | 
| 143 |  |  |  |  |  |  | }); | 
| 144 |  |  |  |  |  |  |  | 
| 145 | 62 |  |  |  |  | 316 | $conf->{parser}->register_commands(\@cmds); | 
| 146 |  |  |  |  |  |  | } | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | sub autolearn_discriminator { | 
| 149 | 12 |  |  | 12 | 1 | 36 | my ($self, $params) = @_; | 
| 150 |  |  |  |  |  |  |  | 
| 151 | 12 |  |  |  |  | 30 | my $scan = $params->{permsgstatus}; | 
| 152 | 12 |  |  |  |  | 29 | my $conf = $scan->{conf}; | 
| 153 |  |  |  |  |  |  |  | 
| 154 |  |  |  |  |  |  | # Figure out min/max for autolearning. | 
| 155 |  |  |  |  |  |  | # Default to specified auto_learn_threshold settings | 
| 156 | 12 |  |  |  |  | 34 | my $min = $conf->{bayes_auto_learn_threshold_nonspam}; | 
| 157 | 12 |  |  |  |  | 38 | my $max = $conf->{bayes_auto_learn_threshold_spam}; | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | # Find out what score we should consider this message to have ... | 
| 160 | 12 |  |  |  |  | 60 | my $score = $scan->get_autolearn_points(); | 
| 161 | 12 |  |  |  |  | 67 | my $body_only_points = $scan->get_body_only_points(); | 
| 162 | 12 |  |  |  |  | 59 | my $head_only_points = $scan->get_head_only_points(); | 
| 163 | 12 |  |  |  |  | 55 | my $learned_points = $scan->get_learned_points(); | 
| 164 |  |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | # find out if any of the tests added an autolearn_force status | 
| 166 | 12 |  |  |  |  | 54 | my $force_autolearn = $scan->get_autolearn_force_status(); | 
| 167 | 12 |  |  |  |  | 55 | my $force_autolearn_names = $scan->get_autolearn_force_names(); | 
| 168 |  |  |  |  |  |  |  | 
| 169 | 12 |  |  |  |  | 176 | dbg("learn: auto-learn? ham=$min, spam=$max, ". | 
| 170 |  |  |  |  |  |  | "body-points=".$body_only_points.", ". | 
| 171 |  |  |  |  |  |  | "head-points=".$head_only_points.", ". | 
| 172 |  |  |  |  |  |  | "learned-points=".$learned_points); | 
| 173 |  |  |  |  |  |  |  | 
| 174 | 12 |  |  |  |  | 25 | my $isspam; | 
| 175 | 12 | 100 |  |  |  | 62 | if ($score < $min) { | 
|  |  | 50 |  |  |  |  |  | 
| 176 | 3 |  |  |  |  | 9 | $isspam = 0; | 
| 177 |  |  |  |  |  |  | } elsif ($score >= $max) { | 
| 178 | 0 |  |  |  |  | 0 | $isspam = 1; | 
| 179 |  |  |  |  |  |  | } else { | 
| 180 | 9 |  |  |  |  | 34 | dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam"); | 
| 181 | 9 |  |  |  |  | 81 | return; | 
| 182 |  |  |  |  |  |  | } | 
| 183 |  |  |  |  |  |  |  | 
| 184 | 3 |  |  |  |  | 5 | my $learner_said_ham_points = -1.0; | 
| 185 | 3 |  |  |  |  | 6 | my $learner_said_spam_points = 1.0; | 
| 186 |  |  |  |  |  |  |  | 
| 187 | 3 | 50 |  |  |  | 9 | if ($isspam) { | 
| 188 | 0 |  |  |  |  | 0 | my $required_body_points = 3; | 
| 189 | 0 |  |  |  |  | 0 | my $required_head_points = 3; | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | #Set a lower threshold of "just has to be spam" if autolearn_force was set on a rule | 
| 192 | 0 | 0 |  |  |  | 0 | if ($force_autolearn) { | 
| 193 | 0 |  |  |  |  | 0 | $required_body_points = -99; | 
| 194 | 0 |  |  |  |  | 0 | $required_head_points = -99; | 
| 195 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn: autolearn_force flagged for a rule.  Removing separate body and head point threshold.  Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)"); | 
| 196 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn: autolearn_force flagged because of rule(s): $force_autolearn_names"); | 
| 197 |  |  |  |  |  |  | } else { | 
| 198 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn: autolearn_force not flagged for a rule. Body Only Points: $body_only_points ($required_body_points req'd) / Head Only Points: $head_only_points ($required_head_points req'd)"); | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  |  | 
| 201 | 0 | 0 |  |  |  | 0 | if ($body_only_points < $required_body_points) { | 
| 202 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn? no: scored as spam but too few body points (". | 
| 203 |  |  |  |  |  |  | $body_only_points." < ".$required_body_points.")"); | 
| 204 | 0 |  |  |  |  | 0 | return; | 
| 205 |  |  |  |  |  |  | } | 
| 206 | 0 | 0 |  |  |  | 0 | if ($head_only_points < $required_head_points) { | 
| 207 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn? no: scored as spam but too few head points (". | 
| 208 |  |  |  |  |  |  | $head_only_points." < ".$required_head_points.")"); | 
| 209 | 0 |  |  |  |  | 0 | return; | 
| 210 |  |  |  |  |  |  | } | 
| 211 | 0 | 0 |  |  |  | 0 | if ($learned_points < $learner_said_ham_points) { | 
| 212 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn? no: scored as spam but learner indicated ham (". | 
| 213 |  |  |  |  |  |  | $learned_points." < ".$learner_said_ham_points.")"); | 
| 214 | 0 |  |  |  |  | 0 | return; | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  |  | 
| 217 | 0 | 0 |  |  |  | 0 | if (!$scan->is_spam()) { | 
| 218 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam"); | 
| 219 | 0 |  |  |  |  | 0 | return; | 
| 220 |  |  |  |  |  |  | } | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | } else { | 
| 223 | 3 | 50 |  |  |  | 12 | if ($learned_points > $learner_said_spam_points) { | 
| 224 | 0 |  |  |  |  | 0 | dbg("learn: auto-learn? no: scored as ham but learner indicated spam (". | 
| 225 |  |  |  |  |  |  | $learned_points." > ".$learner_said_spam_points.")"); | 
| 226 | 0 |  |  |  |  | 0 | return; | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  |  | 
| 229 | 3 | 100 |  |  |  | 11 | if ($scan->is_spam()) { | 
| 230 | 1 |  |  |  |  | 6 | dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham"); | 
| 231 | 1 |  |  |  |  | 4 | return; | 
| 232 |  |  |  |  |  |  | } | 
| 233 |  |  |  |  |  |  | } | 
| 234 |  |  |  |  |  |  |  | 
| 235 | 2 | 50 |  |  |  | 9 | if ($conf->{bayes_auto_learn_on_error}) { | 
| 236 |  |  |  |  |  |  | # learn-on-error strategy chosen: | 
| 237 |  |  |  |  |  |  | # only allow learning if the autolearning classifier was unsure or | 
| 238 |  |  |  |  |  |  | # had a different opinion from what we are trying to make it learn | 
| 239 |  |  |  |  |  |  | # | 
| 240 | 0 |  |  |  |  | 0 | my $tests = $scan->get_tag('TESTS'); | 
| 241 | 0 | 0 | 0 |  |  | 0 | if (defined $tests && $tests ne 'none') { | 
| 242 | 0 |  |  |  |  | 0 | my %t = map { ($_,1) } split(/,/, $tests); | 
|  | 0 |  |  |  |  | 0 |  | 
| 243 | 0 | 0 | 0 |  |  | 0 | if ($isspam && $t{'BAYES_99'} || !$isspam && $t{'BAYES_00'}) { | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 244 | 0 | 0 |  |  |  | 0 | dbg("learn: auto-learn? no: learn-on-error, %s, already classified ". | 
| 245 |  |  |  |  |  |  | "as such",  $isspam ? 'spam' : 'ham'); | 
| 246 | 0 |  |  |  |  | 0 | return; | 
| 247 |  |  |  |  |  |  | } | 
| 248 |  |  |  |  |  |  | } | 
| 249 |  |  |  |  |  |  | } | 
| 250 |  |  |  |  |  |  |  | 
| 251 | 2 | 50 |  |  |  | 24 | dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)")." autolearn_force=".($force_autolearn?"yes":"no")); | 
|  |  | 50 |  |  |  |  |  | 
| 252 |  |  |  |  |  |  |  | 
| 253 |  |  |  |  |  |  | # Return an array reference because call_plugins only carries one return value | 
| 254 | 2 |  |  |  |  | 10 | return [$isspam, $force_autolearn, $force_autolearn_names]; | 
| 255 |  |  |  |  |  |  | } | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  | 1; | 
| 258 |  |  |  |  |  |  |  | 
| 259 |  |  |  |  |  |  | =back | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  | =cut |