File Coverage

blib/lib/Mail/SpamAssassin/Plugin/Bayes.pm
Criterion Covered Total %
statement 553 753 73.4
branch 158 342 46.2
condition 45 119 37.8
subroutine 58 71 81.6
pod 11 21 52.3
total 825 1306 63.1


line stmt bran cond sub pod time code
1             # <@LICENSE>
2             # Licensed to the Apache Software Foundation (ASF) under one or more
3             # contributor license agreements. See the NOTICE file distributed with
4             # this work for additional information regarding copyright ownership.
5             # The ASF licenses this file to you under the Apache License, Version 2.0
6             # (the "License"); you may not use this file except in compliance with
7             # the License. You may obtain a copy of the License at:
8             #
9             # http://www.apache.org/licenses/LICENSE-2.0
10             #
11             # Unless required by applicable law or agreed to in writing, software
12             # distributed under the License is distributed on an "AS IS" BASIS,
13             # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14             # See the License for the specific language governing permissions and
15             # limitations under the License.
16             # </@LICENSE>
17              
18             =head1 NAME
19              
20             Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
21              
22             =head1 DESCRIPTION
23              
24             This is a Bayesian-style probabilistic classifier, using an algorithm based on
25             the one detailed in Paul Graham's I<A Plan For Spam> paper at:
26              
27             http://www.paulgraham.com/spam.html
28              
29             It also incorporates some other aspects taken from Graham Robinson's webpage
30             on the subject at:
31              
32             http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
33              
34             And the chi-square probability combiner as described here:
35              
36             http://www.linuxjournal.com/print.php?sid=6467
37              
38             The results are incorporated into SpamAssassin as the BAYES_* rules.
39              
40             =head1 METHODS
41              
42             =cut
43              
44             package Mail::SpamAssassin::Plugin::Bayes;
45              
46 21     21   153 use strict;
  21         47  
  21         720  
47 21     21   120 use warnings;
  21         41  
  21         721  
48             # use bytes;
49 21     21   136 use re 'taint';
  21         42  
  21         2047  
50              
51             BEGIN {
52 21         155 eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
  21         1391  
  21         717  
53 21 50   21   80 or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
  0         0  
  0         0  
54             }
55              
56 21     21   165 use Mail::SpamAssassin;
  21         45  
  21         622  
57 21     21   128 use Mail::SpamAssassin::Plugin;
  21         41  
  21         678  
58 21     21   142 use Mail::SpamAssassin::PerMsgStatus;
  21         41  
  21         599  
59 21     21   171 use Mail::SpamAssassin::Logger;
  21         47  
  21         1337  
60 21     21   148 use Mail::SpamAssassin::Util qw(untaint_var);
  21         38  
  21         982  
61              
62             # pick ONLY ONE of these combining implementations.
63 21     21   7469 use Mail::SpamAssassin::Bayes::CombineChi;
  21         51  
  21         13201  
64             # use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
65              
66             our @ISA = qw(Mail::SpamAssassin::Plugin);
67              
68             # Which headers should we scan for tokens? Don't use all of them, as it's easy
69             # to pick up spurious clues from some. What we now do is use all of them
70             # *less* these well-known headers; that way we can pick up spammers' tracking
71             # headers (which are obviously not well-known in advance!).
72              
73             # Received is handled specially
74             our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise
75             |Delivered-To |Delivery-Date
76             |(?:X-)?Envelope-To
77             |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
78              
79             |Subject # not worth a tiny gain vs. to db size increase
80              
81             # Date: can provide invalid cues if your spam corpus is
82             # older/newer than ham
83             |Date
84              
85             # List headers: ignore. a spamfiltering mailing list will
86             # become a nonspam sign.
87             |X-List|(?:X-)?Mailing-List
88             |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
89             |Unsubscribe|Host|Id|Manager|Admin|Comment
90             |Name|Url)
91             |X-Unsub(?:scribe)?
92             |X-Mailman-Version |X-Been[Tt]here |X-Loop
93             |Mail-Followup-To
94             |X-eGroups-(?:Return|From)
95             |X-MDMailing-List
96             |X-XEmacs-List
97             |X-Sympa-To
98              
99             # gatewayed through mailing list (thanks to Allen Smith)
100             |(?:X-)?Resent-(?:From|To|Date)
101             |(?:X-)?Original-(?:From|To|Date)
102              
103             # Spamfilter/virus-scanner headers: too easy to chain from
104             # these
105             |X-MailScanner(?:-SpamCheck)?
106             |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
107             |X-Antispam |X-RBL-Warning |X-Mailscanner
108             |X-MDaemon-Deliver-To |X-Virus-Scanned
109             |X-Mass-Check-Id
110             |X-Pyzor |X-DCC-\S{2,25}-Metrics
111             |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
112             |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
113             |X-SpamCop-[^:]+
114             |X-SMTPD |(?:X-)?Spam-Apparently-To
115             |SPAM |X-Perlmx-Spam
116             |X-Bogosity
117              
118             # some noisy Outlook headers that add no good clues:
119             |Content-Class |Thread-(?:Index|Topic)
120             |X-Original[Aa]rrival[Tt]ime
121              
122             # Annotations from IMAP, POP, and MH:
123             |(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded
124             |Lines |Content-Length
125             |X-UIDL? |X-IMAPbase
126              
127             # Annotations from Bugzilla
128             |X-Bugzilla-[^:]+
129              
130             # Annotations from VM: (thanks to Allen Smith)
131             |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
132             |Summary-Format|VHeader|v\d-Data|Message-Order)
133              
134             # Annotations from Gnus:
135             | X-Gnus-Mail-Source
136             | Xref
137              
138             )}x;
139              
140             # Note only the presence of these headers, in order to reduce the
141             # hapaxen they generate.
142             our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
143             |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
144             |D(?:KIM|omainKey)-Signature
145             )}ix;
146              
147             # tweaks tested as of Nov 18 2002 by jm posted to -devel at
148             # http://sourceforge.net/p/spamassassin/mailman/message/12977556/
149             # for results. The winners are now the default settings.
150 21     21   171 use constant IGNORE_TITLE_CASE => 1;
  21         52  
  21         1145  
151 21     21   129 use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0;
  21         46  
  21         940  
152 21     21   130 use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1;
  21         44  
  21         922  
153 21     21   144 use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  21         46  
  21         882  
154              
155             # tweaks by jm on May 12 2003, see -devel email at
156             # http://sourceforge.net/p/spamassassin/mailman/message/14844556/
157 21     21   121 use constant PRE_CHEW_ADDR_HEADERS => 1;
  21         43  
  21         844  
158 21     21   119 use constant CHEW_BODY_URIS => 1;
  21         46  
  21         944  
159 21     21   121 use constant CHEW_BODY_MAILADDRS => 1;
  21         45  
  21         839  
160 21     21   116 use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  21         46  
  21         854  
161 21     21   124 use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
  21         43  
  21         802  
162 21     21   110 use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
  21         41  
  21         801  
163 21     21   116 use constant IGNORE_MSGID_TOKENS => 0;
  21         43  
  21         816  
164              
165             # tweaks of 12 March 2004, see bug 2129.
166 21     21   126 use constant DECOMPOSE_BODY_TOKENS => 1;
  21         47  
  21         856  
167 21     21   121 use constant MAP_HEADERS_MID => 1;
  21         50  
  21         870  
168 21     21   113 use constant MAP_HEADERS_FROMTOCC => 1;
  21         41  
  21         844  
169 21     21   124 use constant MAP_HEADERS_USERAGENT => 1;
  21         47  
  21         866  
170              
171             # tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
172 21     21   116 use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
  21         41  
  21         987  
173 21     21   130 use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
  21         38  
  21         2628  
174              
175             # We store header-mined tokens in the db with a "HHeaderName:val" format.
176             # some headers may contain lots of gibberish tokens, so allow a little basic
177             # compression by mapping the header name at least here. these are the headers
178             # which appear with the most frequency in my db. note: this doesn't have to
179             # be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
180             # tokens from multiple different headers may impact accuracy, so might as well
181             # avoid this if possible. These are the top ones from my corpus, BTW (jm).
182             our %HEADER_NAME_COMPRESSION = (
183             'Message-Id' => '*m',
184             'Message-ID' => '*M',
185             'Received' => '*r',
186             'User-Agent' => '*u',
187             'References' => '*f',
188             'In-Reply-To' => '*i',
189             'From' => '*F',
190             'Reply-To' => '*R',
191             'Return-Path' => '*p',
192             'Return-path' => '*rp',
193             'X-Mailer' => '*x',
194             'X-Authentication-Warning' => '*a',
195             'Organization' => '*o',
196             'Organisation' => '*o',
197             'Content-Type' => '*ct',
198             'Content-Disposition' => '*cd',
199             'Content-Transfer-Encoding' => '*ce',
200             'x-spam-relays-trusted' => '*RT',
201             'x-spam-relays-untrusted' => '*RU',
202             );
203              
204             # How many seconds should the opportunistic_expire lock be valid?
205             our $OPPORTUNISTIC_LOCK_VALID = 300;
206              
207             # Should we use the Robinson f(w) equation from
208             # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
209             # It gives better results, in that scores are more likely to distribute
210             # into the <0.5 range for nonspam and >0.5 for spam.
211 21     21   147 use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
  21         41  
  21         1103  
212              
213             # How many of the most significant tokens should we use for the p(w)
214             # calculation?
215 21     21   132 use constant N_SIGNIFICANT_TOKENS => 150;
  21         52  
  21         1150  
216              
217             # How many significant tokens are required for a classifier score to
218             # be considered usable?
219 21     21   133 use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
  21         42  
  21         1079  
220              
221             # How long a token should we hold onto? (note: German speakers typically
222             # will require a longer token than English ones.)
223 21     21   138 use constant MAX_TOKEN_LENGTH => 15;
  21         40  
  21         165996  
224              
225             ###########################################################################
226              
227             sub new {
228 62     62 1 275 my $class = shift;
229 62         226 my ($main) = @_;
230              
231 62   33     446 $class = ref($class) || $class;
232 62         427 my $self = $class->SUPER::new($main);
233 62         166 bless ($self, $class);
234              
235 62         228 $self->{main} = $main;
236 62         181 $self->{conf} = $main->{conf};
237 62         237 $self->{use_ignores} = 1;
238              
239 62         357 $self->register_eval_rule("check_bayes");
240 62         553 $self;
241             }
242              
243             sub finish {
244 40     40 1 107 my $self = shift;
245 40 100       196 if ($self->{store}) {
246 39         297 $self->{store}->untie_db();
247             }
248 40         99 %{$self} = ();
  40         305  
249             }
250              
251             ###########################################################################
252              
253             # Plugin hook.
254             # Return this implementation object, for callers that need to know
255             # it. TODO: callers shouldn't *need* to know it!
256             # used only in test suite to get access to {store}, internal APIs.
257             #
258 1160     1160 0 2272 sub learner_get_implementation { return shift; }
259              
260             ###########################################################################
261              
262             # Plugin hook.
263             # Called in the parent process shortly before forking off child processes.
264             sub prefork_init {
265 0     0 0 0 my ($self) = @_;
266              
267 0 0 0     0 if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
268 0         0 $self->{store}->prefork_init;
269             }
270             }
271              
272             ###########################################################################
273              
274             # Plugin hook.
275             # Called in a child process shortly after being spawned.
276             sub spamd_child_init {
277 0     0 1 0 my ($self) = @_;
278              
279 0 0 0     0 if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
280 0         0 $self->{store}->spamd_child_init;
281             }
282             }
283              
284             ###########################################################################
285              
286             # Plugin hook.
287             sub check_bayes {
288 0     0 0 0 my ($self, $pms, $fulltext, $min, $max) = @_;
289              
290 0 0       0 return 0 if (!$self->{conf}->{use_learner});
291 0 0 0     0 return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules});
292              
293 0 0       0 if (!exists ($pms->{bayes_score})) {
294 0         0 my $timer = $self->{main}->time_method("check_bayes");
295 0         0 $pms->{bayes_score} = $self->scan($pms, $pms->{msg});
296             }
297              
298 0 0 0     0 if (defined $pms->{bayes_score} &&
      0        
      0        
      0        
299             ($min == 0 || $pms->{bayes_score} > $min) &&
300             ($max eq "undef" || $pms->{bayes_score} <= $max))
301             {
302 0 0       0 if ($self->{conf}->{detailed_bayes_score}) {
303             $pms->test_log(sprintf ("score: %3.4f, hits: %s",
304             $pms->{bayes_score},
305 0         0 $pms->{bayes_hits}));
306             }
307             else {
308 0         0 $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
309             }
310 0         0 return 1;
311             }
312              
313 0         0 return 0;
314             }
315              
316             ###########################################################################
317              
318             # Plugin hook.
319             sub learner_close {
320 6     6 1 12 my ($self, $params) = @_;
321 6         13 my $quiet = $params->{quiet};
322              
323             # do a sanity check here. Weird things happen if we remain tied
324             # after compiling; for example, spamd will never see that the
325             # number of messages has reached the bayes-scanning threshold.
326 6 50       24 if ($self->{store}->db_readable()) {
327 0 0       0 warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
328 0         0 $self->{store}->untie_db();
329             }
330             }
331              
332             ###########################################################################
333              
334             # read configuration items to control bayes behaviour. Called by
335             # BayesStore::read_db_configs().
336             sub read_db_configs {
337 42     42 0 82 my ($self) = @_;
338              
339             # use of hapaxes. Set on bayes object, since it controls prob
340             # computation.
341 42         167 $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
342             }
343             ###########################################################################
344              
345             sub ignore_message {
346 0     0 0 0 my ($self,$PMS) = @_;
347              
348 0 0       0 return 0 unless $self->{use_ignores};
349              
350 0         0 my $ig_from = $self->{main}->call_plugins ("check_wb_list",
351             { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
352 0         0 my $ig_to = $self->{main}->call_plugins ("check_wb_list",
353             { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
354              
355 0   0     0 my $ignore = $ig_from || $ig_to;
356              
357 0 0       0 dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
358              
359 0         0 return $ignore;
360             }
361              
362             ###########################################################################
363              
364             # Plugin hook.
365             sub learn_message {
366 10     10 1 33 my ($self, $params) = @_;
367 10         27 my $isspam = $params->{isspam};
368 10         23 my $msg = $params->{msg};
369 10         24 my $id = $params->{id};
370              
371 10 50       35 if (!$self->{conf}->{use_bayes}) { return; }
  0         0  
372              
373 10         66 my $msgdata = $self->get_body_from_msg ($msg);
374 10         25 my $ret;
375              
376             eval {
377 10         57 local $SIG{'__DIE__'}; # do not run user die() traps in here
378 10         48 my $timer = $self->{main}->time_method("b_learn");
379              
380 10         21 my $ok;
381 10 100       41 if ($self->{main}->{learn_to_journal}) {
382             # If we're going to learn to journal, we'll try going r/o first...
383             # If that fails for some reason, let's try going r/w. This happens
384             # if the DB doesn't exist yet.
385 2   33     15 $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
386             } else {
387 8         49 $ok = $self->{store}->tie_db_writable();
388             }
389              
390 10 100       36 if ($ok) {
391 8         46 $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
392              
393 8 50       52 if (!$self->{main}->{learn_caller_will_untie}) {
394 8         58 $self->{store}->untie_db();
395             }
396             }
397 10         83 1;
398 10 50       25 } or do { # if we died, untie the dbs.
399 0 0       0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
400 0         0 $self->{store}->untie_db();
401 0         0 die "bayes: (in learn) $eval_stat\n";
402             };
403              
404 10         61 return $ret;
405             }
406              
407             # this function is trapped by the wrapper above
408             sub _learn_trapped {
409 8     8   26 my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
410 8         27 my @msgid = ( $msgid );
411              
412 8 50       28 if (!defined $msgid) {
413 8         34 @msgid = $self->get_msgid($msg);
414             }
415              
416 8         44 foreach my $msgid_t ( @msgid ) {
417 12         75 my $seen = $self->{store}->seen_get ($msgid_t);
418              
419 12 100       63 if (defined ($seen)) {
420 4 100 66     72 if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
    50 33        
      66        
421 2         18 dbg("bayes: $msgid_t already learnt correctly, not learning twice");
422 2         9 return 0;
423             } elsif ($seen !~ /^[hs]$/) {
424 0         0 warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
425             } else {
426             # bug 3704: If the message was already learned, don't try learning it again.
427             # this prevents, for instance, manually learning as spam, then autolearning
428             # as ham, or visa versa.
429 2 50       15 if ($self->{main}->{learn_no_relearn}) {
430 0         0 dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
431 0         0 return 0;
432             }
433              
434 2         19 dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
435              
436             # kluge so that forget() won't untie the db on us ...
437 2         8 my $orig = $self->{main}->{learn_caller_will_untie};
438 2         6 $self->{main}->{learn_caller_will_untie} = 1;
439              
440 2         17 my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
441              
442             # reset the value post-forget() ...
443 2         11 $self->{main}->{learn_caller_will_untie} = $orig;
444            
445             # forget() gave us a fatal error, so propagate that up
446 2 50       12 if ($fatal) {
447 0         0 dbg("bayes: forget() returned a fatal error, so learn() will too");
448 0         0 return;
449             }
450             }
451              
452             # we're only going to have seen this once, so stop if it's been
453             # seen already
454 2         8 last;
455             }
456             }
457              
458             # Now that we're sure we haven't seen this message before ...
459 6         25 $msgid = $msgid[0];
460              
461 6         60 my $msgatime = $msg->receive_date();
462              
463             # If the message atime comes back as being more than 1 day in the
464             # future, something's messed up and we should revert to current time as
465             # a safety measure.
466             #
467 6 50       36 $msgatime = time if ( $msgatime - time > 86400 );
468              
469 6         50 my $tokens = $self->tokenize($msg, $msgdata);
470              
471 6         15 { my $timer = $self->{main}->time_method('b_count_change');
  6         41  
472 6 100       30 if ($isspam) {
473 4         59 $self->{store}->nspam_nham_change(1, 0);
474 4         48 $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
475             } else {
476 2         13 $self->{store}->nspam_nham_change(0, 1);
477 2         19 $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
478             }
479             }
480              
481 6 100       176 $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
482 6         56 $self->{store}->cleanup();
483              
484 6         83 $self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
485             isspam => $isspam,
486             msgid => $msgid,
487             msgatime => $msgatime,
488             });
489              
490 6         72 dbg("bayes: learned '$msgid', atime: $msgatime");
491              
492 6         410 1;
493             }
494              
495             ###########################################################################
496              
497             # Plugin hook.
498             sub forget_message {
499 4     4 1 15 my ($self, $params) = @_;
500 4         9 my $msg = $params->{msg};
501 4         11 my $id = $params->{id};
502              
503 4 50       16 if (!$self->{conf}->{use_bayes}) { return; }
  0         0  
504              
505 4         20 my $msgdata = $self->get_body_from_msg ($msg);
506 4         13 my $ret;
507              
508             # we still tie for writing here, since we write to the seen db
509             # synchronously
510             eval {
511 4         21 local $SIG{'__DIE__'}; # do not run user die() traps in here
512 4         19 my $timer = $self->{main}->time_method("b_learn");
513              
514 4         10 my $ok;
515 4 50       16 if ($self->{main}->{learn_to_journal}) {
516             # If we're going to learn to journal, we'll try going r/o first...
517             # If that fails for some reason, let's try going r/w. This happens
518             # if the DB doesn't exist yet.
519 0   0     0 $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
520             } else {
521 4         20 $ok = $self->{store}->tie_db_writable();
522             }
523              
524 4 50       18 if ($ok) {
525 4         21 $ret = $self->_forget_trapped ($msg, $msgdata, $id);
526              
527 4 100       34 if (!$self->{main}->{learn_caller_will_untie}) {
528 2         15 $self->{store}->untie_db();
529             }
530             }
531 4         35 1;
532 4 50       10 } or do { # if we died, untie the dbs.
533 0 0       0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
  0         0  
534 0         0 $self->{store}->untie_db();
535 0         0 die "bayes: (in forget) $eval_stat\n";
536             };
537              
538 4         62 return $ret;
539             }
540              
541             # this function is trapped by the wrapper above
542             sub _forget_trapped {
543 4     4   15 my ($self, $msg, $msgdata, $msgid) = @_;
544 4         12 my @msgid = ( $msgid );
545 4         7 my $isspam;
546              
547 4 50       18 if (!defined $msgid) {
548 4         13 @msgid = $self->get_msgid($msg);
549             }
550              
551 4         25 while( $msgid = shift @msgid ) {
552 4         25 my $seen = $self->{store}->seen_get ($msgid);
553              
554 4 50       29 if (defined ($seen)) {
555 4 100       25 if ($seen eq 's') {
    50          
556 2         8 $isspam = 1;
557             } elsif ($seen eq 'h') {
558 2         7 $isspam = 0;
559             } else {
560 0         0 dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
561 0         0 return 0;
562             }
563              
564             # messages should only be learned once, so stop if we find a msgid
565             # which was seen before
566 4         14 last;
567             }
568             else {
569 0         0 dbg("bayes: forget: msgid $msgid not learnt, ignored");
570             }
571             }
572              
573             # This message wasn't learnt before, so return
574 4 50       18 if (!defined $isspam) {
    100          
575 0         0 dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
576 0         0 return 0;
577             }
578             elsif ($isspam) {
579 2         11 $self->{store}->nspam_nham_change (-1, 0);
580             }
581             else {
582 2         12 $self->{store}->nspam_nham_change (0, -1);
583             }
584              
585 4         34 my $tokens = $self->tokenize($msg, $msgdata);
586              
587 4 100       20 if ($isspam) {
588 2         16 $self->{store}->multi_tok_count_change (-1, 0, $tokens);
589             } else {
590 2         18 $self->{store}->multi_tok_count_change (0, -1, $tokens);
591             }
592              
593 4         97 $self->{store}->seen_delete ($msgid);
594 4         44 $self->{store}->cleanup();
595              
596 4         58 $self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
597             isspam => $isspam,
598             msgid => $msgid,
599             });
600              
601 4         330 1;
602             }
603              
604             ###########################################################################
605              
606             # Plugin hook.
607             sub learner_sync {
608 2     2 1 9 my ($self, $params) = @_;
609 2 50       12 if (!$self->{conf}->{use_bayes}) { return 0; }
  0         0  
610 2         22 dbg("bayes: bayes journal sync starting");
611 2         37 $self->{store}->sync($params);
612 2         14 dbg("bayes: bayes journal sync completed");
613             }
614              
615             ###########################################################################
616              
617             # Plugin hook.
618             sub learner_expire_old_training {
619 0     0 1 0 my ($self, $params) = @_;
620 0 0       0 if (!$self->{conf}->{use_bayes}) { return 0; }
  0         0  
621 0         0 dbg("bayes: expiry starting");
622 0         0 my $timer = $self->{main}->time_method("expire_bayes");
623 0         0 $self->{store}->expire_old_tokens($params);
624 0         0 dbg("bayes: expiry completed");
625             }
626              
627             ###########################################################################
628              
629             # Plugin hook.
630             # Check to make sure we can tie() the DB, and we have enough entries to do a scan
631             # if we're told the caller will untie(), go ahead and leave the db tied.
632             sub learner_is_scan_available {
633 145     145 1 454 my ($self, $params) = @_;
634              
635 145 50       608 return 0 unless $self->{conf}->{use_bayes};
636 145 100       774 return 0 unless $self->{store}->tie_db_readonly();
637              
638             # We need the DB to stay tied, so if the journal sync occurs, don't untie!
639 6         31 my $caller_untie = $self->{main}->{learn_caller_will_untie};
640 6         29 $self->{main}->{learn_caller_will_untie} = 1;
641              
642             # Do a journal sync if necessary. Do this before the nspam_nham_get()
643             # call since the sync may cause an update in the number of messages
644             # learnt.
645 6         35 $self->_opportunistic_calls(1);
646              
647             # Reset the variable appropriately
648 6         25 $self->{main}->{learn_caller_will_untie} = $caller_untie;
649              
650 6         53 my ($ns, $nn) = $self->{store}->nspam_nham_get();
651              
652 6 50       47 if ($ns < $self->{conf}->{bayes_min_spam_num}) {
653 0         0 dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
654 0 0       0 if (!$self->{main}->{learn_caller_will_untie}) {
655 0         0 $self->{store}->untie_db();
656             }
657 0         0 return 0;
658             }
659 6 50       29 if ($nn < $self->{conf}->{bayes_min_ham_num}) {
660 0         0 dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
661 0 0       0 if (!$self->{main}->{learn_caller_will_untie}) {
662 0         0 $self->{store}->untie_db();
663             }
664 0         0 return 0;
665             }
666              
667 6         38 return 1;
668             }
669              
670             ###########################################################################
671              
672             sub scan {
673 4     4 0 18 my ($self, $permsgstatus, $msg) = @_;
674 4         9 my $score;
675              
676 4 50       18 return unless $self->{conf}->{use_learner};
677              
678             # When we're doing a scan, we'll guarantee that we'll do the untie,
679             # so override the global setting until we're done.
680 4         10 my $caller_untie = $self->{main}->{learn_caller_will_untie};
681 4         9 $self->{main}->{learn_caller_will_untie} = 1;
682              
683 4 50       42 goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
684              
685 4 50       21 goto skip unless $self->learner_is_scan_available();
686              
687 4         21 my ($ns, $nn) = $self->{store}->nspam_nham_get();
688              
689             ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
690             ## $self->{raw_counts} = " ns=$ns nn=$nn ";
691             ## }
692              
693 4         35 dbg("bayes: corpus size: nspam = $ns, nham = $nn");
694              
695 4         9 my $msgtokens;
696 4         7 { my $timer = $self->{main}->time_method('b_tokenize');
  4         25  
697 4         25 my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
698 4         41 $msgtokens = $self->tokenize($msg, $msgdata);
699             }
700              
701 4         12 my $tokensdata;
702 4         11 { my $timer = $self->{main}->time_method('b_tok_get_all');
  4         25  
703 4         11 $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
  4         218  
704             }
705              
706 4         75 my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
707              
708 4         26 my $probabilities_ref =
709             $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
710              
711 4         11 my %pw;
712 4         7 foreach my $tokendata (@{$tokensdata}) {
  4         22  
713 1104         1618 my $prob = shift(@$probabilities_ref);
714 1104 100       2015 next unless defined $prob;
715 502         594 my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
  502         863  
716 502         1666 $pw{$token} = {
717             prob => $prob,
718             spam_count => $tok_spam,
719             ham_count => $tok_ham,
720             atime => $atime
721             };
722             }
723              
724 4         94 my @pw_keys = keys %pw;
725              
726             # If none of the tokens were found in the DB, we're going to skip
727             # this message...
728 4 50       16 if (!@pw_keys) {
729 0         0 dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
730 0         0 goto skip;
731             }
732              
733 4         14 my $tcount_total = keys %{$msgtokens};
  4         12  
734 4         11 my $tcount_learned = scalar @pw_keys;
735              
736             # Figure out the message receive time (used as atime below)
737             # If the message atime comes back as being in the future, something's
738             # messed up and we should revert to current time as a safety measure.
739             #
740 4         29 my $msgatime = $msg->receive_date();
741 4         16 my $now = time;
742 4 50       17 $msgatime = $now if ( $msgatime > $now );
743              
744 4         18 my @touch_tokens;
745 4         29 my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
746 4         25 my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
747              
748 4         373 my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
749 4         43 my $log_each_token = (would_log('dbg', 'bayes') > 1);
750              
751             # now take the most significant tokens and calculate probs using
752             # Robinson's formula.
753              
754 4         35 @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
  2963         4014  
755              
756 4 100       28 if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
  2         22  
757              
758 4         11 my @sorted;
759 4         14 foreach my $tok (@pw_keys) {
760 414 100       790 next if $tok_strength{$tok} <
761             $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
762              
763 282         373 my $pw_tok = $pw{$tok};
764 282         376 my $pw_prob = $pw_tok->{prob};
765              
766             # What's more expensive, scanning headers for HAMMYTOKENS and
767             # SPAMMYTOKENS tags that aren't there or collecting data that
768             # won't be used? Just collecting the data is certainly simpler.
769             #
770 282   50     748 my $raw_token = $msgtokens->{$tok} || "(unknown)";
771 282         418 my $s = $pw_tok->{spam_count};
772 282         358 my $n = $pw_tok->{ham_count};
773 282         360 my $a = $pw_tok->{atime};
774              
775 282 100       353 push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
  282         919  
776             [$raw_token, $pw_prob, $s, $n, $a] );
777              
778 282         508 push(@sorted, $pw_prob);
779              
780             # update the atime on this token, it proved useful
781 282         387 push(@touch_tokens, $tok);
782              
783 282 50       793 if ($log_each_token) {
784 0         0 dbg("bayes: token '$raw_token' => $pw_prob");
785             }
786             }
787              
788 4 50 50     49 if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
789             $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
790             {
791 0         0 dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
792 0         0 goto skip;
793             }
794              
795 4         30 $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
796 4         23 undef $timer_compute_prob; # end a timing section
797              
798             # Couldn't come up with a probability?
799 4 50       16 goto skip unless defined $score;
800              
801 4         56 dbg("bayes: score = $score");
802              
803             # no need to call tok_touch_all unless there were significant
804             # tokens and a score was returned
805             # we don't really care about the return value here
806              
807 4         12 { my $timer = $self->{main}->time_method('b_tok_touch_all');
  4         28  
808 4         62 $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
809             }
810              
811 4         19 my $timer_finish = $self->{main}->time_method('b_finish');
812              
813 4         16 $permsgstatus->{bayes_nspam} = $ns;
814 4         12 $permsgstatus->{bayes_nham} = $nn;
815              
816             ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
817             ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
818             ## }
819              
820 4         47 $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
821             probsref => \%pw,
822             score => $score,
823             msgatime => $msgatime,
824             significant_tokens => \@touch_tokens,
825             });
826              
827             skip:
828 4 50       22 if (!defined $score) {
829 0         0 dbg("bayes: not scoring message, returning undef");
830             }
831              
832 4         16 undef $timer_compute_prob; # end a timing section if still running
833 4 50       13 if (!defined $timer_finish) {
834 4         24 $timer_finish = $self->{main}->time_method('b_finish');
835             }
836              
837             # Take any opportunistic actions we can take
838 4 50       24 if ($self->{main}->{opportunistic_expire_check_only}) {
839             # we're supposed to report on expiry only -- so do the
840             # _opportunistic_calls() run for the journal only.
841 0         0 $self->_opportunistic_calls(1);
842 0         0 $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
843             }
844             else {
845 4         20 $self->_opportunistic_calls();
846             }
847              
848             # Do any cleanup we need to do
849 4         25 $self->{store}->cleanup();
850              
851             # Reset the value accordingly
852 4         20 $self->{main}->{learn_caller_will_untie} = $caller_untie;
853              
854             # If our caller won't untie the db, we need to do it.
855 4 50       19 if (!$caller_untie) {
856 4         33 $self->{store}->untie_db();
857             }
858              
859             $permsgstatus->set_tag ('BAYESTCHAMMY',
860 4 50       19 ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
  4         31  
861             $permsgstatus->set_tag ('BAYESTCSPAMMY',
862 4 50       15 ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
  4         17  
863 4         22 $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
864 4         15 $permsgstatus->set_tag ('BAYESTC', $tcount_total);
865              
866             $permsgstatus->set_tag ('HAMMYTOKENS', sub {
867 0     0   0 my $pms = shift;
868             $self->bayes_report_make_list
869 0         0 ($pms, $pms->{bayes_token_info_hammy}, shift);
870 4         39 });
871              
872             $permsgstatus->set_tag ('SPAMMYTOKENS', sub {
873 0     0   0 my $pms = shift;
874             $self->bayes_report_make_list
875 0         0 ($pms, $pms->{bayes_token_info_spammy}, shift);
876 4         73 });
877              
878             $permsgstatus->set_tag ('TOKENSUMMARY', sub {
879 0     0   0 my $pms = shift;
880 0 0       0 if ( defined $pms->{tag_data}{BAYESTC} )
881             {
882             my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
883             - $pms->{tag_data}{BAYESTCSPAMMY}
884 0         0 - $pms->{tag_data}{BAYESTCHAMMY};
885             my $tcount_new = $pms->{tag_data}{BAYESTC}
886 0         0 - $pms->{tag_data}{BAYESTCLEARNED};
887 0         0 "Tokens: new, $tcount_new; "
888             ."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
889             ."neutral, $tcount_neutral; "
890             ."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
891             } else {
892 0         0 "Bayes not run.";
893             }
894 4         43 });
895              
896              
897 4         749 return $score;
898             }
899              
900             ###########################################################################
901              
902             # Plugin hook.
903             sub learner_dump_database {
904 0     0 1 0 my ($self, $params) = @_;
905 0         0 my $magic = $params->{magic};
906 0         0 my $toks = $params->{toks};
907 0         0 my $regex = $params->{regex};
908              
909             # allow dump to occur even if use_bayes disables everything else ...
910             #return 0 unless $self->{conf}->{use_bayes};
911 0 0       0 return 0 unless $self->{store}->tie_db_readonly();
912            
913 0         0 my @vars = $self->{store}->get_storage_variables();
914              
915 0         0 my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
916              
917 0         0 my $template = '%3.3f %10u %10u %10u %s'."\n";
918              
919 0 0       0 if ( $magic ) {
920 0 0       0 printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
921             or die "Error writing: $!";
922 0 0       0 printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
923             or die "Error writing: $!";
924 0 0       0 printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
925             or die "Error writing: $!";
926 0 0       0 printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
927             or die "Error writing: $!";
928 0 0       0 printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
929             or die "Error writing: $!";
930 0 0       0 if ( $bv >= 2 ) {
931 0 0       0 printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
932             or die "Error writing: $!";
933             }
934 0 0       0 if ( $bv < 2 ) {
935 0 0       0 printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
936             or die "Error writing: $!";
937             }
938 0 0       0 if ( $bv >= 2 ) {
939 0 0       0 printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
940             or die "Error writing: $!";
941             }
942 0 0       0 printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
943             or die "Error writing: $!";
944 0 0       0 if ( $bv >= 2 ) {
945 0 0       0 printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
946             or die "Error writing: $!";
947              
948 0 0       0 printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
949             or die "Error writing: $!";
950             }
951             }
952              
953 0 0       0 if ( $toks ) {
954             # let the store sort out the db_toks
955 0         0 $self->{store}->dump_db_toks($template, $regex, @vars);
956             }
957              
958 0 0       0 if (!$self->{main}->{learn_caller_will_untie}) {
959 0         0 $self->{store}->untie_db();
960             }
961 0         0 return 1;
962             }
963              
964             ###########################################################################
965             # TODO: these are NOT public, but the test suite needs to call them.
966              
967             sub get_msgid {
968 14     14 0 55 my ($self, $msg) = @_;
969              
970 14         25 my @msgid;
971              
972 14         54 my $msgid = $msg->get_header("Message-Id");
973 14 50 33     363 if (defined $msgid && $msgid ne '' && $msgid !~ /^\s*<\s*(?:\@sa_generated)?>.*$/) {
      33        
974             # remove \r and < and > prefix/suffixes
975 14         56 chomp $msgid;
976 14         86 $msgid =~ s/^<//; $msgid =~ s/>.*$//g;
  14         95  
977 14         57 push(@msgid, $msgid);
978             }
979              
980             # Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
981              
982             # Use sha1_hex(Date: and top N bytes of body)
983             # where N is MIN(1024 bytes, 1/2 of body length)
984             #
985 14         61 my $date = $msg->get_header("Date");
986 14 50 33     143 $date = "None" if (!defined $date || $date eq ''); # No Date?
987              
988             #Removed per bug 5185
989             #my @rcvd = $msg->get_header("Received");
990             #my $rcvd = $rcvd[$#rcvd];
991             #$rcvd = "None" if (!defined $rcvd || $rcvd eq ''); # No Received?
992              
993             # Make a copy since pristine_body is a reference ...
994 14         86 my $body = join('', $msg->get_pristine_body());
995              
996 14 50       77 if (length($body) > 64) { # Small Body?
997 14 50       60 my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
998 14         52 substr($body, $keep) = '';
999             }
1000              
1001             #Stripping all CR and LF so that testing midstream from MTA and post delivery don't
1002             #generate different id's simply because of LF<->CR<->CRLF changes.
1003 14         290 $body =~ s/[\r\n]//g;
1004              
1005 14         268 unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
1006              
1007 14 50       123 return wantarray ? @msgid : $msgid[0];
1008             }
1009              
1010             sub get_body_from_msg {
1011 20     20 0 72 my ($self, $msg) = @_;
1012              
1013 20 50       79 if (!ref $msg) {
1014             # I have no idea why this seems to happen. TODO
1015 0         0 warn "bayes: msg not a ref: '$msg'";
1016 0         0 return { };
1017             }
1018              
1019             my $permsgstatus =
1020 20         181 Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
1021 20         143 $msg->extract_message_metadata ($permsgstatus);
1022 20         104 my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
1023 20         103 $permsgstatus->finish();
1024              
1025 20 50       87 if (!defined $msgdata) {
1026             # why?!
1027 0         0 warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
1028 0         0 return { };
1029             }
1030              
1031 20         97 return $msgdata;
1032             }
1033              
1034             sub _get_msgdata_from_permsgstatus {
1035 24     24   71 my ($self, $pms) = @_;
1036              
1037 24         70 my $t_src = $self->{conf}->{bayes_token_sources};
1038 24         55 my $msgdata = { };
1039             $msgdata->{bayes_token_body} =
1040 24 50       226 $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
1041             $msgdata->{bayes_token_inviz} =
1042 24 50       181 $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
1043             $msgdata->{bayes_mimepart_digests} =
1044 24 50       91 $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
1045 24         77 @{$msgdata->{bayes_token_uris}} =
1046 24 50       180 $pms->get_uri_list() if $t_src->{uri};
1047 24         75 return $msgdata;
1048             }
1049              
1050             ###########################################################################
1051              
1052             # The calling functions expect a uniq'ed array of tokens ...
1053             sub tokenize {
1054 16     16 0 68 my ($self, $msg, $msgdata) = @_;
1055              
1056 16         57 my $t_src = $self->{conf}->{bayes_token_sources};
1057 16         32 my @tokens;
1058              
1059             # visible tokens from the body
1060 16 50       75 if ($msgdata->{bayes_token_body}) {
1061             my(@t) = map($self->_tokenize_line ($_, '', 1),
1062 16         40 @{$msgdata->{bayes_token_body}} );
  16         115  
1063 16         579 dbg("bayes: tokenized body: %d tokens", scalar @t);
1064 16         1944 push(@tokens, @t);
1065             }
1066             # the URI list
1067 16 50       109 if ($msgdata->{bayes_token_uris}) {
1068             my(@t) = map($self->_tokenize_line ($_, '', 2),
1069 16         39 @{$msgdata->{bayes_token_uris}} );
  16         93  
1070 16         75 dbg("bayes: tokenized uri: %d tokens", scalar @t);
1071 16         74 push(@tokens, @t);
1072             }
1073             # add invisible tokens
1074 16 50       60 if ($msgdata->{bayes_token_inviz}) {
1075 16         40 my $tokprefix;
1076 16         27 if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
  16         71  
1077 16         31 if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
1078 16 50       48 if (defined $tokprefix) {
1079             my(@t) = map($self->_tokenize_line ($_, $tokprefix, 1),
1080 16         46 @{$msgdata->{bayes_token_inviz}} );
  16         50  
1081 16         60 dbg("bayes: tokenized invisible: %d tokens", scalar @t);
1082 16         46 push(@tokens, @t);
1083             }
1084             }
1085              
1086             # add digests and Content-Type of all MIME parts
1087 16 50       65 if ($msgdata->{bayes_mimepart_digests}) {
1088 0         0 my %shorthand = ( # some frequent MIME part contents for human readability
1089             'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
1090             'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
1091             'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
1092             'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
1093             'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
1094             '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
1095             '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
1096             );
1097             my(@t) = map('MIME:' . ($shorthand{$_} || $_),
1098 0   0     0 @{ $msgdata->{bayes_mimepart_digests} });
  0         0  
1099 0         0 dbg("bayes: tokenized mime parts: %d tokens", scalar @t);
1100 0         0 dbg("bayes: mime-part token %s", $_) for @t;
1101 0         0 push(@tokens, @t);
1102             }
1103              
1104             # Tokenize the headers
1105 16 50       58 if ($t_src->{header}) {
1106 16         45 my(@t);
1107 16         75 my %hdrs = $self->_tokenize_headers ($msg);
1108 16         121 while( my($prefix, $value) = each %hdrs ) {
1109 200         650 push(@t, $self->_tokenize_line ($value, "H$prefix:", 0));
1110             }
1111 16         89 dbg("bayes: tokenized header: %d tokens", scalar @t);
1112 16         778 push(@tokens, @t);
1113             }
1114              
1115             # Go ahead and uniq the array, skip null tokens (can happen sometimes)
1116             # generate an SHA1 hash and take the lower 40 bits as our token
1117 16         63 my %tokens;
1118 16         78 foreach my $token (@tokens) {
1119             # dbg("bayes: token: %s", $token);
1120 6714 50       41363 $tokens{substr(sha1($token), -5)} = $token if $token ne '';
1121             }
1122              
1123             # return the keys == tokens ...
1124 16         779 return \%tokens;
1125             }
1126              
1127             sub _tokenize_line {
1128 532     532   910 my $self = $_[0];
1129 532         835 my $tokprefix = $_[2];
1130 532         733 my $region = $_[3];
1131 532         1222 local ($_) = $_[1];
1132              
1133 532         805 my @rettokens;
1134              
1135             # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
1136             # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
1137             # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
1138              
1139             ### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
1140              
1141             ### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
1142 532         2120 s{ ( [A-Za-z0-9,@*!_'"\$. -]+ |
1143             [\xC0-\xDF][\x80-\xBF] |
1144             [\xE0-\xEF][\x80-\xBF]{2} |
1145             [\xF0-\xF4][\x80-\xBF]{3} |
1146 4844 100       18609 [\xA1-\xFF] ) | . }
1147             { defined $1 ? $1 : ' ' }xsge;
1148             # should we also turn NBSP ( \xC2\xA0 ) into space?
1149              
1150             # DO split on "..." or "--" or "---"; common formatting error resulting in
1151             # hapaxes. Keep the separator itself as a token, though, as long ones can
1152 532         1674 # be good spamsigns.
1153 532         922 s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
1154             s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
1155 532         689  
1156 532 100 100     1754 if (IGNORE_TITLE_CASE) {
1157             if ($region == 1 || $region == 2) {
1158             # lower-case Title Case at start of a full-stop-delimited line (as would
1159 332         2262 # be seen in a Western language).
  242         2395  
1160             s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
1161             }
1162             }
1163 532         1970  
1164             my $magic_re = $self->{store}->get_magic_re();
1165              
1166             # Note that split() in scope of 'use bytes' results in words with utf8 flag
1167             # cleared, even if the source string has perl characters semantics !!!
1168             # Is this really still desirable?
1169 532         4285  
1170 8592         18878 foreach my $token (split) {
1171 8592         15118 $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
1172             $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
1173              
1174             # Skip false magic tokens
1175             # TVD: we need to do a defined() check since SQL doesn't have magic
1176             # tokens, so the SQL BayesStore returns undef. I really want a way
1177             # of optimizing that out, but I haven't come up with anything yet.
1178 8592 50 33     38282 #
1179             next if ( defined $magic_re && $token =~ /$magic_re/ );
1180              
1181 8592         18384 # *do* keep 3-byte tokens; there's some solid signs in there
1182             my $len = length($token);
1183              
1184             # but extend the stop-list. These are squarely in the gray
1185             # area, and it just slows us down to record them.
1186             # See http://wiki.apache.org/spamassassin/BayesStopList for more info.
1187 8592 100 100     42245 #
1188             next if $len < 3 ||
1189             ($token =~ /^(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))$/i);
1190              
1191 5230 100 100     13661 # are we in the body? If so, apply some body-specific breakouts
1192 3534 100       10045 if ($region == 1 || $region == 2) {
    100          
1193 38         173 if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
1194             push (@rettokens, $self->_tokenize_mail_addrs ($token));
1195             }
1196 110         364 elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
1197 110         245 push (@rettokens, "UD:".$token); # the full token
  110         588  
1198 132         681 my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
1199             push (@rettokens, "UD:".$1); # UD = URL domain
1200             }
1201             }
1202             }
1203              
1204             # note: do not trim down overlong tokens if they contain '*'. This is
1205             # used as part of split tokens such as "HTo:D*net" indicating that
1206             # the domain ".net" appeared in the To header.
1207 5230 100 100     12967 #
1208             if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
1209 250 50       676  
1210             if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
1211             # Bug 7135
1212 0         0 # collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
1213             my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] )
1214 0 0       0 [\x80-\xBF]{2} )/xsg;
1215 0         0 if (@t) {
1216 0         0 push (@rettokens, map($tokprefix.'u8:'.$_, @t));
1217             next;
1218             }
1219             }
1220 250         330  
1221             if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
1222             # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
1223             # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
1224             # to me! (jm)
1225             while ($token =~ s/^(..?)//) {
1226             push (@rettokens, $tokprefix.'8:'.$1);
1227             }
1228             next;
1229             }
1230 250 100 100     1090  
      100        
      100        
      50        
      66        
1231             if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1232             || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1233             || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
1234             {
1235             # if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
1236             # Spambayes trick via Matt: Just retain 7 chars. Do not retain the
1237             # length, it does not help; see jm's mail to -devel on Nov 20 2002 at
1238             # http://sourceforge.net/p/spamassassin/mailman/message/12977605/
1239             # "sk:" stands for "skip".
1240             # Bug 7141: retain seven UTF-8 chars (or other bytes),
1241 232         1737 # if followed by at least two bytes
1242             $token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] |
1243             [\xC0-\xDF][\x80-\xBF] |
1244             [\xE0-\xEF][\x80-\xBF]{2} |
1245             [\xF0-\xF4][\x80-\xBF]{3} | . ){7} ))
1246             .{2,} \z }{sk:$1}xs;
1247             ## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
1248             }
1249             }
1250              
1251 5230 100 100     12387 # decompose tokens? do this after shortening long tokens
1252 3534         4649 if ($region == 1 || $region == 2) {
1253 3534 100       7994 if (DECOMPOSE_BODY_TOKENS) {
1254 358         732 if ($token =~ /[^\w:\*]/) {
1255 358         1444 my $decompd = $token; # "Foo!"
1256 358         1256 $decompd =~ s/[^\w:\*]//gs;
1257             push (@rettokens, $tokprefix.$decompd); # "Foo"
1258             }
1259 3534 100       8690  
1260 712         1524 if ($token =~ /[A-Z]/) {
  712         1413  
1261 712         2002 my $decompd = $token; $decompd = lc $decompd;
1262             push (@rettokens, $tokprefix.$decompd); # "foo!"
1263 712 100       2404  
1264 76         276 if ($token =~ /[^\w:\*]/) {
1265 76         305 $decompd =~ s/[^\w:\*]//gs;
1266             push (@rettokens, $tokprefix.$decompd); # "foo"
1267             }
1268             }
1269             }
1270             }
1271 5230         21066  
1272             push (@rettokens, $tokprefix.$token);
1273             }
1274 532         8313  
1275             return @rettokens;
1276             }
1277              
1278 16     16   52 sub _tokenize_headers {
1279             my ($self, $msg) = @_;
1280 16         37  
1281             my %parsed;
1282              
1283 16         35 my %user_ignore;
  16         76  
1284             $user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
1285              
1286 16         41 # get headers in array context
1287             my @hdrs;
1288 16         102 my @rcvdlines;
1289             for ($msg->get_all_headers()) {
1290 202 100       607 # first, keep a copy of Received headers, so we can strip down to last 2
1291 52         154 if (/^Received:/i) {
1292 52         109 push(@rcvdlines, $_);
1293             next;
1294             }
1295 150 100       2112 # and now skip lines for headers we don't want (including all Received)
1296 102         174 next if /^${IGNORED_HDRS}:/i;
1297 102         306 next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
1298             push(@hdrs, $_);
1299 16         133 }
1300             push(@hdrs, $msg->get_all_metadata());
1301              
1302             # and re-add the last 2 received lines: usually a good source of
1303 16 100       75 # spamware tokens and HELO names.
  14         59  
1304 16 100       56 if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
  14         49  
1305             if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
1306 16         70  
1307 194 50       688 for (@hdrs) {
1308 194         881 next unless /\S/;
1309             my ($hdr, $val) = split(/:/, $_, 2);
1310              
1311             # remove user-specified headers here, after Received, in case they
1312 194 50       639 # want to ignore that too
1313             next if exists $user_ignore{lc $hdr};
1314              
1315 194   50     452 # Prep the header value
1316 194         388 $val ||= '';
1317             chomp($val);
1318              
1319 194 100       1570 # special tokenization for some headers:
    100          
    100          
    100          
    100          
    50          
1320 16         102 if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
1321             $val = $self->_pre_chew_message_id ($val);
1322             }
1323             elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
1324             (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
1325 60         184 {
1326             $val = $self->_pre_chew_addr_header ($val);
1327             }
1328 28         114 elsif ($hdr eq 'Received') {
1329             $val = $self->_pre_chew_received ($val);
1330             }
1331 4         31 elsif ($hdr eq 'Content-Type') {
1332             $val = $self->_pre_chew_content_type ($val);
1333             }
1334 2         11 elsif ($hdr eq 'MIME-Version') {
1335             $val =~ s/1\.0//; # totally innocuous
1336             }
1337 0         0 elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
1338             $val = "1"; # just mark the presence, they create lots of hapaxen
1339             }
1340 194         359  
1341 194 100       575 if (MAP_HEADERS_MID) {
1342 16         59 if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
1343             $parsed{"*MI"} = $val;
1344             }
1345 194         292 }
1346 194 100       574 if (MAP_HEADERS_FROMTOCC) {
1347 32         104 if ($hdr =~ /^(?:From|To|Cc)$/i) {
1348             $parsed{"*Ad"} = $val;
1349             }
1350 194         311 }
1351 194 50       482 if (MAP_HEADERS_USERAGENT) {
1352 0         0 if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
1353             $parsed{"*UA"} = $val;
1354             }
1355             }
1356              
1357 194 100       523 # replace hdr name with "compressed" version if possible
1358 126         343 if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
1359             $hdr = $HEADER_NAME_COMPRESSION{$hdr};
1360             }
1361 194 100       499  
1362 26         106 if (exists $parsed{$hdr}) {
1363             $parsed{$hdr} .= " ".$val;
1364 168         550 } else {
1365             $parsed{$hdr} = $val;
1366 194 50       505 }
1367 0         0 if (would_log('dbg', 'bayes') > 1) {
1368             dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
1369             }
1370             }
1371 16         294  
1372             return %parsed;
1373             }
1374              
1375 4     4   20 sub _pre_chew_content_type {
1376             my ($self, $val) = @_;
1377              
1378 4 50       27 # hopefully this will retain good bits without too many hapaxen
1379 0         0 if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
1380 0 0       0 my $boundary = $1;
1381 0         0 $boundary = '' if !defined $boundary; # avoid a warning
1382             $boundary =~ s/[a-fA-F0-9]/H/gs;
1383 0         0 # break up blocks of separator chars so they become their own tokens
1384 0         0 $boundary =~ s/([-_\.=]+)/ $1 /gs;
1385             $val .= $boundary;
1386             }
1387              
1388 4         43 # stop-list words for Content-Type header: these wind up totally gray
1389             $val =~ s/\b(?:text|charset)\b//;
1390 4         18  
1391             $val;
1392             }
1393              
1394 16     16   69 sub _pre_chew_message_id {
1395             my ($self, $val) = @_;
1396             # we can (a) get rid of a lot of hapaxen and (b) increase the token
1397             # specificity by pre-parsing some common formats.
1398              
1399 16         62 # Outlook Express format:
1400             $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
1401             ([0-9a-f]{4})[0-9a-f]{4}\$
1402             ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
1403              
1404 16         40 # Exim:
1405             $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
1406              
1407 16         34 # Sendmail:
1408             $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
1409             [A-F0-9]{10,12}\@//gx;
1410              
1411             # try to split Message-ID segments on probable ID boundaries. Note that
1412             # Outlook message-ids seem to contain a server identifier ID in the last
1413             # 8 bytes before the @. Make sure this becomes its own token, it's a
1414 16         118 # great spam-sign for a learning system! Be sure to split on ".".
1415 16         60 $val =~ s/[^_A-Za-z0-9]/ /g;
1416             $val;
1417             }
1418              
1419 28     28   90 sub _pre_chew_received {
1420             my ($self, $val) = @_;
1421              
1422             # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
1423             # and valid-format RFC-822/2822 dates
1424 28         141  
1425 28         118 $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
1426 28         173 $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
1427 28         87 $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
1428             $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
1429 28         289  
1430             $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
1431             [0-3\s]?[0-9]\s
1432             (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
1433             (?:19|20)?[0-9]{2}\s
1434             [0-2][0-9](?:\:[0-5][0-9]){1,2}\s
1435             (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
1436             //gx;
1437              
1438             # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
1439             # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
1440             # (on both sides)
1441             # also make a dup with the full IP, as fodder for
1442 28         218 # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
1443 30 50 33     255 $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
      33        
1444 0         0 if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
1445             $1.$2.$3.$4.
1446             " ip*".$1.$2.$3.$4." ";
1447 30         416 } else {
1448             $1.$2.$3.
1449             " ip*".$1.$2.$3.$4." ";
1450             }
1451             }gex;
1452              
1453             # trim these: they turn out as the most common tokens, but with a
1454 28         350 # prob of about .5. waste of space!
1455             $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
1456 28         117  
1457             $val;
1458             }
1459              
1460 60     60   188 sub _pre_chew_addr_header {
1461 60         119 my ($self, $val) = @_;
1462             local ($_);
1463 60         260  
1464 60         121 my @addrs = $self->{main}->find_all_addrs_in_line ($val);
1465 60         127 my @toks;
1466 48         135 foreach (@addrs) {
1467             push (@toks, $self->_tokenize_mail_addrs ($_));
1468 60         354 }
1469             return join (' ', @toks);
1470             }
1471              
1472 86     86   267 sub _tokenize_mail_addrs {
1473             my ($self, $addr) = @_;
1474 86 50       468  
1475 86         154 ($addr =~ /(.+)\@(.+)$/) or return ();
1476 86         404 my @toks;
1477 86         208 push(@toks, "U*".$1, "D*".$2);
  86         465  
  76         376  
1478 86         498 $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
1479             return @toks;
1480             }
1481              
1482              
1483             ###########################################################################
1484              
1485             # compute the probability that a token is spammish for each token
1486 4     4   17 sub _compute_prob_for_all_tokens {
1487 4         10 my ($self, $tokensdata, $ns, $nn) = @_;
1488             my @probabilities;
1489 4 50 33     41  
1490             return if !$ns || !$nn;
1491 4         12  
1492 4         15 my $threshold = 1; # ignore low-freq tokens below this s+n threshold
1493             if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1494             $threshold = 10;
1495 4 50       17 }
1496 0         0 if (!$self->{use_hapaxes}) {
1497             $threshold = 2;
1498             }
1499 4         11  
  4         18  
1500 1104         1434 foreach my $tokendata (@{$tokensdata}) {
1501 1104         1486 my $s = $tokendata->[1]; # spam count
1502 1104         1325 my $n = $tokendata->[2]; # ham count
1503             my $prob;
1504 21     21   245  
  21         51  
  21         26809  
1505 1104 100       1815 no warnings 'uninitialized'; # treat undef as zero in addition
1506             if ($s + $n >= $threshold) {
1507             # ignoring low-freq tokens, also covers the (!$s && !$n) case
1508              
1509             # my $ratios = $s / $ns;
1510             # my $ration = $n / $nn;
1511             # $prob = $ratios / ($ration + $ratios);
1512 502         1120 #
1513             $prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
1514 502         623  
1515             if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1516             # use Robinson's f(x) equation for low-n tokens, instead of just
1517 502         620 # ignoring them
1518 502         1088 my $robn = $s + $n;
1519             $prob =
1520             ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1521             /
1522             ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1523             }
1524             }
1525              
1526             # 'log_raw_counts' is used to log the raw data for the Bayes equations
1527             # during a mass-check, allowing the S and X constants to be optimized
1528             # quickly without requiring re-tokenization of the messages for each
1529             # attempt. There's really no need for this code to be uncommented in
1530             # normal use, however. It has never been publicly documented, so
1531             # commenting it out is fine. ;)
1532             #
1533             ## if ($self->{log_raw_counts}) {
1534             ## $self->{raw_counts} .= " s=$s,n=$n ";
1535             ## }
1536 1104         2043  
1537             push(@probabilities, $prob);
1538 4         26 }
1539             return \@probabilities;
1540             }
1541              
1542             # compute the probability that a token is spammish
1543 0     0   0 sub _compute_prob_for_token {
1544             my ($self, $token, $ns, $nn, $s, $n) = @_;
1545              
1546             # we allow the caller to give us the token information, just
1547 0 0 0     0 # to save a potentially expensive lookup
1548 0         0 if (!defined($s) || !defined($n)) {
1549             ($s, $n, undef) = $self->{store}->tok_get($token);
1550 0 0 0     0 }
1551             return if !$s && !$n;
1552 0         0  
1553             my $probabilities_ref =
1554             $self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
1555 0         0  
1556             return $probabilities_ref->[0];
1557             }
1558              
1559             ###########################################################################
1560             # If a token is neither hammy nor spammy, return 0.
1561             # For a spammy token, return the minimum number of additional ham messages
1562             # it would have had to appear in to no longer be spammy. Hammy tokens
1563             # are handled similarly. That's what the function does (at the time
1564             # of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
1565             # more useful if it returned the number of /additional/ ham messages
1566             # a spammy token would have to appear in to no longer be spammy but I
1567             # fear that might require the solution to a cubic equation, and I
1568             # just don't have the time for that now.
1569              
1570 0     0   0 sub _compute_declassification_distance {
1571             my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
1572 0 0 0     0  
1573             return 0 if $ns == 0 && $nn == 0;
1574 0         0  
1575 0 0       0 if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
  0 0       0  
1576             if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
1577 0 0 0     0  
1578 0 0       0 return 0 if $Ns == 0 || $Nn == 0;
1579             return 0 if abs( $prob - 0.5 ) <
1580             $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1581 0 0       0  
1582 0         0 my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
1583             my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1584 0         0  
1585             return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
1586             unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
1587 0         0  
1588 0         0 my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
1589 0         0 my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
1590 0         0 my $a = $Nb * ( 1 - $p );
1591 0         0 my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
1592 0         0 my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
1593 0 0       0 my $discrim = $b * $b - 4 * $a * $c;
1594 0         0 my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
1595             my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
1596              
1597 0 0       0 # This shouldn't be necessary. Should not be < 1
1598             return $dd_exact < 1 ? 1 : int($dd_exact);
1599             }
1600              
1601             ###########################################################################
1602              
1603 10     10   40 sub _opportunistic_calls {
1604             my($self, $journal_only) = @_;
1605              
1606 10 50       129 # If we're not already tied, abort.
1607 0         0 if (!$self->{store}->db_readable()) {
1608 0         0 dbg("bayes: opportunistic call attempt failed, DB not readable");
1609             return;
1610             }
1611              
1612 10         77 # Is an expire or sync running?
1613 10 50 33     39 my $running_expire = $self->{store}->get_running_expire_tok();
1614 0         0 if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
1615 0         0 dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
1616             return;
1617             }
1618              
1619 10 50 66     145 # handle expiry and syncing
    50          
1620 0         0 if (!$journal_only && $self->{store}->expiry_due()) {
1621             dbg("bayes: opportunistic call found expiry due");
1622              
1623             # sync will bring the DB R/W as necessary, and the expire will remove
1624 0         0 # the running_expire token, may untie as well.
1625             $self->{main}->{bayes_scanner}->sync(1,1);
1626             }
1627 0         0 elsif ( $self->{store}->sync_due() ) {
1628             dbg("bayes: opportunistic call found journal sync due");
1629              
1630 0         0 # sync will bring the DB R/W as necessary, may untie as well
1631             $self->{main}->{bayes_scanner}->sync(1,0);
1632              
1633 0 0       0 # We can only remove the running_expire token if we're doing R/W
1634 0         0 if ($self->{store}->db_writable()) {
1635             $self->{store}->remove_running_expire_tok();
1636             }
1637             }
1638 10         34  
1639             return;
1640             }
1641              
1642             ###########################################################################
1643              
1644 62     62 1 232 sub learner_new {
1645             my ($self) = @_;
1646 62         138  
1647 62         271 my $store;
1648 62 100       368 my $module = $self->{conf}->{bayes_store_module};
    50          
1649 56         191 if (!$module) {
1650             $module = 'Mail::SpamAssassin::BayesStore::DBM';
1651 6         40 } elsif ($module =~ /^([_A-Za-z0-9:]+)$/) {
1652             $module = untaint_var($module);
1653 0         0 } else {
1654             die "bayes: invalid module: $module\n";
1655             }
1656 62         322  
1657 62         225 dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
1658             undef $self->{store}; # DESTROYs previous object, if any
1659             eval '
1660             require '.$module.';
1661             $store = '.$module.'->new($self);
1662 62 50       8504 1;
1663 0 0       0 ' or do {
  0         0  
1664 0         0 my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1665             die "bayes: learner_new $module new() failed: $eval_stat\n";
1666             };
1667 62         488  
1668 62         179 dbg("bayes: learner_new: got store=%s", $store);
1669             $self->{store} = $store;
1670 62         254  
1671             $self;
1672             }
1673              
1674             ###########################################################################
1675              
1676 0     0 0   sub bayes_report_make_list {
1677 0 0         my ($self, $pms, $info, $param) = @_;
1678             return "Tokens not available." unless defined $info;
1679 0   0        
1680             my ($limit,$fmt_arg,$more) = split /,/, ($param || '5');
1681 0            
1682             my %formats = (
1683             short => '$t',
1684             Short => 'Token: \"$t\"',
1685             compact => '$p-$D--$t',
1686             Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
1687             medium => '$p-$D-$N--$t',
1688             long => '$p-$d--${h}h-${s}s--${a}d--$t',
1689             Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
1690             );
1691 0 0          
1692             my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
1693 0 0          
1694             return "Invalid format, must be one of: ".join(",",keys %formats)
1695             unless defined $raw_fmt;
1696 0            
1697 0 0         my $fmt = '"'.$raw_fmt.'"';
1698 0 0         my $amt = $limit < @$info ? $limit : @$info;
1699             return "" unless $amt;
1700 0            
1701 0           my $ns = $pms->{bayes_nspam};
1702 0 0   0     my $nh = $pms->{bayes_nham};
  0            
1703 0           my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
1704             my $now = time;
1705              
1706 0           join ', ', map {
1707 0           my($t,$prob,$s,$h,$u) = @$_;
1708 0           my $a = int(($now - $u)/(3600 * 24));
1709 0           my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
1710 0           my $p = sprintf "%.3f", $prob;
1711 0 0         my $n = $s + $h;
1712 0           my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
1713 0           my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
1714 0           eval $fmt; ## no critic
  0            
1715             } @{$info}[0..$amt-1];
1716             }
1717              
1718             1;