File Coverage

blib/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Criterion	Covered	Total	%
statement	553	753	73.4
branch	158	342	46.2
condition	45	119	37.8
subroutine	58	71	81.6
pod	11	21	52.3
total	825	1306	63.1

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
21
22							=head1 DESCRIPTION
23
24							This is a Bayesian-style probabilistic classifier, using an algorithm based on
25							the one detailed in Paul Graham's I<A Plan For Spam> paper at:
26
27							http://www.paulgraham.com/spam.html
28
29							It also incorporates some other aspects taken from Graham Robinson's webpage
30							on the subject at:
31
32							http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
33
34							And the chi-square probability combiner as described here:
35
36							http://www.linuxjournal.com/print.php?sid=6467
37
38							The results are incorporated into SpamAssassin as the BAYES_* rules.
39
40							=head1 METHODS
41
42							=cut
43
44							package Mail::SpamAssassin::Plugin::Bayes;
45
46	21			21		153	use strict;
	21					47
	21					720
47	21			21		120	use warnings;
	21					41
	21					721
48							# use bytes;
49	21			21		136	use re 'taint';
	21					42
	21					2047
50
51							BEGIN {
52	21					155	eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
	21					1391
	21					717
53	21	50		21		80	or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
	0					0
	0					0
54							}
55
56	21			21		165	use Mail::SpamAssassin;
	21					45
	21					622
57	21			21		128	use Mail::SpamAssassin::Plugin;
	21					41
	21					678
58	21			21		142	use Mail::SpamAssassin::PerMsgStatus;
	21					41
	21					599
59	21			21		171	use Mail::SpamAssassin::Logger;
	21					47
	21					1337
60	21			21		148	use Mail::SpamAssassin::Util qw(untaint_var);
	21					38
	21					982
61
62							# pick ONLY ONE of these combining implementations.
63	21			21		7469	use Mail::SpamAssassin::Bayes::CombineChi;
	21					51
	21					13201
64							# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
65
66							our @ISA = qw(Mail::SpamAssassin::Plugin);
67
68							# Which headers should we scan for tokens? Don't use all of them, as it's easy
69							# to pick up spurious clues from some. What we now do is use all of them
70							# less these well-known headers; that way we can pick up spammers' tracking
71							# headers (which are obviously not well-known in advance!).
72
73							# Received is handled specially
74							our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise
75							\|Delivered-To \|Delivery-Date
76							\|(?:X-)?Envelope-To
77							\|X-MIME-Auto[Cc]onverted \|X-Converted-To-Plain-Text
78
79							\|Subject # not worth a tiny gain vs. to db size increase
80
81							# Date: can provide invalid cues if your spam corpus is
82							# older/newer than ham
83							\|Date
84
85							# List headers: ignore. a spamfiltering mailing list will
86							# become a nonspam sign.
87							\|X-List\|(?:X-)?Mailing-List
88							\|(?:X-)?List-(?:Archive\|Help\|Id\|Owner\|Post\|Subscribe
89							\|Unsubscribe\|Host\|Id\|Manager\|Admin\|Comment
90							\|Name\|Url)
91							\|X-Unsub(?:scribe)?
92							\|X-Mailman-Version \|X-Been[Tt]here \|X-Loop
93							\|Mail-Followup-To
94							\|X-eGroups-(?:Return\|From)
95							\|X-MDMailing-List
96							\|X-XEmacs-List
97							\|X-Sympa-To
98
99							# gatewayed through mailing list (thanks to Allen Smith)
100							\|(?:X-)?Resent-(?:From\|To\|Date)
101							\|(?:X-)?Original-(?:From\|To\|Date)
102
103							# Spamfilter/virus-scanner headers: too easy to chain from
104							# these
105							\|X-MailScanner(?:-SpamCheck)?
106							\|X-Spam(?:-(?:Status\|Level\|Flag\|Report\|Hits\|Score\|Checker-Version))?
107							\|X-Antispam \|X-RBL-Warning \|X-Mailscanner
108							\|X-MDaemon-Deliver-To \|X-Virus-Scanned
109							\|X-Mass-Check-Id
110							\|X-Pyzor \|X-DCC-\S{2,25}-Metrics
111							\|X-Filtered-B[Yy] \|X-Scanned-By \|X-Scanner
112							\|X-AP-Spam-(?:Score\|Status) \|X-RIPE-Spam-Status
113							\|X-SpamCop-[^:]+
114							\|X-SMTPD \|(?:X-)?Spam-Apparently-To
115							\|SPAM \|X-Perlmx-Spam
116							\|X-Bogosity
117
118							# some noisy Outlook headers that add no good clues:
119							\|Content-Class \|Thread-(?:Index\|Topic)
120							\|X-Original[Aa]rrival[Tt]ime
121
122							# Annotations from IMAP, POP, and MH:
123							\|(?:X-)?Status \|X-Flags \|X-Keywords \|Replied \|Forwarded
124							\|Lines \|Content-Length
125							\|X-UIDL? \|X-IMAPbase
126
127							# Annotations from Bugzilla
128							\|X-Bugzilla-[^:]+
129
130							# Annotations from VM: (thanks to Allen Smith)
131							\|X-VM-(?:Bookmark\|(?:POP\|IMAP)-Retrieved\|Labels\|Last-Modified
132							\|Summary-Format\|VHeader\|v\d-Data\|Message-Order)
133
134							# Annotations from Gnus:
135							\| X-Gnus-Mail-Source
136							\| Xref
137
138							)}x;
139
140							# Note only the presence of these headers, in order to reduce the
141							# hapaxen they generate.
142							our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
143							\|X-(?:Gnu-?PG\|PGP\|GPG)(?:-Key)?-Fingerprint
144							\|D(?:KIM\|omainKey)-Signature
145							)}ix;
146
147							# tweaks tested as of Nov 18 2002 by jm posted to -devel at
148							# http://sourceforge.net/p/spamassassin/mailman/message/12977556/
149							# for results. The winners are now the default settings.
150	21			21		171	use constant IGNORE_TITLE_CASE => 1;
	21					52
	21					1145
151	21			21		129	use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0;
	21					46
	21					940
152	21			21		130	use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1;
	21					44
	21					922
153	21			21		144	use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	21					46
	21					882
154
155							# tweaks by jm on May 12 2003, see -devel email at
156							# http://sourceforge.net/p/spamassassin/mailman/message/14844556/
157	21			21		121	use constant PRE_CHEW_ADDR_HEADERS => 1;
	21					43
	21					844
158	21			21		119	use constant CHEW_BODY_URIS => 1;
	21					46
	21					944
159	21			21		121	use constant CHEW_BODY_MAILADDRS => 1;
	21					45
	21					839
160	21			21		116	use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	21					46
	21					854
161	21			21		124	use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	21					43
	21					802
162	21			21		110	use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
	21					41
	21					801
163	21			21		116	use constant IGNORE_MSGID_TOKENS => 0;
	21					43
	21					816
164
165							# tweaks of 12 March 2004, see bug 2129.
166	21			21		126	use constant DECOMPOSE_BODY_TOKENS => 1;
	21					47
	21					856
167	21			21		121	use constant MAP_HEADERS_MID => 1;
	21					50
	21					870
168	21			21		113	use constant MAP_HEADERS_FROMTOCC => 1;
	21					41
	21					844
169	21			21		124	use constant MAP_HEADERS_USERAGENT => 1;
	21					47
	21					866
170
171							# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
172	21			21		116	use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
	21					41
	21					987
173	21			21		130	use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
	21					38
	21					2628
174
175							# We store header-mined tokens in the db with a "HHeaderName:val" format.
176							# some headers may contain lots of gibberish tokens, so allow a little basic
177							# compression by mapping the header name at least here. these are the headers
178							# which appear with the most frequency in my db. note: this doesn't have to
179							# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
180							# tokens from multiple different headers may impact accuracy, so might as well
181							# avoid this if possible. These are the top ones from my corpus, BTW (jm).
182							our %HEADER_NAME_COMPRESSION = (
183							'Message-Id' => '*m',
184							'Message-ID' => '*M',
185							'Received' => '*r',
186							'User-Agent' => '*u',
187							'References' => '*f',
188							'In-Reply-To' => '*i',
189							'From' => '*F',
190							'Reply-To' => '*R',
191							'Return-Path' => '*p',
192							'Return-path' => '*rp',
193							'X-Mailer' => '*x',
194							'X-Authentication-Warning' => '*a',
195							'Organization' => '*o',
196							'Organisation' => '*o',
197							'Content-Type' => '*ct',
198							'Content-Disposition' => '*cd',
199							'Content-Transfer-Encoding' => '*ce',
200							'x-spam-relays-trusted' => '*RT',
201							'x-spam-relays-untrusted' => '*RU',
202							);
203
204							# How many seconds should the opportunistic_expire lock be valid?
205							our $OPPORTUNISTIC_LOCK_VALID = 300;
206
207							# Should we use the Robinson f(w) equation from
208							# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
209							# It gives better results, in that scores are more likely to distribute
210							# into the <0.5 range for nonspam and >0.5 for spam.
211	21			21		147	use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
	21					41
	21					1103
212
213							# How many of the most significant tokens should we use for the p(w)
214							# calculation?
215	21			21		132	use constant N_SIGNIFICANT_TOKENS => 150;
	21					52
	21					1150
216
217							# How many significant tokens are required for a classifier score to
218							# be considered usable?
219	21			21		133	use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
	21					42
	21					1079
220
221							# How long a token should we hold onto? (note: German speakers typically
222							# will require a longer token than English ones.)
223	21			21		138	use constant MAX_TOKEN_LENGTH => 15;
	21					40
	21					165996
224
225							###########################################################################
226
227							sub new {
228	62			62	1	275	my $class = shift;
229	62					226	my ($main) = @_;
230
231	62		33			446	$class = ref($class) \|\| $class;
232	62					427	my $self = $class->SUPER::new($main);
233	62					166	bless ($self, $class);
234
235	62					228	$self->{main} = $main;
236	62					181	$self->{conf} = $main->{conf};
237	62					237	$self->{use_ignores} = 1;
238
239	62					357	$self->register_eval_rule("check_bayes");
240	62					553	$self;
241							}
242
243							sub finish {
244	40			40	1	107	my $self = shift;
245	40	100				196	if ($self->{store}) {
246	39					297	$self->{store}->untie_db();
247							}
248	40					99	%{$self} = ();
	40					305
249							}
250
251							###########################################################################
252
253							# Plugin hook.
254							# Return this implementation object, for callers that need to know
255							# it. TODO: callers shouldn't need to know it!
256							# used only in test suite to get access to {store}, internal APIs.
257							#
258	1160			1160	0	2272	sub learner_get_implementation { return shift; }
259
260							###########################################################################
261
262							# Plugin hook.
263							# Called in the parent process shortly before forking off child processes.
264							sub prefork_init {
265	0			0	0	0	my ($self) = @_;
266
267	0	0	0			0	if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
268	0					0	$self->{store}->prefork_init;
269							}
270							}
271
272							###########################################################################
273
274							# Plugin hook.
275							# Called in a child process shortly after being spawned.
276							sub spamd_child_init {
277	0			0	1	0	my ($self) = @_;
278
279	0	0	0			0	if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
280	0					0	$self->{store}->spamd_child_init;
281							}
282							}
283
284							###########################################################################
285
286							# Plugin hook.
287							sub check_bayes {
288	0			0	0	0	my ($self, $pms, $fulltext, $min, $max) = @_;
289
290	0	0				0	return 0 if (!$self->{conf}->{use_learner});
291	0	0	0			0	return 0 if (!$self->{conf}->{use_bayes} \|\| !$self->{conf}->{use_bayes_rules});
292
293	0	0				0	if (!exists ($pms->{bayes_score})) {
294	0					0	my $timer = $self->{main}->time_method("check_bayes");
295	0					0	$pms->{bayes_score} = $self->scan($pms, $pms->{msg});
296							}
297
298	0	0	0			0	if (defined $pms->{bayes_score} &&
			0
			0
			0
299							($min == 0 \|\| $pms->{bayes_score} > $min) &&
300							($max eq "undef" \|\| $pms->{bayes_score} <= $max))
301							{
302	0	0				0	if ($self->{conf}->{detailed_bayes_score}) {
303							$pms->test_log(sprintf ("score: %3.4f, hits: %s",
304							$pms->{bayes_score},
305	0					0	$pms->{bayes_hits}));
306							}
307							else {
308	0					0	$pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
309							}
310	0					0	return 1;
311							}
312
313	0					0	return 0;
314							}
315
316							###########################################################################
317
318							# Plugin hook.
319							sub learner_close {
320	6			6	1	12	my ($self, $params) = @_;
321	6					13	my $quiet = $params->{quiet};
322
323							# do a sanity check here. Weird things happen if we remain tied
324							# after compiling; for example, spamd will never see that the
325							# number of messages has reached the bayes-scanning threshold.
326	6	50				24	if ($self->{store}->db_readable()) {
327	0	0				0	warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
328	0					0	$self->{store}->untie_db();
329							}
330							}
331
332							###########################################################################
333
334							# read configuration items to control bayes behaviour. Called by
335							# BayesStore::read_db_configs().
336							sub read_db_configs {
337	42			42	0	82	my ($self) = @_;
338
339							# use of hapaxes. Set on bayes object, since it controls prob
340							# computation.
341	42					167	$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
342							}
343							###########################################################################
344
345							sub ignore_message {
346	0			0	0	0	my ($self,$PMS) = @_;
347
348	0	0				0	return 0 unless $self->{use_ignores};
349
350	0					0	my $ig_from = $self->{main}->call_plugins ("check_wb_list",
351							{ permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
352	0					0	my $ig_to = $self->{main}->call_plugins ("check_wb_list",
353							{ permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
354
355	0		0			0	my $ignore = $ig_from \|\| $ig_to;
356
357	0	0				0	dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
358
359	0					0	return $ignore;
360							}
361
362							###########################################################################
363
364							# Plugin hook.
365							sub learn_message {
366	10			10	1	33	my ($self, $params) = @_;
367	10					27	my $isspam = $params->{isspam};
368	10					23	my $msg = $params->{msg};
369	10					24	my $id = $params->{id};
370
371	10	50				35	if (!$self->{conf}->{use_bayes}) { return; }
	0					0
372
373	10					66	my $msgdata = $self->get_body_from_msg ($msg);
374	10					25	my $ret;
375
376							eval {
377	10					57	local $SIG{'__DIE__'}; # do not run user die() traps in here
378	10					48	my $timer = $self->{main}->time_method("b_learn");
379
380	10					21	my $ok;
381	10	100				41	if ($self->{main}->{learn_to_journal}) {
382							# If we're going to learn to journal, we'll try going r/o first...
383							# If that fails for some reason, let's try going r/w. This happens
384							# if the DB doesn't exist yet.
385	2		33			15	$ok = $self->{store}->tie_db_readonly() \|\| $self->{store}->tie_db_writable();
386							} else {
387	8					49	$ok = $self->{store}->tie_db_writable();
388							}
389
390	10	100				36	if ($ok) {
391	8					46	$ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
392
393	8	50				52	if (!$self->{main}->{learn_caller_will_untie}) {
394	8					58	$self->{store}->untie_db();
395							}
396							}
397	10					83	1;
398	10	50				25	} or do { # if we died, untie the dbs.
399	0	0				0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	0					0
400	0					0	$self->{store}->untie_db();
401	0					0	die "bayes: (in learn) $eval_stat\n";
402							};
403
404	10					61	return $ret;
405							}
406
407							# this function is trapped by the wrapper above
408							sub _learn_trapped {
409	8			8		26	my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
410	8					27	my @msgid = ( $msgid );
411
412	8	50				28	if (!defined $msgid) {
413	8					34	@msgid = $self->get_msgid($msg);
414							}
415
416	8					44	foreach my $msgid_t ( @msgid ) {
417	12					75	my $seen = $self->{store}->seen_get ($msgid_t);
418
419	12	100				63	if (defined ($seen)) {
420	4	100	66			72	if (($seen eq 's' && $isspam) \|\| ($seen eq 'h' && !$isspam)) {
		50	33
			66
421	2					18	dbg("bayes: $msgid_t already learnt correctly, not learning twice");
422	2					9	return 0;
423							} elsif ($seen !~ /^[hs]$/) {
424	0					0	warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
425							} else {
426							# bug 3704: If the message was already learned, don't try learning it again.
427							# this prevents, for instance, manually learning as spam, then autolearning
428							# as ham, or visa versa.
429	2	50				15	if ($self->{main}->{learn_no_relearn}) {
430	0					0	dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
431	0					0	return 0;
432							}
433
434	2					19	dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
435
436							# kluge so that forget() won't untie the db on us ...
437	2					8	my $orig = $self->{main}->{learn_caller_will_untie};
438	2					6	$self->{main}->{learn_caller_will_untie} = 1;
439
440	2					17	my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
441
442							# reset the value post-forget() ...
443	2					11	$self->{main}->{learn_caller_will_untie} = $orig;
444
445							# forget() gave us a fatal error, so propagate that up
446	2	50				12	if ($fatal) {
447	0					0	dbg("bayes: forget() returned a fatal error, so learn() will too");
448	0					0	return;
449							}
450							}
451
452							# we're only going to have seen this once, so stop if it's been
453							# seen already
454	2					8	last;
455							}
456							}
457
458							# Now that we're sure we haven't seen this message before ...
459	6					25	$msgid = $msgid[0];
460
461	6					60	my $msgatime = $msg->receive_date();
462
463							# If the message atime comes back as being more than 1 day in the
464							# future, something's messed up and we should revert to current time as
465							# a safety measure.
466							#
467	6	50				36	$msgatime = time if ( $msgatime - time > 86400 );
468
469	6					50	my $tokens = $self->tokenize($msg, $msgdata);
470
471	6					15	{ my $timer = $self->{main}->time_method('b_count_change');
	6					41
472	6	100				30	if ($isspam) {
473	4					59	$self->{store}->nspam_nham_change(1, 0);
474	4					48	$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
475							} else {
476	2					13	$self->{store}->nspam_nham_change(0, 1);
477	2					19	$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
478							}
479							}
480
481	6	100				176	$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
482	6					56	$self->{store}->cleanup();
483
484	6					83	$self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
485							isspam => $isspam,
486							msgid => $msgid,
487							msgatime => $msgatime,
488							});
489
490	6					72	dbg("bayes: learned '$msgid', atime: $msgatime");
491
492	6					410	1;
493							}
494
495							###########################################################################
496
497							# Plugin hook.
498							sub forget_message {
499	4			4	1	15	my ($self, $params) = @_;
500	4					9	my $msg = $params->{msg};
501	4					11	my $id = $params->{id};
502
503	4	50				16	if (!$self->{conf}->{use_bayes}) { return; }
	0					0
504
505	4					20	my $msgdata = $self->get_body_from_msg ($msg);
506	4					13	my $ret;
507
508							# we still tie for writing here, since we write to the seen db
509							# synchronously
510							eval {
511	4					21	local $SIG{'__DIE__'}; # do not run user die() traps in here
512	4					19	my $timer = $self->{main}->time_method("b_learn");
513
514	4					10	my $ok;
515	4	50				16	if ($self->{main}->{learn_to_journal}) {
516							# If we're going to learn to journal, we'll try going r/o first...
517							# If that fails for some reason, let's try going r/w. This happens
518							# if the DB doesn't exist yet.
519	0		0			0	$ok = $self->{store}->tie_db_readonly() \|\| $self->{store}->tie_db_writable();
520							} else {
521	4					20	$ok = $self->{store}->tie_db_writable();
522							}
523
524	4	50				18	if ($ok) {
525	4					21	$ret = $self->_forget_trapped ($msg, $msgdata, $id);
526
527	4	100				34	if (!$self->{main}->{learn_caller_will_untie}) {
528	2					15	$self->{store}->untie_db();
529							}
530							}
531	4					35	1;
532	4	50				10	} or do { # if we died, untie the dbs.
533	0	0				0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	0					0
534	0					0	$self->{store}->untie_db();
535	0					0	die "bayes: (in forget) $eval_stat\n";
536							};
537
538	4					62	return $ret;
539							}
540
541							# this function is trapped by the wrapper above
542							sub _forget_trapped {
543	4			4		15	my ($self, $msg, $msgdata, $msgid) = @_;
544	4					12	my @msgid = ( $msgid );
545	4					7	my $isspam;
546
547	4	50				18	if (!defined $msgid) {
548	4					13	@msgid = $self->get_msgid($msg);
549							}
550
551	4					25	while( $msgid = shift @msgid ) {
552	4					25	my $seen = $self->{store}->seen_get ($msgid);
553
554	4	50				29	if (defined ($seen)) {
555	4	100				25	if ($seen eq 's') {
		50
556	2					8	$isspam = 1;
557							} elsif ($seen eq 'h') {
558	2					7	$isspam = 0;
559							} else {
560	0					0	dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
561	0					0	return 0;
562							}
563
564							# messages should only be learned once, so stop if we find a msgid
565							# which was seen before
566	4					14	last;
567							}
568							else {
569	0					0	dbg("bayes: forget: msgid $msgid not learnt, ignored");
570							}
571							}
572
573							# This message wasn't learnt before, so return
574	4	50				18	if (!defined $isspam) {
		100
575	0					0	dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
576	0					0	return 0;
577							}
578							elsif ($isspam) {
579	2					11	$self->{store}->nspam_nham_change (-1, 0);
580							}
581							else {
582	2					12	$self->{store}->nspam_nham_change (0, -1);
583							}
584
585	4					34	my $tokens = $self->tokenize($msg, $msgdata);
586
587	4	100				20	if ($isspam) {
588	2					16	$self->{store}->multi_tok_count_change (-1, 0, $tokens);
589							} else {
590	2					18	$self->{store}->multi_tok_count_change (0, -1, $tokens);
591							}
592
593	4					97	$self->{store}->seen_delete ($msgid);
594	4					44	$self->{store}->cleanup();
595
596	4					58	$self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
597							isspam => $isspam,
598							msgid => $msgid,
599							});
600
601	4					330	1;
602							}
603
604							###########################################################################
605
606							# Plugin hook.
607							sub learner_sync {
608	2			2	1	9	my ($self, $params) = @_;
609	2	50				12	if (!$self->{conf}->{use_bayes}) { return 0; }
	0					0
610	2					22	dbg("bayes: bayes journal sync starting");
611	2					37	$self->{store}->sync($params);
612	2					14	dbg("bayes: bayes journal sync completed");
613							}
614
615							###########################################################################
616
617							# Plugin hook.
618							sub learner_expire_old_training {
619	0			0	1	0	my ($self, $params) = @_;
620	0	0				0	if (!$self->{conf}->{use_bayes}) { return 0; }
	0					0
621	0					0	dbg("bayes: expiry starting");
622	0					0	my $timer = $self->{main}->time_method("expire_bayes");
623	0					0	$self->{store}->expire_old_tokens($params);
624	0					0	dbg("bayes: expiry completed");
625							}
626
627							###########################################################################
628
629							# Plugin hook.
630							# Check to make sure we can tie() the DB, and we have enough entries to do a scan
631							# if we're told the caller will untie(), go ahead and leave the db tied.
632							sub learner_is_scan_available {
633	145			145	1	454	my ($self, $params) = @_;
634
635	145	50				608	return 0 unless $self->{conf}->{use_bayes};
636	145	100				774	return 0 unless $self->{store}->tie_db_readonly();
637
638							# We need the DB to stay tied, so if the journal sync occurs, don't untie!
639	6					31	my $caller_untie = $self->{main}->{learn_caller_will_untie};
640	6					29	$self->{main}->{learn_caller_will_untie} = 1;
641
642							# Do a journal sync if necessary. Do this before the nspam_nham_get()
643							# call since the sync may cause an update in the number of messages
644							# learnt.
645	6					35	$self->_opportunistic_calls(1);
646
647							# Reset the variable appropriately
648	6					25	$self->{main}->{learn_caller_will_untie} = $caller_untie;
649
650	6					53	my ($ns, $nn) = $self->{store}->nspam_nham_get();
651
652	6	50				47	if ($ns < $self->{conf}->{bayes_min_spam_num}) {
653	0					0	dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
654	0	0				0	if (!$self->{main}->{learn_caller_will_untie}) {
655	0					0	$self->{store}->untie_db();
656							}
657	0					0	return 0;
658							}
659	6	50				29	if ($nn < $self->{conf}->{bayes_min_ham_num}) {
660	0					0	dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
661	0	0				0	if (!$self->{main}->{learn_caller_will_untie}) {
662	0					0	$self->{store}->untie_db();
663							}
664	0					0	return 0;
665							}
666
667	6					38	return 1;
668							}
669
670							###########################################################################
671
672							sub scan {
673	4			4	0	18	my ($self, $permsgstatus, $msg) = @_;
674	4					9	my $score;
675
676	4	50				18	return unless $self->{conf}->{use_learner};
677
678							# When we're doing a scan, we'll guarantee that we'll do the untie,
679							# so override the global setting until we're done.
680	4					10	my $caller_untie = $self->{main}->{learn_caller_will_untie};
681	4					9	$self->{main}->{learn_caller_will_untie} = 1;
682
683	4	50				42	goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
684
685	4	50				21	goto skip unless $self->learner_is_scan_available();
686
687	4					21	my ($ns, $nn) = $self->{store}->nspam_nham_get();
688
689							## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
690							## $self->{raw_counts} = " ns=$ns nn=$nn ";
691							## }
692
693	4					35	dbg("bayes: corpus size: nspam = $ns, nham = $nn");
694
695	4					9	my $msgtokens;
696	4					7	{ my $timer = $self->{main}->time_method('b_tokenize');
	4					25
697	4					25	my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
698	4					41	$msgtokens = $self->tokenize($msg, $msgdata);
699							}
700
701	4					12	my $tokensdata;
702	4					11	{ my $timer = $self->{main}->time_method('b_tok_get_all');
	4					25
703	4					11	$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
	4					218
704							}
705
706	4					75	my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
707
708	4					26	my $probabilities_ref =
709							$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
710
711	4					11	my %pw;
712	4					7	foreach my $tokendata (@{$tokensdata}) {
	4					22
713	1104					1618	my $prob = shift(@$probabilities_ref);
714	1104	100				2015	next unless defined $prob;
715	502					594	my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
	502					863
716	502					1666	$pw{$token} = {
717							prob => $prob,
718							spam_count => $tok_spam,
719							ham_count => $tok_ham,
720							atime => $atime
721							};
722							}
723
724	4					94	my @pw_keys = keys %pw;
725
726							# If none of the tokens were found in the DB, we're going to skip
727							# this message...
728	4	50				16	if (!@pw_keys) {
729	0					0	dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
730	0					0	goto skip;
731							}
732
733	4					14	my $tcount_total = keys %{$msgtokens};
	4					12
734	4					11	my $tcount_learned = scalar @pw_keys;
735
736							# Figure out the message receive time (used as atime below)
737							# If the message atime comes back as being in the future, something's
738							# messed up and we should revert to current time as a safety measure.
739							#
740	4					29	my $msgatime = $msg->receive_date();
741	4					16	my $now = time;
742	4	50				17	$msgatime = $now if ( $msgatime > $now );
743
744	4					18	my @touch_tokens;
745	4					29	my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
746	4					25	my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
747
748	4					373	my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
749	4					43	my $log_each_token = (would_log('dbg', 'bayes') > 1);
750
751							# now take the most significant tokens and calculate probs using
752							# Robinson's formula.
753
754	4					35	@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
	2963					4014
755
756	4	100				28	if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
	2					22
757
758	4					11	my @sorted;
759	4					14	foreach my $tok (@pw_keys) {
760	414	100				790	next if $tok_strength{$tok} <
761							$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
762
763	282					373	my $pw_tok = $pw{$tok};
764	282					376	my $pw_prob = $pw_tok->{prob};
765
766							# What's more expensive, scanning headers for HAMMYTOKENS and
767							# SPAMMYTOKENS tags that aren't there or collecting data that
768							# won't be used? Just collecting the data is certainly simpler.
769							#
770	282		50			748	my $raw_token = $msgtokens->{$tok} \|\| "(unknown)";
771	282					418	my $s = $pw_tok->{spam_count};
772	282					358	my $n = $pw_tok->{ham_count};
773	282					360	my $a = $pw_tok->{atime};
774
775	282	100				353	push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
	282					919
776							[$raw_token, $pw_prob, $s, $n, $a] );
777
778	282					508	push(@sorted, $pw_prob);
779
780							# update the atime on this token, it proved useful
781	282					387	push(@touch_tokens, $tok);
782
783	282	50				793	if ($log_each_token) {
784	0					0	dbg("bayes: token '$raw_token' => $pw_prob");
785							}
786							}
787
788	4	50	50			49	if (!@sorted \|\| (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
789							$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
790							{
791	0					0	dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
792	0					0	goto skip;
793							}
794
795	4					30	$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
796	4					23	undef $timer_compute_prob; # end a timing section
797
798							# Couldn't come up with a probability?
799	4	50				16	goto skip unless defined $score;
800
801	4					56	dbg("bayes: score = $score");
802
803							# no need to call tok_touch_all unless there were significant
804							# tokens and a score was returned
805							# we don't really care about the return value here
806
807	4					12	{ my $timer = $self->{main}->time_method('b_tok_touch_all');
	4					28
808	4					62	$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
809							}
810
811	4					19	my $timer_finish = $self->{main}->time_method('b_finish');
812
813	4					16	$permsgstatus->{bayes_nspam} = $ns;
814	4					12	$permsgstatus->{bayes_nham} = $nn;
815
816							## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
817							## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
818							## }
819
820	4					47	$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
821							probsref => \%pw,
822							score => $score,
823							msgatime => $msgatime,
824							significant_tokens => \@touch_tokens,
825							});
826
827							skip:
828	4	50				22	if (!defined $score) {
829	0					0	dbg("bayes: not scoring message, returning undef");
830							}
831
832	4					16	undef $timer_compute_prob; # end a timing section if still running
833	4	50				13	if (!defined $timer_finish) {
834	4					24	$timer_finish = $self->{main}->time_method('b_finish');
835							}
836
837							# Take any opportunistic actions we can take
838	4	50				24	if ($self->{main}->{opportunistic_expire_check_only}) {
839							# we're supposed to report on expiry only -- so do the
840							# _opportunistic_calls() run for the journal only.
841	0					0	$self->_opportunistic_calls(1);
842	0					0	$permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
843							}
844							else {
845	4					20	$self->_opportunistic_calls();
846							}
847
848							# Do any cleanup we need to do
849	4					25	$self->{store}->cleanup();
850
851							# Reset the value accordingly
852	4					20	$self->{main}->{learn_caller_will_untie} = $caller_untie;
853
854							# If our caller won't untie the db, we need to do it.
855	4	50				19	if (!$caller_untie) {
856	4					33	$self->{store}->untie_db();
857							}
858
859							$permsgstatus->set_tag ('BAYESTCHAMMY',
860	4	50				19	($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
	4					31
861							$permsgstatus->set_tag ('BAYESTCSPAMMY',
862	4	50				15	($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
	4					17
863	4					22	$permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
864	4					15	$permsgstatus->set_tag ('BAYESTC', $tcount_total);
865
866							$permsgstatus->set_tag ('HAMMYTOKENS', sub {
867	0			0		0	my $pms = shift;
868							$self->bayes_report_make_list
869	0					0	($pms, $pms->{bayes_token_info_hammy}, shift);
870	4					39	});
871
872							$permsgstatus->set_tag ('SPAMMYTOKENS', sub {
873	0			0		0	my $pms = shift;
874							$self->bayes_report_make_list
875	0					0	($pms, $pms->{bayes_token_info_spammy}, shift);
876	4					73	});
877
878							$permsgstatus->set_tag ('TOKENSUMMARY', sub {
879	0			0		0	my $pms = shift;
880	0	0				0	if ( defined $pms->{tag_data}{BAYESTC} )
881							{
882							my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
883							- $pms->{tag_data}{BAYESTCSPAMMY}
884	0					0	- $pms->{tag_data}{BAYESTCHAMMY};
885							my $tcount_new = $pms->{tag_data}{BAYESTC}
886	0					0	- $pms->{tag_data}{BAYESTCLEARNED};
887	0					0	"Tokens: new, $tcount_new; "
888							."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
889							."neutral, $tcount_neutral; "
890							."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
891							} else {
892	0					0	"Bayes not run.";
893							}
894	4					43	});
895
896
897	4					749	return $score;
898							}
899
900							###########################################################################
901
902							# Plugin hook.
903							sub learner_dump_database {
904	0			0	1	0	my ($self, $params) = @_;
905	0					0	my $magic = $params->{magic};
906	0					0	my $toks = $params->{toks};
907	0					0	my $regex = $params->{regex};
908
909							# allow dump to occur even if use_bayes disables everything else ...
910							#return 0 unless $self->{conf}->{use_bayes};
911	0	0				0	return 0 unless $self->{store}->tie_db_readonly();
912
913	0					0	my @vars = $self->{store}->get_storage_variables();
914
915	0					0	my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
916
917	0					0	my $template = '%3.3f %10u %10u %10u %s'."\n";
918
919	0	0				0	if ( $magic ) {
920	0	0				0	printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
921							or die "Error writing: $!";
922	0	0				0	printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
923							or die "Error writing: $!";
924	0	0				0	printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
925							or die "Error writing: $!";
926	0	0				0	printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
927							or die "Error writing: $!";
928	0	0				0	printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
929							or die "Error writing: $!";
930	0	0				0	if ( $bv >= 2 ) {
931	0	0				0	printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
932							or die "Error writing: $!";
933							}
934	0	0				0	if ( $bv < 2 ) {
935	0	0				0	printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
936							or die "Error writing: $!";
937							}
938	0	0				0	if ( $bv >= 2 ) {
939	0	0				0	printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
940							or die "Error writing: $!";
941							}
942	0	0				0	printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
943							or die "Error writing: $!";
944	0	0				0	if ( $bv >= 2 ) {
945	0	0				0	printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
946							or die "Error writing: $!";
947
948	0	0				0	printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
949							or die "Error writing: $!";
950							}
951							}
952
953	0	0				0	if ( $toks ) {
954							# let the store sort out the db_toks
955	0					0	$self->{store}->dump_db_toks($template, $regex, @vars);
956							}
957
958	0	0				0	if (!$self->{main}->{learn_caller_will_untie}) {
959	0					0	$self->{store}->untie_db();
960							}
961	0					0	return 1;
962							}
963
964							###########################################################################
965							# TODO: these are NOT public, but the test suite needs to call them.
966
967							sub get_msgid {
968	14			14	0	55	my ($self, $msg) = @_;
969
970	14					25	my @msgid;
971
972	14					54	my $msgid = $msg->get_header("Message-Id");
973	14	50	33			363	if (defined $msgid && $msgid ne '' && $msgid !~ /^\s<\s(?:\@sa_generated)?>.*$/) {
			33
974							# remove \r and < and > prefix/suffixes
975	14					56	chomp $msgid;
976	14					86	$msgid =~ s/^<//; $msgid =~ s/>.*$//g;
	14					95
977	14					57	push(@msgid, $msgid);
978							}
979
980							# Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
981
982							# Use sha1_hex(Date: and top N bytes of body)
983							# where N is MIN(1024 bytes, 1/2 of body length)
984							#
985	14					61	my $date = $msg->get_header("Date");
986	14	50	33			143	$date = "None" if (!defined $date \|\| $date eq ''); # No Date?
987
988							#Removed per bug 5185
989							#my @rcvd = $msg->get_header("Received");
990							#my $rcvd = $rcvd[$#rcvd];
991							#$rcvd = "None" if (!defined $rcvd \|\| $rcvd eq ''); # No Received?
992
993							# Make a copy since pristine_body is a reference ...
994	14					86	my $body = join('', $msg->get_pristine_body());
995
996	14	50				77	if (length($body) > 64) { # Small Body?
997	14	50				60	my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
998	14					52	substr($body, $keep) = '';
999							}
1000
1001							#Stripping all CR and LF so that testing midstream from MTA and post delivery don't
1002							#generate different id's simply because of LF<->CR<->CRLF changes.
1003	14					290	$body =~ s/[\r\n]//g;
1004
1005	14					268	unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
1006
1007	14	50				123	return wantarray ? @msgid : $msgid[0];
1008							}
1009
1010							sub get_body_from_msg {
1011	20			20	0	72	my ($self, $msg) = @_;
1012
1013	20	50				79	if (!ref $msg) {
1014							# I have no idea why this seems to happen. TODO
1015	0					0	warn "bayes: msg not a ref: '$msg'";
1016	0					0	return { };
1017							}
1018
1019							my $permsgstatus =
1020	20					181	Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
1021	20					143	$msg->extract_message_metadata ($permsgstatus);
1022	20					104	my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
1023	20					103	$permsgstatus->finish();
1024
1025	20	50				87	if (!defined $msgdata) {
1026							# why?!
1027	0					0	warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
1028	0					0	return { };
1029							}
1030
1031	20					97	return $msgdata;
1032							}
1033
1034							sub _get_msgdata_from_permsgstatus {
1035	24			24		71	my ($self, $pms) = @_;
1036
1037	24					70	my $t_src = $self->{conf}->{bayes_token_sources};
1038	24					55	my $msgdata = { };
1039							$msgdata->{bayes_token_body} =
1040	24	50				226	$pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
1041							$msgdata->{bayes_token_inviz} =
1042	24	50				181	$pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
1043							$msgdata->{bayes_mimepart_digests} =
1044	24	50				91	$pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
1045	24					77	@{$msgdata->{bayes_token_uris}} =
1046	24	50				180	$pms->get_uri_list() if $t_src->{uri};
1047	24					75	return $msgdata;
1048							}
1049
1050							###########################################################################
1051
1052							# The calling functions expect a uniq'ed array of tokens ...
1053							sub tokenize {
1054	16			16	0	68	my ($self, $msg, $msgdata) = @_;
1055
1056	16					57	my $t_src = $self->{conf}->{bayes_token_sources};
1057	16					32	my @tokens;
1058
1059							# visible tokens from the body
1060	16	50				75	if ($msgdata->{bayes_token_body}) {
1061							my(@t) = map($self->_tokenize_line ($_, '', 1),
1062	16					40	@{$msgdata->{bayes_token_body}} );
	16					115
1063	16					579	dbg("bayes: tokenized body: %d tokens", scalar @t);
1064	16					1944	push(@tokens, @t);
1065							}
1066							# the URI list
1067	16	50				109	if ($msgdata->{bayes_token_uris}) {
1068							my(@t) = map($self->_tokenize_line ($_, '', 2),
1069	16					39	@{$msgdata->{bayes_token_uris}} );
	16					93
1070	16					75	dbg("bayes: tokenized uri: %d tokens", scalar @t);
1071	16					74	push(@tokens, @t);
1072							}
1073							# add invisible tokens
1074	16	50				60	if ($msgdata->{bayes_token_inviz}) {
1075	16					40	my $tokprefix;
1076	16					27	if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
	16					71
1077	16					31	if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
1078	16	50				48	if (defined $tokprefix) {
1079							my(@t) = map($self->_tokenize_line ($_, $tokprefix, 1),
1080	16					46	@{$msgdata->{bayes_token_inviz}} );
	16					50
1081	16					60	dbg("bayes: tokenized invisible: %d tokens", scalar @t);
1082	16					46	push(@tokens, @t);
1083							}
1084							}
1085
1086							# add digests and Content-Type of all MIME parts
1087	16	50				65	if ($msgdata->{bayes_mimepart_digests}) {
1088	0					0	my %shorthand = ( # some frequent MIME part contents for human readability
1089							'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
1090							'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
1091							'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
1092							'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
1093							'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
1094							'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
1095							'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
1096							);
1097							my(@t) = map('MIME:' . ($shorthand{$_} \|\| $_),
1098	0		0			0	@{ $msgdata->{bayes_mimepart_digests} });
	0					0
1099	0					0	dbg("bayes: tokenized mime parts: %d tokens", scalar @t);
1100	0					0	dbg("bayes: mime-part token %s", $_) for @t;
1101	0					0	push(@tokens, @t);
1102							}
1103
1104							# Tokenize the headers
1105	16	50				58	if ($t_src->{header}) {
1106	16					45	my(@t);
1107	16					75	my %hdrs = $self->_tokenize_headers ($msg);
1108	16					121	while( my($prefix, $value) = each %hdrs ) {
1109	200					650	push(@t, $self->_tokenize_line ($value, "H$prefix:", 0));
1110							}
1111	16					89	dbg("bayes: tokenized header: %d tokens", scalar @t);
1112	16					778	push(@tokens, @t);
1113							}
1114
1115							# Go ahead and uniq the array, skip null tokens (can happen sometimes)
1116							# generate an SHA1 hash and take the lower 40 bits as our token
1117	16					63	my %tokens;
1118	16					78	foreach my $token (@tokens) {
1119							# dbg("bayes: token: %s", $token);
1120	6714	50				41363	$tokens{substr(sha1($token), -5)} = $token if $token ne '';
1121							}
1122
1123							# return the keys == tokens ...
1124	16					779	return \%tokens;
1125							}
1126
1127							sub _tokenize_line {
1128	532			532		910	my $self = $_[0];
1129	532					835	my $tokprefix = $_[2];
1130	532					733	my $region = $_[3];
1131	532					1222	local ($_) = $_[1];
1132
1133	532					805	my @rettokens;
1134
1135							# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
1136							# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
1137							# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
1138
1139							### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
1140
1141							### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
1142	532					2120	s{ ( [A-Za-z0-9,@*!_'"\$. -]+ \|
1143							[\xC0-\xDF][\x80-\xBF] \|
1144							[\xE0-\xEF][\x80-\xBF]{2} \|
1145							[\xF0-\xF4][\x80-\xBF]{3} \|
1146	4844	100				18609	[\xA1-\xFF] ) \| . }
1147							{ defined $1 ? $1 : ' ' }xsge;
1148							# should we also turn NBSP ( \xC2\xA0 ) into space?
1149
1150							# DO split on "..." or "--" or "---"; common formatting error resulting in
1151							# hapaxes. Keep the separator itself as a token, though, as long ones can
1152	532					1674	# be good spamsigns.
1153	532					922	s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
1154							s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
1155	532					689
1156	532	100	100			1754	if (IGNORE_TITLE_CASE) {
1157							if ($region == 1 \|\| $region == 2) {
1158							# lower-case Title Case at start of a full-stop-delimited line (as would
1159	332					2262	# be seen in a Western language).
	242					2395
1160							s/(?:^\|\.\s+)([A-Z])([^A-Z]+)(?:\s\|$)/ ' '. (lc $1) . $2 . ' ' /ge;
1161							}
1162							}
1163	532					1970
1164							my $magic_re = $self->{store}->get_magic_re();
1165
1166							# Note that split() in scope of 'use bytes' results in words with utf8 flag
1167							# cleared, even if the source string has perl characters semantics !!!
1168							# Is this really still desirable?
1169	532					4285
1170	8592					18878	foreach my $token (split) {
1171	8592					15118	$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
1172							$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
1173
1174							# Skip false magic tokens
1175							# TVD: we need to do a defined() check since SQL doesn't have magic
1176							# tokens, so the SQL BayesStore returns undef. I really want a way
1177							# of optimizing that out, but I haven't come up with anything yet.
1178	8592	50	33			38282	#
1179							next if ( defined $magic_re && $token =~ /$magic_re/ );
1180
1181	8592					18384	# do keep 3-byte tokens; there's some solid signs in there
1182							my $len = length($token);
1183
1184							# but extend the stop-list. These are squarely in the gray
1185							# area, and it just slows us down to record them.
1186							# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
1187	8592	100	100			42245	#
1188							next if $len < 3 \|\|
1189							($token =~ /^(?:a(?:ble\|l(?:ready\|l)\|n[dy]\|re)\|b(?:ecause\|oth)\|c(?:an\|ome)\|e(?:ach\|mail\|ven)\|f(?:ew\|irst\|or\|rom)\|give\|h(?:a(?:ve\|s)\|ttp)\|i(?:n(?:formation\|to)\|t\'s)\|just\|know\|l(?:ike\|o(?:ng\|ok))\|m(?:a(?:de\|il(?:(?:ing\|to))?\|ke\|ny)\|o(?:re\|st)\|uch)\|n(?:eed\|o[tw]\|umber)\|o(?:ff\|n(?:ly\|e)\|ut\|wn)\|p(?:eople\|lace)\|right\|s(?:ame\|ee\|uch)\|t(?:h(?:at\|is\|rough\|e)\|ime)\|using\|w(?:eb\|h(?:ere\|y)\|ith(?:out)?\|or(?:ld\|k))\|y(?:ears?\|ou(?:(?:\'re\|r))?))$/i);
1190
1191	5230	100	100			13661	# are we in the body? If so, apply some body-specific breakouts
1192	3534	100				10045	if ($region == 1 \|\| $region == 2) {
		100
1193	38					173	if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
1194							push (@rettokens, $self->_tokenize_mail_addrs ($token));
1195							}
1196	110					364	elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
1197	110					245	push (@rettokens, "UD:".$token); # the full token
	110					588
1198	132					681	my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
1199							push (@rettokens, "UD:".$1); # UD = URL domain
1200							}
1201							}
1202							}
1203
1204							# note: do not trim down overlong tokens if they contain '*'. This is
1205							# used as part of split tokens such as "HTo:D*net" indicating that
1206							# the domain ".net" appeared in the To header.
1207	5230	100	100			12967	#
1208							if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
1209	250	50				676
1210							if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
1211							# Bug 7135
1212	0					0	# collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
1213							my(@t) = $token =~ /( (?: [\xE0-\xEF] \| [\xF0-\xF4][\x80-\xBF] )
1214	0	0				0	[\x80-\xBF]{2} )/xsg;
1215	0					0	if (@t) {
1216	0					0	push (@rettokens, map($tokprefix.'u8:'.$_, @t));
1217							next;
1218							}
1219							}
1220	250					330
1221							if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
1222							# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
1223							# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
1224							# to me! (jm)
1225							while ($token =~ s/^(..?)//) {
1226							push (@rettokens, $tokprefix.'8:'.$1);
1227							}
1228							next;
1229							}
1230	250	100	100			1090
			100
			100
			50
			66
1231							if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1232							\|\| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1233							\|\| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
1234							{
1235							# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
1236							# Spambayes trick via Matt: Just retain 7 chars. Do not retain the
1237							# length, it does not help; see jm's mail to -devel on Nov 20 2002 at
1238							# http://sourceforge.net/p/spamassassin/mailman/message/12977605/
1239							# "sk:" stands for "skip".
1240							# Bug 7141: retain seven UTF-8 chars (or other bytes),
1241	232					1737	# if followed by at least two bytes
1242							$token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] \|
1243							[\xC0-\xDF][\x80-\xBF] \|
1244							[\xE0-\xEF][\x80-\xBF]{2} \|
1245							[\xF0-\xF4][\x80-\xBF]{3} \| . ){7} ))
1246							.{2,} \z }{sk:$1}xs;
1247							## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
1248							}
1249							}
1250
1251	5230	100	100			12387	# decompose tokens? do this after shortening long tokens
1252	3534					4649	if ($region == 1 \|\| $region == 2) {
1253	3534	100				7994	if (DECOMPOSE_BODY_TOKENS) {
1254	358					732	if ($token =~ /[^\w:\*]/) {
1255	358					1444	my $decompd = $token; # "Foo!"
1256	358					1256	$decompd =~ s/[^\w:\*]//gs;
1257							push (@rettokens, $tokprefix.$decompd); # "Foo"
1258							}
1259	3534	100				8690
1260	712					1524	if ($token =~ /[A-Z]/) {
	712					1413
1261	712					2002	my $decompd = $token; $decompd = lc $decompd;
1262							push (@rettokens, $tokprefix.$decompd); # "foo!"
1263	712	100				2404
1264	76					276	if ($token =~ /[^\w:\*]/) {
1265	76					305	$decompd =~ s/[^\w:\*]//gs;
1266							push (@rettokens, $tokprefix.$decompd); # "foo"
1267							}
1268							}
1269							}
1270							}
1271	5230					21066
1272							push (@rettokens, $tokprefix.$token);
1273							}
1274	532					8313
1275							return @rettokens;
1276							}
1277
1278	16			16		52	sub _tokenize_headers {
1279							my ($self, $msg) = @_;
1280	16					37
1281							my %parsed;
1282
1283	16					35	my %user_ignore;
	16					76
1284							$user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
1285
1286	16					41	# get headers in array context
1287							my @hdrs;
1288	16					102	my @rcvdlines;
1289							for ($msg->get_all_headers()) {
1290	202	100				607	# first, keep a copy of Received headers, so we can strip down to last 2
1291	52					154	if (/^Received:/i) {
1292	52					109	push(@rcvdlines, $_);
1293							next;
1294							}
1295	150	100				2112	# and now skip lines for headers we don't want (including all Received)
1296	102					174	next if /^${IGNORED_HDRS}:/i;
1297	102					306	next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
1298							push(@hdrs, $_);
1299	16					133	}
1300							push(@hdrs, $msg->get_all_metadata());
1301
1302							# and re-add the last 2 received lines: usually a good source of
1303	16	100				75	# spamware tokens and HELO names.
	14					59
1304	16	100				56	if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
	14					49
1305							if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
1306	16					70
1307	194	50				688	for (@hdrs) {
1308	194					881	next unless /\S/;
1309							my ($hdr, $val) = split(/:/, $_, 2);
1310
1311							# remove user-specified headers here, after Received, in case they
1312	194	50				639	# want to ignore that too
1313							next if exists $user_ignore{lc $hdr};
1314
1315	194		50			452	# Prep the header value
1316	194					388	$val \|\|= '';
1317							chomp($val);
1318
1319	194	100				1570	# special tokenization for some headers:
		100
		100
		100
		100
		50
1320	16					102	if ($hdr =~ /^(?:\|X-\|Resent-)Message-Id$/i) {
1321							$val = $self->_pre_chew_message_id ($val);
1322							}
1323							elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:\|X-\|Resent-)
1324							(?:Return-Path\|From\|To\|Cc\|Reply-To\|Errors-To\|Mail-Followup-To\|Sender)$/ix)
1325	60					184	{
1326							$val = $self->_pre_chew_addr_header ($val);
1327							}
1328	28					114	elsif ($hdr eq 'Received') {
1329							$val = $self->_pre_chew_received ($val);
1330							}
1331	4					31	elsif ($hdr eq 'Content-Type') {
1332							$val = $self->_pre_chew_content_type ($val);
1333							}
1334	2					11	elsif ($hdr eq 'MIME-Version') {
1335							$val =~ s/1\.0//; # totally innocuous
1336							}
1337	0					0	elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
1338							$val = "1"; # just mark the presence, they create lots of hapaxen
1339							}
1340	194					359
1341	194	100				575	if (MAP_HEADERS_MID) {
1342	16					59	if ($hdr =~ /^(?:In-Reply-To\|References\|Message-ID)$/i) {
1343							$parsed{"*MI"} = $val;
1344							}
1345	194					292	}
1346	194	100				574	if (MAP_HEADERS_FROMTOCC) {
1347	32					104	if ($hdr =~ /^(?:From\|To\|Cc)$/i) {
1348							$parsed{"*Ad"} = $val;
1349							}
1350	194					311	}
1351	194	50				482	if (MAP_HEADERS_USERAGENT) {
1352	0					0	if ($hdr =~ /^(?:X-Mailer\|User-Agent)$/i) {
1353							$parsed{"*UA"} = $val;
1354							}
1355							}
1356
1357	194	100				523	# replace hdr name with "compressed" version if possible
1358	126					343	if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
1359							$hdr = $HEADER_NAME_COMPRESSION{$hdr};
1360							}
1361	194	100				499
1362	26					106	if (exists $parsed{$hdr}) {
1363							$parsed{$hdr} .= " ".$val;
1364	168					550	} else {
1365							$parsed{$hdr} = $val;
1366	194	50				505	}
1367	0					0	if (would_log('dbg', 'bayes') > 1) {
1368							dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
1369							}
1370							}
1371	16					294
1372							return %parsed;
1373							}
1374
1375	4			4		20	sub _pre_chew_content_type {
1376							my ($self, $val) = @_;
1377
1378	4	50				27	# hopefully this will retain good bits without too many hapaxen
1379	0					0	if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
1380	0	0				0	my $boundary = $1;
1381	0					0	$boundary = '' if !defined $boundary; # avoid a warning
1382							$boundary =~ s/[a-fA-F0-9]/H/gs;
1383	0					0	# break up blocks of separator chars so they become their own tokens
1384	0					0	$boundary =~ s/([-_\.=]+)/ $1 /gs;
1385							$val .= $boundary;
1386							}
1387
1388	4					43	# stop-list words for Content-Type header: these wind up totally gray
1389							$val =~ s/\b(?:text\|charset)\b//;
1390	4					18
1391							$val;
1392							}
1393
1394	16			16		69	sub _pre_chew_message_id {
1395							my ($self, $val) = @_;
1396							# we can (a) get rid of a lot of hapaxen and (b) increase the token
1397							# specificity by pre-parsing some common formats.
1398
1399	16					62	# Outlook Express format:
1400							$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
1401							([0-9a-f]{4})[0-9a-f]{4}\$
1402							([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
1403
1404	16					40	# Exim:
1405							$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
1406
1407	16					34	# Sendmail:
1408							$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
1409							[A-F0-9]{10,12}\@//gx;
1410
1411							# try to split Message-ID segments on probable ID boundaries. Note that
1412							# Outlook message-ids seem to contain a server identifier ID in the last
1413							# 8 bytes before the @. Make sure this becomes its own token, it's a
1414	16					118	# great spam-sign for a learning system! Be sure to split on ".".
1415	16					60	$val =~ s/[^_A-Za-z0-9]/ /g;
1416							$val;
1417							}
1418
1419	28			28		90	sub _pre_chew_received {
1420							my ($self, $val) = @_;
1421
1422							# Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
1423							# and valid-format RFC-822/2822 dates
1424	28					141
1425	28					118	$val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
1426	28					173	$val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
1427	28					87	$val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
1428							$val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
1429	28					289
1430							$val =~ s/(?:(?:Mon\|Tue\|Wed\|Thu\|Fri\|Sat\|Sun),\s)?
1431							[0-3\s]?[0-9]\s
1432							(?:Jan\|Feb\|Ma[ry]\|Apr\|Ju[nl]\|Aug\|Sep\|Oct\|Nov\|Dec)\s
1433							(?:19\|20)?[0-9]{2}\s
1434							[0-2][0-9](?:\:[0-5][0-9]){1,2}\s
1435							(?:\s$\|$\|\s(?:[+-][0-9]{4})\|\s(?:UT\|[A-Z]{2,3}T))
1436							//gx;
1437
1438							# IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
1439							# IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
1440							# (on both sides)
1441							# also make a dup with the full IP, as fodder for
1442	28					218	# bayes_dump_to_trusted_networks: "Hr:ipaaa.bbb.ccc.ddd"
1443	30	50	33			255	$val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
			33
1444	0					0	if ($2 eq '10' \|\| ($2 eq '192' && $3 eq '168')) {
1445							$1.$2.$3.$4.
1446							" ip*".$1.$2.$3.$4." ";
1447	30					416	} else {
1448							$1.$2.$3.
1449							" ip*".$1.$2.$3.$4." ";
1450							}
1451							}gex;
1452
1453							# trim these: they turn out as the most common tokens, but with a
1454	28					350	# prob of about .5. waste of space!
1455							$val =~ s/\b(?:with\|from\|for\|SMTP\|ESMTP)\b/ /g;
1456	28					117
1457							$val;
1458							}
1459
1460	60			60		188	sub _pre_chew_addr_header {
1461	60					119	my ($self, $val) = @_;
1462							local ($_);
1463	60					260
1464	60					121	my @addrs = $self->{main}->find_all_addrs_in_line ($val);
1465	60					127	my @toks;
1466	48					135	foreach (@addrs) {
1467							push (@toks, $self->_tokenize_mail_addrs ($_));
1468	60					354	}
1469							return join (' ', @toks);
1470							}
1471
1472	86			86		267	sub _tokenize_mail_addrs {
1473							my ($self, $addr) = @_;
1474	86	50				468
1475	86					154	($addr =~ /(.+)\@(.+)$/) or return ();
1476	86					404	my @toks;
1477	86					208	push(@toks, "U".$1, "D".$2);
	86					465
	76					376
1478	86					498	$_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
1479							return @toks;
1480							}
1481
1482
1483							###########################################################################
1484
1485							# compute the probability that a token is spammish for each token
1486	4			4		17	sub _compute_prob_for_all_tokens {
1487	4					10	my ($self, $tokensdata, $ns, $nn) = @_;
1488							my @probabilities;
1489	4	50	33			41
1490							return if !$ns \|\| !$nn;
1491	4					12
1492	4					15	my $threshold = 1; # ignore low-freq tokens below this s+n threshold
1493							if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1494							$threshold = 10;
1495	4	50				17	}
1496	0					0	if (!$self->{use_hapaxes}) {
1497							$threshold = 2;
1498							}
1499	4					11
	4					18
1500	1104					1434	foreach my $tokendata (@{$tokensdata}) {
1501	1104					1486	my $s = $tokendata->[1]; # spam count
1502	1104					1325	my $n = $tokendata->[2]; # ham count
1503							my $prob;
1504	21			21		245
	21					51
	21					26809
1505	1104	100				1815	no warnings 'uninitialized'; # treat undef as zero in addition
1506							if ($s + $n >= $threshold) {
1507							# ignoring low-freq tokens, also covers the (!$s && !$n) case
1508
1509							# my $ratios = $s / $ns;
1510							# my $ration = $n / $nn;
1511							# $prob = $ratios / ($ration + $ratios);
1512	502					1120	#
1513							$prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
1514	502					623
1515							if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1516							# use Robinson's f(x) equation for low-n tokens, instead of just
1517	502					620	# ignoring them
1518	502					1088	my $robn = $s + $n;
1519							$prob =
1520							($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1521							/
1522							($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1523							}
1524							}
1525
1526							# 'log_raw_counts' is used to log the raw data for the Bayes equations
1527							# during a mass-check, allowing the S and X constants to be optimized
1528							# quickly without requiring re-tokenization of the messages for each
1529							# attempt. There's really no need for this code to be uncommented in
1530							# normal use, however. It has never been publicly documented, so
1531							# commenting it out is fine. ;)
1532							#
1533							## if ($self->{log_raw_counts}) {
1534							## $self->{raw_counts} .= " s=$s,n=$n ";
1535							## }
1536	1104					2043
1537							push(@probabilities, $prob);
1538	4					26	}
1539							return \@probabilities;
1540							}
1541
1542							# compute the probability that a token is spammish
1543	0			0		0	sub _compute_prob_for_token {
1544							my ($self, $token, $ns, $nn, $s, $n) = @_;
1545
1546							# we allow the caller to give us the token information, just
1547	0	0	0			0	# to save a potentially expensive lookup
1548	0					0	if (!defined($s) \|\| !defined($n)) {
1549							($s, $n, undef) = $self->{store}->tok_get($token);
1550	0	0	0			0	}
1551							return if !$s && !$n;
1552	0					0
1553							my $probabilities_ref =
1554							$self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
1555	0					0
1556							return $probabilities_ref->[0];
1557							}
1558
1559							###########################################################################
1560							# If a token is neither hammy nor spammy, return 0.
1561							# For a spammy token, return the minimum number of additional ham messages
1562							# it would have had to appear in to no longer be spammy. Hammy tokens
1563							# are handled similarly. That's what the function does (at the time
1564							# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
1565							# more useful if it returned the number of /additional/ ham messages
1566							# a spammy token would have to appear in to no longer be spammy but I
1567							# fear that might require the solution to a cubic equation, and I
1568							# just don't have the time for that now.
1569
1570	0			0		0	sub _compute_declassification_distance {
1571							my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
1572	0	0	0			0
1573							return 0 if $ns == 0 && $nn == 0;
1574	0					0
1575	0	0				0	if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
	0	0				0
1576							if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
1577	0	0	0			0
1578	0	0				0	return 0 if $Ns == 0 \|\| $Nn == 0;
1579							return 0 if abs( $prob - 0.5 ) <
1580							$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1581	0	0				0
1582	0					0	my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
1583							my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1584	0					0
1585							return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
1586							unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
1587	0					0
1588	0					0	my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
1589	0					0	my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
1590	0					0	my $a = $Nb * ( 1 - $p );
1591	0					0	my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
1592	0					0	my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
1593	0	0				0	my $discrim = $b * $b - 4 * $a * $c;
1594	0					0	my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
1595							my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
1596
1597	0	0				0	# This shouldn't be necessary. Should not be < 1
1598							return $dd_exact < 1 ? 1 : int($dd_exact);
1599							}
1600
1601							###########################################################################
1602
1603	10			10		40	sub _opportunistic_calls {
1604							my($self, $journal_only) = @_;
1605
1606	10	50				129	# If we're not already tied, abort.
1607	0					0	if (!$self->{store}->db_readable()) {
1608	0					0	dbg("bayes: opportunistic call attempt failed, DB not readable");
1609							return;
1610							}
1611
1612	10					77	# Is an expire or sync running?
1613	10	50	33			39	my $running_expire = $self->{store}->get_running_expire_tok();
1614	0					0	if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
1615	0					0	dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
1616							return;
1617							}
1618
1619	10	50	66			145	# handle expiry and syncing
		50
1620	0					0	if (!$journal_only && $self->{store}->expiry_due()) {
1621							dbg("bayes: opportunistic call found expiry due");
1622
1623							# sync will bring the DB R/W as necessary, and the expire will remove
1624	0					0	# the running_expire token, may untie as well.
1625							$self->{main}->{bayes_scanner}->sync(1,1);
1626							}
1627	0					0	elsif ( $self->{store}->sync_due() ) {
1628							dbg("bayes: opportunistic call found journal sync due");
1629
1630	0					0	# sync will bring the DB R/W as necessary, may untie as well
1631							$self->{main}->{bayes_scanner}->sync(1,0);
1632
1633	0	0				0	# We can only remove the running_expire token if we're doing R/W
1634	0					0	if ($self->{store}->db_writable()) {
1635							$self->{store}->remove_running_expire_tok();
1636							}
1637							}
1638	10					34
1639							return;
1640							}
1641
1642							###########################################################################
1643
1644	62			62	1	232	sub learner_new {
1645							my ($self) = @_;
1646	62					138
1647	62					271	my $store;
1648	62	100				368	my $module = $self->{conf}->{bayes_store_module};
		50
1649	56					191	if (!$module) {
1650							$module = 'Mail::SpamAssassin::BayesStore::DBM';
1651	6					40	} elsif ($module =~ /^([_A-Za-z0-9:]+)$/) {
1652							$module = untaint_var($module);
1653	0					0	} else {
1654							die "bayes: invalid module: $module\n";
1655							}
1656	62					322
1657	62					225	dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
1658							undef $self->{store}; # DESTROYs previous object, if any
1659							eval '
1660							require '.$module.';
1661							$store = '.$module.'->new($self);
1662	62	50				8504	1;
1663	0	0				0	' or do {
	0					0
1664	0					0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1665							die "bayes: learner_new $module new() failed: $eval_stat\n";
1666							};
1667	62					488
1668	62					179	dbg("bayes: learner_new: got store=%s", $store);
1669							$self->{store} = $store;
1670	62					254
1671							$self;
1672							}
1673
1674							###########################################################################
1675
1676	0			0	0		sub bayes_report_make_list {
1677	0	0					my ($self, $pms, $info, $param) = @_;
1678							return "Tokens not available." unless defined $info;
1679	0		0
1680							my ($limit,$fmt_arg,$more) = split /,/, ($param \|\| '5');
1681	0
1682							my %formats = (
1683							short => '$t',
1684							Short => 'Token: \"$t\"',
1685							compact => '$p-$D--$t',
1686							Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
1687							medium => '$p-$D-$N--$t',
1688							long => '$p-$d--${h}h-${s}s--${a}d--$t',
1689							Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
1690							);
1691	0	0
1692							my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
1693	0	0
1694							return "Invalid format, must be one of: ".join(",",keys %formats)
1695							unless defined $raw_fmt;
1696	0
1697	0	0					my $fmt = '"'.$raw_fmt.'"';
1698	0	0					my $amt = $limit < @$info ? $limit : @$info;
1699							return "" unless $amt;
1700	0
1701	0						my $ns = $pms->{bayes_nspam};
1702	0	0		0			my $nh = $pms->{bayes_nham};
	0
1703	0						my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
1704							my $now = time;
1705
1706	0						join ', ', map {
1707	0						my($t,$prob,$s,$h,$u) = @$_;
1708	0						my $a = int(($now - $u)/(3600 * 24));
1709	0						my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
1710	0						my $p = sprintf "%.3f", $prob;
1711	0	0					my $n = $s + $h;
1712	0						my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
1713	0						my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
1714	0						eval $fmt; ## no critic
	0
1715							} @{$info}[0..$amt-1];
1716							}
1717
1718							1;