File Coverage

blib/lib/Mail/SpamAssassin/Plugin/Bayes.pm

Criterion	Covered	Total	%
statement	551	750	73.4
branch	157	340	46.1
condition	45	119	37.8
subroutine	58	71	81.6
pod	11	21	52.3
total	822	1301	63.1

line	stmt	bran	cond	sub	pod	time	code
1							# <@LICENSE>
2							# Licensed to the Apache Software Foundation (ASF) under one or more
3							# contributor license agreements. See the NOTICE file distributed with
4							# this work for additional information regarding copyright ownership.
5							# The ASF licenses this file to you under the Apache License, Version 2.0
6							# (the "License"); you may not use this file except in compliance with
7							# the License. You may obtain a copy of the License at:
8							#
9							# http://www.apache.org/licenses/LICENSE-2.0
10							#
11							# Unless required by applicable law or agreed to in writing, software
12							# distributed under the License is distributed on an "AS IS" BASIS,
13							# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14							# See the License for the specific language governing permissions and
15							# limitations under the License.
16							# </@LICENSE>
17
18							=head1 NAME
19
20							Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
21
22							=head1 DESCRIPTION
23
24							This is a Bayesian-style probabilistic classifier, using an algorithm based on
25							the one detailed in Paul Graham's I<A Plan For Spam> paper at:
26
27							http://www.paulgraham.com/spam.html
28
29							It also incorporates some other aspects taken from Graham Robinson's webpage
30							on the subject at:
31
32							http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
33
34							And the chi-square probability combiner as described here:
35
36							http://www.linuxjournal.com/print.php?sid=6467
37
38							The results are incorporated into SpamAssassin as the BAYES_* rules.
39
40							=head1 METHODS
41
42							=cut
43
44							package Mail::SpamAssassin::Plugin::Bayes;
45
46	22			22		147	use strict;
	22					45
	22					715
47	22			22		131	use warnings;
	22					46
	22					710
48							# use bytes;
49	22			22		125	use re 'taint';
	22					50
	22					1732
50
51							BEGIN {
52	22					120	eval { require Digest::SHA; import Digest::SHA qw(sha1 sha1_hex); 1 }
	22					914
	22					587
53	22	50		22		100	or do { require Digest::SHA1; import Digest::SHA1 qw(sha1 sha1_hex) }
	0					0
	0					0
54							}
55
56	22			22		152	use Mail::SpamAssassin;
	22					48
	22					665
57	22			22		130	use Mail::SpamAssassin::Plugin;
	22					47
	22					577
58	22			22		134	use Mail::SpamAssassin::PerMsgStatus;
	22					49
	22					678
59	22			22		132	use Mail::SpamAssassin::Logger;
	22					65
	22					1411
60	22			22		143	use Mail::SpamAssassin::Util qw(untaint_var);
	22					47
	22					1016
61
62							# pick ONLY ONE of these combining implementations.
63	22			22		7377	use Mail::SpamAssassin::Bayes::CombineChi;
	22					56
	22					13343
64							# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
65
66							our @ISA = qw(Mail::SpamAssassin::Plugin);
67
68							# Which headers should we scan for tokens? Don't use all of them, as it's easy
69							# to pick up spurious clues from some. What we now do is use all of them
70							# less these well-known headers; that way we can pick up spammers' tracking
71							# headers (which are obviously not well-known in advance!).
72
73							# Received is handled specially
74							our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise
75							\|Delivered-To \|Delivery-Date
76							\|(?:X-)?Envelope-To
77							\|X-MIME-Auto[Cc]onverted \|X-Converted-To-Plain-Text
78
79							\|Subject # not worth a tiny gain vs. to db size increase
80
81							# Date: can provide invalid cues if your spam corpus is
82							# older/newer than ham
83							\|Date
84
85							# List headers: ignore. a spamfiltering mailing list will
86							# become a nonspam sign.
87							\|X-List\|(?:X-)?Mailing-List
88							\|(?:X-)?List-(?:Archive\|Help\|Id\|Owner\|Post\|Subscribe
89							\|Unsubscribe\|Host\|Id\|Manager\|Admin\|Comment
90							\|Name\|Url)
91							\|X-Unsub(?:scribe)?
92							\|X-Mailman-Version \|X-Been[Tt]here \|X-Loop
93							\|Mail-Followup-To
94							\|X-eGroups-(?:Return\|From)
95							\|X-MDMailing-List
96							\|X-XEmacs-List
97
98							# gatewayed through mailing list (thanks to Allen Smith)
99							\|(?:X-)?Resent-(?:From\|To\|Date)
100							\|(?:X-)?Original-(?:From\|To\|Date)
101
102							# Spamfilter/virus-scanner headers: too easy to chain from
103							# these
104							\|X-MailScanner(?:-SpamCheck)?
105							\|X-Spam(?:-(?:Status\|Level\|Flag\|Report\|Hits\|Score\|Checker-Version))?
106							\|X-Antispam \|X-RBL-Warning \|X-Mailscanner
107							\|X-MDaemon-Deliver-To \|X-Virus-Scanned
108							\|X-Mass-Check-Id
109							\|X-Pyzor \|X-DCC-\S{2,25}-Metrics
110							\|X-Filtered-B[Yy] \|X-Scanned-By \|X-Scanner
111							\|X-AP-Spam-(?:Score\|Status) \|X-RIPE-Spam-Status
112							\|X-SpamCop-[^:]+
113							\|X-SMTPD \|(?:X-)?Spam-Apparently-To
114							\|SPAM \|X-Perlmx-Spam
115							\|X-Bogosity
116
117							# some noisy Outlook headers that add no good clues:
118							\|Content-Class \|Thread-(?:Index\|Topic)
119							\|X-Original[Aa]rrival[Tt]ime
120
121							# Annotations from IMAP, POP, and MH:
122							\|(?:X-)?Status \|X-Flags \|X-Keywords \|Replied \|Forwarded
123							\|Lines \|Content-Length
124							\|X-UIDL? \|X-IMAPbase
125
126							# Annotations from Bugzilla
127							\|X-Bugzilla-[^:]+
128
129							# Annotations from VM: (thanks to Allen Smith)
130							\|X-VM-(?:Bookmark\|(?:POP\|IMAP)-Retrieved\|Labels\|Last-Modified
131							\|Summary-Format\|VHeader\|v\d-Data\|Message-Order)
132
133							# Annotations from Gnus:
134							\| X-Gnus-Mail-Source
135							\| Xref
136
137							)}x;
138
139							# Note only the presence of these headers, in order to reduce the
140							# hapaxen they generate.
141							our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
142							\|X-(?:Gnu-?PG\|PGP\|GPG)(?:-Key)?-Fingerprint
143							\|D(?:KIM\|omainKey)-Signature
144							)}ix;
145
146							# tweaks tested as of Nov 18 2002 by jm posted to -devel at
147							# http://sourceforge.net/p/spamassassin/mailman/message/12977556/
148							# for results. The winners are now the default settings.
149	22			22		186	use constant IGNORE_TITLE_CASE => 1;
	22					50
	22					1159
150	22			22		122	use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0;
	22					43
	22					898
151	22			22		111	use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1;
	22					45
	22					853
152	22			22		111	use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	22					38
	22					846
153
154							# tweaks by jm on May 12 2003, see -devel email at
155							# http://sourceforge.net/p/spamassassin/mailman/message/14844556/
156	22			22		115	use constant PRE_CHEW_ADDR_HEADERS => 1;
	22					38
	22					817
157	22			22		118	use constant CHEW_BODY_URIS => 1;
	22					37
	22					846
158	22			22		112	use constant CHEW_BODY_MAILADDRS => 1;
	22					38
	22					796
159	22			22		107	use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	22					49
	22					776
160	22			22		106	use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
	22					37
	22					788
161	22			22		119	use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
	22					89
	22					777
162	22			22		110	use constant IGNORE_MSGID_TOKENS => 0;
	22					39
	22					777
163
164							# tweaks of 12 March 2004, see bug 2129.
165	22			22		117	use constant DECOMPOSE_BODY_TOKENS => 1;
	22					47
	22					823
166	22			22		110	use constant MAP_HEADERS_MID => 1;
	22					36
	22					860
167	22			22		113	use constant MAP_HEADERS_FROMTOCC => 1;
	22					39
	22					834
168	22			22		118	use constant MAP_HEADERS_USERAGENT => 1;
	22					39
	22					877
169
170							# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
171	22			22		114	use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
	22					41
	22					913
172	22			22		133	use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
	22					41
	22					2514
173
174							# We store header-mined tokens in the db with a "HHeaderName:val" format.
175							# some headers may contain lots of gibberish tokens, so allow a little basic
176							# compression by mapping the header name at least here. these are the headers
177							# which appear with the most frequency in my db. note: this doesn't have to
178							# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
179							# tokens from multiple different headers may impact accuracy, so might as well
180							# avoid this if possible. These are the top ones from my corpus, BTW (jm).
181							our %HEADER_NAME_COMPRESSION = (
182							'Message-Id' => '*m',
183							'Message-ID' => '*M',
184							'Received' => '*r',
185							'User-Agent' => '*u',
186							'References' => '*f',
187							'In-Reply-To' => '*i',
188							'From' => '*F',
189							'Reply-To' => '*R',
190							'Return-Path' => '*p',
191							'Return-path' => '*rp',
192							'X-Mailer' => '*x',
193							'X-Authentication-Warning' => '*a',
194							'Organization' => '*o',
195							'Organisation' => '*o',
196							'Content-Type' => '*ct',
197							'Content-Disposition' => '*cd',
198							'Content-Transfer-Encoding' => '*ce',
199							'x-spam-relays-trusted' => '*RT',
200							'x-spam-relays-untrusted' => '*RU',
201							);
202
203							# How many seconds should the opportunistic_expire lock be valid?
204							our $OPPORTUNISTIC_LOCK_VALID = 300;
205
206							# Should we use the Robinson f(w) equation from
207							# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
208							# It gives better results, in that scores are more likely to distribute
209							# into the <0.5 range for nonspam and >0.5 for spam.
210	22			22		168	use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
	22					54
	22					1146
211
212							# How many of the most significant tokens should we use for the p(w)
213							# calculation?
214	22			22		127	use constant N_SIGNIFICANT_TOKENS => 150;
	22					51
	22					1021
215
216							# How many significant tokens are required for a classifier score to
217							# be considered usable?
218	22			22		127	use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
	22					53
	22					1190
219
220							# How long a token should we hold onto? (note: German speakers typically
221							# will require a longer token than English ones.)
222	22			22		131	use constant MAX_TOKEN_LENGTH => 15;
	22					57
	22					166045
223
224							###########################################################################
225
226							sub new {
227	63			63	1	229	my $class = shift;
228	63					225	my ($main) = @_;
229
230	63		33			456	$class = ref($class) \|\| $class;
231	63					419	my $self = $class->SUPER::new($main);
232	63					193	bless ($self, $class);
233
234	63					234	$self->{main} = $main;
235	63					218	$self->{conf} = $main->{conf};
236	63					206	$self->{use_ignores} = 1;
237
238	63					335	$self->register_eval_rule("check_bayes");
239	63					528	$self;
240							}
241
242							sub finish {
243	40			40	1	99	my $self = shift;
244	40	100				206	if ($self->{store}) {
245	39					284	$self->{store}->untie_db();
246							}
247	40					118	%{$self} = ();
	40					268
248							}
249
250							###########################################################################
251
252							# Plugin hook.
253							# Return this implementation object, for callers that need to know
254							# it. TODO: callers shouldn't need to know it!
255							# used only in test suite to get access to {store}, internal APIs.
256							#
257	1160			1160	0	1695	sub learner_get_implementation { return shift; }
258
259							###########################################################################
260
261							# Plugin hook.
262							# Called in the parent process shortly before forking off child processes.
263							sub prefork_init {
264	0			0	0	0	my ($self) = @_;
265
266	0	0	0			0	if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
267	0					0	$self->{store}->prefork_init;
268							}
269							}
270
271							###########################################################################
272
273							# Plugin hook.
274							# Called in a child process shortly after being spawned.
275							sub spamd_child_init {
276	0			0	1	0	my ($self) = @_;
277
278	0	0	0			0	if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
279	0					0	$self->{store}->spamd_child_init;
280							}
281							}
282
283							###########################################################################
284
285							# Plugin hook.
286							sub check_bayes {
287	0			0	0	0	my ($self, $pms, $fulltext, $min, $max) = @_;
288
289	0	0				0	return 0 if (!$self->{conf}->{use_learner});
290	0	0	0			0	return 0 if (!$self->{conf}->{use_bayes} \|\| !$self->{conf}->{use_bayes_rules});
291
292	0	0				0	if (!exists ($pms->{bayes_score})) {
293	0					0	my $timer = $self->{main}->time_method("check_bayes");
294	0					0	$pms->{bayes_score} = $self->scan($pms, $pms->{msg});
295							}
296
297	0	0	0			0	if (defined $pms->{bayes_score} &&
			0
			0
			0
298							($min == 0 \|\| $pms->{bayes_score} > $min) &&
299							($max eq "undef" \|\| $pms->{bayes_score} <= $max))
300							{
301	0	0				0	if ($self->{conf}->{detailed_bayes_score}) {
302							$pms->test_log(sprintf ("score: %3.4f, hits: %s",
303							$pms->{bayes_score},
304	0					0	$pms->{bayes_hits}));
305							}
306							else {
307	0					0	$pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
308							}
309	0					0	return 1;
310							}
311
312	0					0	return 0;
313							}
314
315							###########################################################################
316
317							# Plugin hook.
318							sub learner_close {
319	6			6	1	25	my ($self, $params) = @_;
320	6					11	my $quiet = $params->{quiet};
321
322							# do a sanity check here. Weird things happen if we remain tied
323							# after compiling; for example, spamd will never see that the
324							# number of messages has reached the bayes-scanning threshold.
325	6	50				28	if ($self->{store}->db_readable()) {
326	0	0				0	warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
327	0					0	$self->{store}->untie_db();
328							}
329							}
330
331							###########################################################################
332
333							# read configuration items to control bayes behaviour. Called by
334							# BayesStore::read_db_configs().
335							sub read_db_configs {
336	42			42	0	87	my ($self) = @_;
337
338							# use of hapaxes. Set on bayes object, since it controls prob
339							# computation.
340	42					119	$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
341							}
342							###########################################################################
343
344							sub ignore_message {
345	0			0	0	0	my ($self,$PMS) = @_;
346
347	0	0				0	return 0 unless $self->{use_ignores};
348
349	0					0	my $ig_from = $self->{main}->call_plugins ("check_wb_list",
350							{ permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
351	0					0	my $ig_to = $self->{main}->call_plugins ("check_wb_list",
352							{ permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
353
354	0		0			0	my $ignore = $ig_from \|\| $ig_to;
355
356	0	0				0	dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
357
358	0					0	return $ignore;
359							}
360
361							###########################################################################
362
363							# Plugin hook.
364							sub learn_message {
365	10			10	1	55	my ($self, $params) = @_;
366	10					24	my $isspam = $params->{isspam};
367	10					23	my $msg = $params->{msg};
368	10					22	my $id = $params->{id};
369
370	10	50				40	if (!$self->{conf}->{use_bayes}) { return; }
	0					0
371
372	10					49	my $msgdata = $self->get_body_from_msg ($msg);
373	10					22	my $ret;
374
375							eval {
376	10					70	local $SIG{'__DIE__'}; # do not run user die() traps in here
377	10					39	my $timer = $self->{main}->time_method("b_learn");
378
379	10					18	my $ok;
380	10	100				39	if ($self->{main}->{learn_to_journal}) {
381							# If we're going to learn to journal, we'll try going r/o first...
382							# If that fails for some reason, let's try going r/w. This happens
383							# if the DB doesn't exist yet.
384	2		33			11	$ok = $self->{store}->tie_db_readonly() \|\| $self->{store}->tie_db_writable();
385							} else {
386	8					38	$ok = $self->{store}->tie_db_writable();
387							}
388
389	10	100				38	if ($ok) {
390	8					60	$ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
391
392	8	50				51	if (!$self->{main}->{learn_caller_will_untie}) {
393	8					54	$self->{store}->untie_db();
394							}
395							}
396	10					77	1;
397	10	50				21	} or do { # if we died, untie the dbs.
398	0	0				0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	0					0
399	0					0	$self->{store}->untie_db();
400	0					0	die "bayes: (in learn) $eval_stat\n";
401							};
402
403	10					61	return $ret;
404							}
405
406							# this function is trapped by the wrapper above
407							sub _learn_trapped {
408	8			8		37	my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
409	8					28	my @msgid = ( $msgid );
410
411	8	50				31	if (!defined $msgid) {
412	8					35	@msgid = $self->get_msgid($msg);
413							}
414
415	8					33	foreach my $msgid_t ( @msgid ) {
416	12					59	my $seen = $self->{store}->seen_get ($msgid_t);
417
418	12	100				55	if (defined ($seen)) {
419	4	100	66			52	if (($seen eq 's' && $isspam) \|\| ($seen eq 'h' && !$isspam)) {
		50	33
			66
420	2					18	dbg("bayes: $msgid_t already learnt correctly, not learning twice");
421	2					7	return 0;
422							} elsif ($seen !~ /^[hs]$/) {
423	0					0	warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
424							} else {
425							# bug 3704: If the message was already learned, don't try learning it again.
426							# this prevents, for instance, manually learning as spam, then autolearning
427							# as ham, or visa versa.
428	2	50				13	if ($self->{main}->{learn_no_relearn}) {
429	0					0	dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
430	0					0	return 0;
431							}
432
433	2					13	dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
434
435							# kluge so that forget() won't untie the db on us ...
436	2					5	my $orig = $self->{main}->{learn_caller_will_untie};
437	2					5	$self->{main}->{learn_caller_will_untie} = 1;
438
439	2					12	my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
440
441							# reset the value post-forget() ...
442	2					10	$self->{main}->{learn_caller_will_untie} = $orig;
443
444							# forget() gave us a fatal error, so propagate that up
445	2	50				10	if ($fatal) {
446	0					0	dbg("bayes: forget() returned a fatal error, so learn() will too");
447	0					0	return;
448							}
449							}
450
451							# we're only going to have seen this once, so stop if it's been
452							# seen already
453	2					9	last;
454							}
455							}
456
457							# Now that we're sure we haven't seen this message before ...
458	6					18	$msgid = $msgid[0];
459
460	6					44	my $msgatime = $msg->receive_date();
461
462							# If the message atime comes back as being more than 1 day in the
463							# future, something's messed up and we should revert to current time as
464							# a safety measure.
465							#
466	6	50				23	$msgatime = time if ( $msgatime - time > 86400 );
467
468	6					42	my $tokens = $self->tokenize($msg, $msgdata);
469
470	6					19	{ my $timer = $self->{main}->time_method('b_count_change');
	6					39
471	6	100				25	if ($isspam) {
472	4					48	$self->{store}->nspam_nham_change(1, 0);
473	4					34	$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
474							} else {
475	2					16	$self->{store}->nspam_nham_change(0, 1);
476	2					14	$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
477							}
478							}
479
480	6	100				147	$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
481	6					50	$self->{store}->cleanup();
482
483	6					74	$self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
484							isspam => $isspam,
485							msgid => $msgid,
486							msgatime => $msgatime,
487							});
488
489	6					69	dbg("bayes: learned '$msgid', atime: $msgatime");
490
491	6					282	1;
492							}
493
494							###########################################################################
495
496							# Plugin hook.
497							sub forget_message {
498	4			4	1	15	my ($self, $params) = @_;
499	4					11	my $msg = $params->{msg};
500	4					8	my $id = $params->{id};
501
502	4	50				18	if (!$self->{conf}->{use_bayes}) { return; }
	0					0
503
504	4					17	my $msgdata = $self->get_body_from_msg ($msg);
505	4					9	my $ret;
506
507							# we still tie for writing here, since we write to the seen db
508							# synchronously
509							eval {
510	4					28	local $SIG{'__DIE__'}; # do not run user die() traps in here
511	4					17	my $timer = $self->{main}->time_method("b_learn");
512
513	4					11	my $ok;
514	4	50				19	if ($self->{main}->{learn_to_journal}) {
515							# If we're going to learn to journal, we'll try going r/o first...
516							# If that fails for some reason, let's try going r/w. This happens
517							# if the DB doesn't exist yet.
518	0		0			0	$ok = $self->{store}->tie_db_readonly() \|\| $self->{store}->tie_db_writable();
519							} else {
520	4					17	$ok = $self->{store}->tie_db_writable();
521							}
522
523	4	50				22	if ($ok) {
524	4					24	$ret = $self->_forget_trapped ($msg, $msgdata, $id);
525
526	4	100				22	if (!$self->{main}->{learn_caller_will_untie}) {
527	2					12	$self->{store}->untie_db();
528							}
529							}
530	4					38	1;
531	4	50				9	} or do { # if we died, untie the dbs.
532	0	0				0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
	0					0
533	0					0	$self->{store}->untie_db();
534	0					0	die "bayes: (in forget) $eval_stat\n";
535							};
536
537	4					29	return $ret;
538							}
539
540							# this function is trapped by the wrapper above
541							sub _forget_trapped {
542	4			4		15	my ($self, $msg, $msgdata, $msgid) = @_;
543	4					10	my @msgid = ( $msgid );
544	4					7	my $isspam;
545
546	4	50				14	if (!defined $msgid) {
547	4					15	@msgid = $self->get_msgid($msg);
548							}
549
550	4					32	while( $msgid = shift @msgid ) {
551	4					22	my $seen = $self->{store}->seen_get ($msgid);
552
553	4	50				20	if (defined ($seen)) {
554	4	100				90	if ($seen eq 's') {
		50
555	2					7	$isspam = 1;
556							} elsif ($seen eq 'h') {
557	2					4	$isspam = 0;
558							} else {
559	0					0	dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
560	0					0	return 0;
561							}
562
563							# messages should only be learned once, so stop if we find a msgid
564							# which was seen before
565	4					12	last;
566							}
567							else {
568	0					0	dbg("bayes: forget: msgid $msgid not learnt, ignored");
569							}
570							}
571
572							# This message wasn't learnt before, so return
573	4	50				17	if (!defined $isspam) {
		100
574	0					0	dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
575	0					0	return 0;
576							}
577							elsif ($isspam) {
578	2					13	$self->{store}->nspam_nham_change (-1, 0);
579							}
580							else {
581	2					12	$self->{store}->nspam_nham_change (0, -1);
582							}
583
584	4					42	my $tokens = $self->tokenize($msg, $msgdata);
585
586	4	100				15	if ($isspam) {
587	2					15	$self->{store}->multi_tok_count_change (-1, 0, $tokens);
588							} else {
589	2					16	$self->{store}->multi_tok_count_change (0, -1, $tokens);
590							}
591
592	4					107	$self->{store}->seen_delete ($msgid);
593	4					36	$self->{store}->cleanup();
594
595	4					48	$self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
596							isspam => $isspam,
597							msgid => $msgid,
598							});
599
600	4					223	1;
601							}
602
603							###########################################################################
604
605							# Plugin hook.
606							sub learner_sync {
607	2			2	1	13	my ($self, $params) = @_;
608	2	50				16	if (!$self->{conf}->{use_bayes}) { return 0; }
	0					0
609	2					12	dbg("bayes: bayes journal sync starting");
610	2					29	$self->{store}->sync($params);
611	2					16	dbg("bayes: bayes journal sync completed");
612							}
613
614							###########################################################################
615
616							# Plugin hook.
617							sub learner_expire_old_training {
618	0			0	1	0	my ($self, $params) = @_;
619	0	0				0	if (!$self->{conf}->{use_bayes}) { return 0; }
	0					0
620	0					0	dbg("bayes: expiry starting");
621	0					0	my $timer = $self->{main}->time_method("expire_bayes");
622	0					0	$self->{store}->expire_old_tokens($params);
623	0					0	dbg("bayes: expiry completed");
624							}
625
626							###########################################################################
627
628							# Plugin hook.
629							# Check to make sure we can tie() the DB, and we have enough entries to do a scan
630							# if we're told the caller will untie(), go ahead and leave the db tied.
631							sub learner_is_scan_available {
632	146			146	1	461	my ($self, $params) = @_;
633
634	146	50				580	return 0 unless $self->{conf}->{use_bayes};
635	146	100				812	return 0 unless $self->{store}->tie_db_readonly();
636
637							# We need the DB to stay tied, so if the journal sync occurs, don't untie!
638	6					22	my $caller_untie = $self->{main}->{learn_caller_will_untie};
639	6					25	$self->{main}->{learn_caller_will_untie} = 1;
640
641							# Do a journal sync if necessary. Do this before the nspam_nham_get()
642							# call since the sync may cause an update in the number of messages
643							# learnt.
644	6					43	$self->_opportunistic_calls(1);
645
646							# Reset the variable appropriately
647	6					21	$self->{main}->{learn_caller_will_untie} = $caller_untie;
648
649	6					63	my ($ns, $nn) = $self->{store}->nspam_nham_get();
650
651	6	50				41	if ($ns < $self->{conf}->{bayes_min_spam_num}) {
652	0					0	dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
653	0	0				0	if (!$self->{main}->{learn_caller_will_untie}) {
654	0					0	$self->{store}->untie_db();
655							}
656	0					0	return 0;
657							}
658	6	50				27	if ($nn < $self->{conf}->{bayes_min_ham_num}) {
659	0					0	dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
660	0	0				0	if (!$self->{main}->{learn_caller_will_untie}) {
661	0					0	$self->{store}->untie_db();
662							}
663	0					0	return 0;
664							}
665
666	6					29	return 1;
667							}
668
669							###########################################################################
670
671							sub scan {
672	4			4	0	17	my ($self, $permsgstatus, $msg) = @_;
673	4					8	my $score;
674
675	4	50				27	return unless $self->{conf}->{use_learner};
676
677							# When we're doing a scan, we'll guarantee that we'll do the untie,
678							# so override the global setting until we're done.
679	4					18	my $caller_untie = $self->{main}->{learn_caller_will_untie};
680	4					12	$self->{main}->{learn_caller_will_untie} = 1;
681
682	4	50				39	goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
683
684	4	50				22	goto skip unless $self->learner_is_scan_available();
685
686	4					17	my ($ns, $nn) = $self->{store}->nspam_nham_get();
687
688							## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
689							## $self->{raw_counts} = " ns=$ns nn=$nn ";
690							## }
691
692	4					30	dbg("bayes: corpus size: nspam = $ns, nham = $nn");
693
694	4					7	my $msgtokens;
695	4					8	{ my $timer = $self->{main}->time_method('b_tokenize');
	4					25
696	4					21	my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
697	4					37	$msgtokens = $self->tokenize($msg, $msgdata);
698							}
699
700	4					11	my $tokensdata;
701	4					10	{ my $timer = $self->{main}->time_method('b_tok_get_all');
	4					48
702	4					16	$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
	4					213
703							}
704
705	4					73	my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
706
707	4					26	my $probabilities_ref =
708							$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
709
710	4					18	my %pw;
711	4					10	foreach my $tokendata (@{$tokensdata}) {
	4					19
712	1104					1325	my $prob = shift(@$probabilities_ref);
713	1104	100				1667	next unless defined $prob;
714	502					546	my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
	502					800
715	502					1514	$pw{$token} = {
716							prob => $prob,
717							spam_count => $tok_spam,
718							ham_count => $tok_ham,
719							atime => $atime
720							};
721							}
722
723	4					95	my @pw_keys = keys %pw;
724
725							# If none of the tokens were found in the DB, we're going to skip
726							# this message...
727	4	50				28	if (!@pw_keys) {
728	0					0	dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
729	0					0	goto skip;
730							}
731
732	4					8	my $tcount_total = keys %{$msgtokens};
	4					13
733	4					10	my $tcount_learned = scalar @pw_keys;
734
735							# Figure out the message receive time (used as atime below)
736							# If the message atime comes back as being in the future, something's
737							# messed up and we should revert to current time as a safety measure.
738							#
739	4					39	my $msgatime = $msg->receive_date();
740	4					12	my $now = time;
741	4	50				22	$msgatime = $now if ( $msgatime > $now );
742
743	4					9	my @touch_tokens;
744	4					23	my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
745	4					29	my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
746
747	4					367	my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
748	4					57	my $log_each_token = (would_log('dbg', 'bayes') > 1);
749
750							# now take the most significant tokens and calculate probs using
751							# Robinson's formula.
752
753	4					27	@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
	2933					3462
754
755	4	100				20	if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
	2					16
756
757	4					16	my @sorted;
758	4					14	foreach my $tok (@pw_keys) {
759	414	100				657	next if $tok_strength{$tok} <
760							$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
761
762	272					331	my $pw_tok = $pw{$tok};
763	272					364	my $pw_prob = $pw_tok->{prob};
764
765							# What's more expensive, scanning headers for HAMMYTOKENS and
766							# SPAMMYTOKENS tags that aren't there or collecting data that
767							# won't be used? Just collecting the data is certainly simpler.
768							#
769	272		50			564	my $raw_token = $msgtokens->{$tok} \|\| "(unknown)";
770	272					308	my $s = $pw_tok->{spam_count};
771	272					303	my $n = $pw_tok->{ham_count};
772	272					309	my $a = $pw_tok->{atime};
773
774	272	100				299	push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
	272					710
775							[$raw_token, $pw_prob, $s, $n, $a] );
776
777	272					395	push(@sorted, $pw_prob);
778
779							# update the atime on this token, it proved useful
780	272					341	push(@touch_tokens, $tok);
781
782	272	50				500	if ($log_each_token) {
783	0					0	dbg("bayes: token '$raw_token' => $pw_prob");
784							}
785							}
786
787	4	50	50			44	if (!@sorted \|\| (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
788							$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
789							{
790	0					0	dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
791	0					0	goto skip;
792							}
793
794	4					38	$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
795	4					11	undef $timer_compute_prob; # end a timing section
796
797							# Couldn't come up with a probability?
798	4	50				15	goto skip unless defined $score;
799
800	4					67	dbg("bayes: score = $score");
801
802							# no need to call tok_touch_all unless there were significant
803							# tokens and a score was returned
804							# we don't really care about the return value here
805
806	4					8	{ my $timer = $self->{main}->time_method('b_tok_touch_all');
	4					26
807	4					82	$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
808							}
809
810	4					19	my $timer_finish = $self->{main}->time_method('b_finish');
811
812	4					13	$permsgstatus->{bayes_nspam} = $ns;
813	4					12	$permsgstatus->{bayes_nham} = $nn;
814
815							## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
816							## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
817							## }
818
819	4					63	$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
820							probsref => \%pw,
821							score => $score,
822							msgatime => $msgatime,
823							significant_tokens => \@touch_tokens,
824							});
825
826							skip:
827	4	50				22	if (!defined $score) {
828	0					0	dbg("bayes: not scoring message, returning undef");
829							}
830
831	4					17	undef $timer_compute_prob; # end a timing section if still running
832	4	50				14	if (!defined $timer_finish) {
833	4					17	$timer_finish = $self->{main}->time_method('b_finish');
834							}
835
836							# Take any opportunistic actions we can take
837	4	50				17	if ($self->{main}->{opportunistic_expire_check_only}) {
838							# we're supposed to report on expiry only -- so do the
839							# _opportunistic_calls() run for the journal only.
840	0					0	$self->_opportunistic_calls(1);
841	0					0	$permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
842							}
843							else {
844	4					25	$self->_opportunistic_calls();
845							}
846
847							# Do any cleanup we need to do
848	4					27	$self->{store}->cleanup();
849
850							# Reset the value accordingly
851	4					15	$self->{main}->{learn_caller_will_untie} = $caller_untie;
852
853							# If our caller won't untie the db, we need to do it.
854	4	50				23	if (!$caller_untie) {
855	4					30	$self->{store}->untie_db();
856							}
857
858							$permsgstatus->set_tag ('BAYESTCHAMMY',
859	4	50				27	($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
	4					39
860							$permsgstatus->set_tag ('BAYESTCSPAMMY',
861	4	50				18	($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
	4					19
862	4					14	$permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
863	4					23	$permsgstatus->set_tag ('BAYESTC', $tcount_total);
864
865							$permsgstatus->set_tag ('HAMMYTOKENS', sub {
866	0			0		0	my $pms = shift;
867							$self->bayes_report_make_list
868	0					0	($pms, $pms->{bayes_token_info_hammy}, shift);
869	4					53	});
870
871							$permsgstatus->set_tag ('SPAMMYTOKENS', sub {
872	0			0		0	my $pms = shift;
873							$self->bayes_report_make_list
874	0					0	($pms, $pms->{bayes_token_info_spammy}, shift);
875	4					25	});
876
877							$permsgstatus->set_tag ('TOKENSUMMARY', sub {
878	0			0		0	my $pms = shift;
879	0	0				0	if ( defined $pms->{tag_data}{BAYESTC} )
880							{
881							my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
882							- $pms->{tag_data}{BAYESTCSPAMMY}
883	0					0	- $pms->{tag_data}{BAYESTCHAMMY};
884							my $tcount_new = $pms->{tag_data}{BAYESTC}
885	0					0	- $pms->{tag_data}{BAYESTCLEARNED};
886	0					0	"Tokens: new, $tcount_new; "
887							."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
888							."neutral, $tcount_neutral; "
889							."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
890							} else {
891	0					0	"Bayes not run.";
892							}
893	4					36	});
894
895
896	4					645	return $score;
897							}
898
899							###########################################################################
900
901							# Plugin hook.
902							sub learner_dump_database {
903	0			0	1	0	my ($self, $params) = @_;
904	0					0	my $magic = $params->{magic};
905	0					0	my $toks = $params->{toks};
906	0					0	my $regex = $params->{regex};
907
908							# allow dump to occur even if use_bayes disables everything else ...
909							#return 0 unless $self->{conf}->{use_bayes};
910	0	0				0	return 0 unless $self->{store}->tie_db_readonly();
911
912	0					0	my @vars = $self->{store}->get_storage_variables();
913
914	0					0	my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
915
916	0					0	my $template = '%3.3f %10u %10u %10u %s'."\n";
917
918	0	0				0	if ( $magic ) {
919	0	0				0	printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
920							or die "Error writing: $!";
921	0	0				0	printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
922							or die "Error writing: $!";
923	0	0				0	printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
924							or die "Error writing: $!";
925	0	0				0	printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
926							or die "Error writing: $!";
927	0	0				0	printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
928							or die "Error writing: $!";
929	0	0				0	if ( $bv >= 2 ) {
930	0	0				0	printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
931							or die "Error writing: $!";
932							}
933	0	0				0	if ( $bv < 2 ) {
934	0	0				0	printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
935							or die "Error writing: $!";
936							}
937	0	0				0	if ( $bv >= 2 ) {
938	0	0				0	printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
939							or die "Error writing: $!";
940							}
941	0	0				0	printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
942							or die "Error writing: $!";
943	0	0				0	if ( $bv >= 2 ) {
944	0	0				0	printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
945							or die "Error writing: $!";
946
947	0	0				0	printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
948							or die "Error writing: $!";
949							}
950							}
951
952	0	0				0	if ( $toks ) {
953							# let the store sort out the db_toks
954	0					0	$self->{store}->dump_db_toks($template, $regex, @vars);
955							}
956
957	0	0				0	if (!$self->{main}->{learn_caller_will_untie}) {
958	0					0	$self->{store}->untie_db();
959							}
960	0					0	return 1;
961							}
962
963							###########################################################################
964							# TODO: these are NOT public, but the test suite needs to call them.
965
966							sub get_msgid {
967	14			14	0	45	my ($self, $msg) = @_;
968
969	14					22	my @msgid;
970
971	14					47	my $msgid = $msg->get_header("Message-Id");
972	14	50	33			187	if (defined $msgid && $msgid ne '' && $msgid !~ /^\s<\s(?:\@sa_generated)?>.*$/) {
			33
973							# remove \r and < and > prefix/suffixes
974	14					36	chomp $msgid;
975	14					63	$msgid =~ s/^<//; $msgid =~ s/>.*$//g;
	14					71
976	14					49	push(@msgid, $msgid);
977							}
978
979							# Modified 2012-01-17 per bug 5185 to remove last received from msg_id calculation
980
981							# Use sha1_hex(Date: and top N bytes of body)
982							# where N is MIN(1024 bytes, 1/2 of body length)
983							#
984	14					42	my $date = $msg->get_header("Date");
985	14	50	33			70	$date = "None" if (!defined $date \|\| $date eq ''); # No Date?
986
987							#Removed per bug 5185
988							#my @rcvd = $msg->get_header("Received");
989							#my $rcvd = $rcvd[$#rcvd];
990							#$rcvd = "None" if (!defined $rcvd \|\| $rcvd eq ''); # No Received?
991
992							# Make a copy since pristine_body is a reference ...
993	14					64	my $body = join('', $msg->get_pristine_body());
994
995	14	50				47	if (length($body) > 64) { # Small Body?
996	14	50				32	my $keep = ( length $body > 2048 ? 1024 : int(length($body) / 2) );
997	14					42	substr($body, $keep) = '';
998							}
999
1000							#Stripping all CR and LF so that testing midstream from MTA and post delivery don't
1001							#generate different id's simply because of LF<->CR<->CRLF changes.
1002	14					257	$body =~ s/[\r\n]//g;
1003
1004	14					217	unshift(@msgid, sha1_hex($date."\000".$body).'@sa_generated');
1005
1006	14	50				69	return wantarray ? @msgid : $msgid[0];
1007							}
1008
1009							sub get_body_from_msg {
1010	20			20	0	73	my ($self, $msg) = @_;
1011
1012	20	50				74	if (!ref $msg) {
1013							# I have no idea why this seems to happen. TODO
1014	0					0	warn "bayes: msg not a ref: '$msg'";
1015	0					0	return { };
1016							}
1017
1018							my $permsgstatus =
1019	20					174	Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
1020	20					153	$msg->extract_message_metadata ($permsgstatus);
1021	20					116	my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
1022	20					110	$permsgstatus->finish();
1023
1024	20	50				76	if (!defined $msgdata) {
1025							# why?!
1026	0					0	warn "bayes: failed to get body for ".scalar($self->get_msgid($self->{msg}))."\n";
1027	0					0	return { };
1028							}
1029
1030	20					80	return $msgdata;
1031							}
1032
1033							sub _get_msgdata_from_permsgstatus {
1034	24			24		63	my ($self, $pms) = @_;
1035
1036	24					63	my $t_src = $self->{conf}->{bayes_token_sources};
1037	24					59	my $msgdata = { };
1038							$msgdata->{bayes_token_body} =
1039	24	50				184	$pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
1040							$msgdata->{bayes_token_inviz} =
1041	24	50				158	$pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
1042							$msgdata->{bayes_mimepart_digests} =
1043	24	50				69	$pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
1044	24					75	@{$msgdata->{bayes_token_uris}} =
1045	24	50				171	$pms->get_uri_list() if $t_src->{uri};
1046	24					62	return $msgdata;
1047							}
1048
1049							###########################################################################
1050
1051							# The calling functions expect a uniq'ed array of tokens ...
1052							sub tokenize {
1053	16			16	0	56	my ($self, $msg, $msgdata) = @_;
1054
1055	16					49	my $t_src = $self->{conf}->{bayes_token_sources};
1056	16					32	my @tokens;
1057
1058							# visible tokens from the body
1059	16	50				60	if ($msgdata->{bayes_token_body}) {
1060							my(@t) = map($self->_tokenize_line ($_, '', 1),
1061	16					34	@{$msgdata->{bayes_token_body}} );
	16					92
1062	16					111	dbg("bayes: tokenized body: %d tokens", scalar @t);
1063	16					1099	push(@tokens, @t);
1064							}
1065							# the URI list
1066	16	50				64	if ($msgdata->{bayes_token_uris}) {
1067							my(@t) = map($self->_tokenize_line ($_, '', 2),
1068	16					32	@{$msgdata->{bayes_token_uris}} );
	16					79
1069	16					78	dbg("bayes: tokenized uri: %d tokens", scalar @t);
1070	16					87	push(@tokens, @t);
1071							}
1072							# add invisible tokens
1073	16	50				59	if ($msgdata->{bayes_token_inviz}) {
1074	16					36	my $tokprefix;
1075	16					24	if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
	16					48
1076	16					33	if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
1077	16	50				39	if (defined $tokprefix) {
1078							my(@t) = map($self->_tokenize_line ($_, $tokprefix, 1),
1079	16					31	@{$msgdata->{bayes_token_inviz}} );
	16					48
1080	16					50	dbg("bayes: tokenized invisible: %d tokens", scalar @t);
1081	16					43	push(@tokens, @t);
1082							}
1083							}
1084
1085							# add digests and Content-Type of all MIME parts
1086	16	50				55	if ($msgdata->{bayes_mimepart_digests}) {
1087	0					0	my %shorthand = ( # some frequent MIME part contents for human readability
1088							'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
1089							'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
1090							'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
1091							'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
1092							'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
1093							'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
1094							'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
1095							);
1096							my(@t) = map('MIME:' . ($shorthand{$_} \|\| $_),
1097	0		0			0	@{ $msgdata->{bayes_mimepart_digests} });
	0					0
1098	0					0	dbg("bayes: tokenized mime parts: %d tokens", scalar @t);
1099	0					0	dbg("bayes: mime-part token %s", $_) for @t;
1100	0					0	push(@tokens, @t);
1101							}
1102
1103							# Tokenize the headers
1104	16	50				59	if ($t_src->{header}) {
1105	16					28	my(@t);
1106	16					81	my %hdrs = $self->_tokenize_headers ($msg);
1107	16					98	while( my($prefix, $value) = each %hdrs ) {
1108	200					522	push(@t, $self->_tokenize_line ($value, "H$prefix:", 0));
1109							}
1110	16					74	dbg("bayes: tokenized header: %d tokens", scalar @t);
1111	16					341	push(@tokens, @t);
1112							}
1113
1114							# Go ahead and uniq the array, skip null tokens (can happen sometimes)
1115							# generate an SHA1 hash and take the lower 40 bits as our token
1116	16					44	my %tokens;
1117	16					67	foreach my $token (@tokens) {
1118							# dbg("bayes: token: %s", $token);
1119	6746	50				28857	$tokens{substr(sha1($token), -5)} = $token if $token ne '';
1120							}
1121
1122							# return the keys == tokens ...
1123	16					399	return \%tokens;
1124							}
1125
1126							sub _tokenize_line {
1127	538			538		738	my $self = $_[0];
1128	538					663	my $tokprefix = $_[2];
1129	538					592	my $region = $_[3];
1130	538					810	local ($_) = $_[1];
1131
1132	538					663	my @rettokens;
1133
1134							# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
1135							# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
1136							# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
1137
1138							### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
1139
1140							### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
1141	538					1838	s{ ( [A-Za-z0-9,@*!_'"\$. -]+ \|
1142							[\xC0-\xDF][\x80-\xBF] \|
1143							[\xE0-\xEF][\x80-\xBF]{2} \|
1144							[\xF0-\xF4][\x80-\xBF]{3} \|
1145	4844	100				13359	[\xA1-\xFF] ) \| . }
1146							{ defined $1 ? $1 : ' ' }xsge;
1147							# should we also turn NBSP ( \xC2\xA0 ) into space?
1148
1149							# DO split on "..." or "--" or "---"; common formatting error resulting in
1150							# hapaxes. Keep the separator itself as a token, though, as long ones can
1151	538					1222	# be good spamsigns.
1152	538					792	s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
1153							s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
1154	538					595
1155	538	100	100			1378	if (IGNORE_TITLE_CASE) {
1156							if ($region == 1 \|\| $region == 2) {
1157							# lower-case Title Case at start of a full-stop-delimited line (as would
1158	338					2058	# be seen in a Western language).
	242					1825
1159							s/(?:^\|\.\s+)([A-Z])([^A-Z]+)(?:\s\|$)/ ' '. (lc $1) . $2 . ' ' /ge;
1160							}
1161							}
1162	538					1449
1163							my $magic_re = $self->{store}->get_magic_re();
1164
1165							# Note that split() in scope of 'use bytes' results in words with utf8 flag
1166							# cleared, even if the source string has perl characters semantics !!!
1167							# Is this really still desirable?
1168	538					2483
1169	8598					12986	foreach my $token (split) {
1170	8598					11129	$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
1171							$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
1172
1173							# Skip false magic tokens
1174							# TVD: we need to do a defined() check since SQL doesn't have magic
1175							# tokens, so the SQL BayesStore returns undef. I really want a way
1176							# of optimizing that out, but I haven't come up with anything yet.
1177	8598	50	33			28654	#
1178							next if ( defined $magic_re && $token =~ /$magic_re/ );
1179
1180	8598					10334	# do keep 3-byte tokens; there's some solid signs in there
1181							my $len = length($token);
1182
1183							# but extend the stop-list. These are squarely in the gray
1184							# area, and it just slows us down to record them.
1185							# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
1186	8598	100	100			24715	#
1187							next if $len < 3 \|\|
1188							($token =~ /^(?:a(?:ble\|l(?:ready\|l)\|n[dy]\|re)\|b(?:ecause\|oth)\|c(?:an\|ome)\|e(?:ach\|mail\|ven)\|f(?:ew\|irst\|or\|rom)\|give\|h(?:a(?:ve\|s)\|ttp)\|i(?:n(?:formation\|to)\|t\'s)\|just\|know\|l(?:ike\|o(?:ng\|ok))\|m(?:a(?:de\|il(?:(?:ing\|to))?\|ke\|ny)\|o(?:re\|st)\|uch)\|n(?:eed\|o[tw]\|umber)\|o(?:ff\|n(?:ly\|e)\|ut\|wn)\|p(?:eople\|lace)\|right\|s(?:ame\|ee\|uch)\|t(?:h(?:at\|is\|rough\|e)\|ime)\|using\|w(?:eb\|h(?:ere\|y)\|ith(?:out)?\|or(?:ld\|k))\|y(?:ears?\|ou(?:(?:\'re\|r))?))$/i);
1189
1190	5236	100	100			10043	# are we in the body? If so, apply some body-specific breakouts
1191	3540	100				7234	if ($region == 1 \|\| $region == 2) {
		100
1192	44					175	if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
1193							push (@rettokens, $self->_tokenize_mail_addrs ($token));
1194							}
1195	110					253	elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
1196	110					152	push (@rettokens, "UD:".$token); # the full token
	110					468
1197	132					473	my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
1198							push (@rettokens, "UD:".$1); # UD = URL domain
1199							}
1200							}
1201							}
1202
1203							# note: do not trim down overlong tokens if they contain '*'. This is
1204							# used as part of split tokens such as "HTo:D*net" indicating that
1205							# the domain ".net" appeared in the To header.
1206	5236	100	100			9040	#
1207							if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
1208	256	50				492
1209							if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
1210							# Bug 7135
1211	0					0	# collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
1212							my(@t) = $token =~ /( (?: [\xE0-\xEF] \| [\xF0-\xF4][\x80-\xBF] )
1213	0	0				0	[\x80-\xBF]{2} )/xsg;
1214	0					0	if (@t) {
1215	0					0	push (@rettokens, map($tokprefix.'u8:'.$_, @t));
1216							next;
1217							}
1218							}
1219	256					267
1220							if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
1221							# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
1222							# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
1223							# to me! (jm)
1224							while ($token =~ s/^(..?)//) {
1225							push (@rettokens, $tokprefix.'8:'.$1);
1226							}
1227							next;
1228							}
1229	256	100	100			924
			100
			100
			50
			66
1230							if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1231							\|\| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
1232							\|\| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
1233							{
1234							# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
1235							# Spambayes trick via Matt: Just retain 7 chars. Do not retain the
1236							# length, it does not help; see jm's mail to -devel on Nov 20 2002 at
1237							# http://sourceforge.net/p/spamassassin/mailman/message/12977605/
1238							# "sk:" stands for "skip".
1239							# Bug 7141: retain seven UTF-8 chars (or other bytes),
1240	232					1571	# if followed by at least two bytes
1241							$token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] \|
1242							[\xC0-\xDF][\x80-\xBF] \|
1243							[\xE0-\xEF][\x80-\xBF]{2} \|
1244							[\xF0-\xF4][\x80-\xBF]{3} \| . ){7} ))
1245							.{2,} \z }{sk:$1}xs;
1246							## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
1247							}
1248							}
1249
1250	5236	100	100			9603	# decompose tokens? do this after shortening long tokens
1251	3540					3969	if ($region == 1 \|\| $region == 2) {
1252	3540	100				6049	if (DECOMPOSE_BODY_TOKENS) {
1253	364					485	if ($token =~ /[^\w:\*]/) {
1254	364					1065	my $decompd = $token; # "Foo!"
1255	364					768	$decompd =~ s/[^\w:\*]//gs;
1256							push (@rettokens, $tokprefix.$decompd); # "Foo"
1257							}
1258	3540	100				5798
1259	712					913	if ($token =~ /[A-Z]/) {
	712					914
1260	712					1229	my $decompd = $token; $decompd = lc $decompd;
1261							push (@rettokens, $tokprefix.$decompd); # "foo!"
1262	712	100				1371
1263	76					165	if ($token =~ /[^\w:\*]/) {
1264	76					164	$decompd =~ s/[^\w:\*]//gs;
1265							push (@rettokens, $tokprefix.$decompd); # "foo"
1266							}
1267							}
1268							}
1269							}
1270	5236					10090
1271							push (@rettokens, $tokprefix.$token);
1272							}
1273	538					3801
1274							return @rettokens;
1275							}
1276
1277	16			16		46	sub _tokenize_headers {
1278							my ($self, $msg) = @_;
1279	16					33
1280							my %parsed;
1281
1282	16					34	my %user_ignore;
	16					87
1283							$user_ignore{lc $_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
1284
1285	16					51	# get headers in array context
1286							my @hdrs;
1287	16					104	my @rcvdlines;
1288							for ($msg->get_all_headers()) {
1289	202	100				406	# first, keep a copy of Received headers, so we can strip down to last 2
1290	52					87	if (/^Received:/i) {
1291	52					78	push(@rcvdlines, $_);
1292							next;
1293							}
1294	150	100				1820	# and now skip lines for headers we don't want (including all Received)
1295	102					132	next if /^${IGNORED_HDRS}:/i;
1296	102					207	next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
1297							push(@hdrs, $_);
1298	16					142	}
1299							push(@hdrs, $msg->get_all_metadata());
1300
1301							# and re-add the last 2 received lines: usually a good source of
1302	16	100				63	# spamware tokens and HELO names.
	14					38
1303	16	100				49	if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
	14					36
1304							if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
1305	16					42
1306	194	50				524	for (@hdrs) {
1307	194					521	next unless /\S/;
1308							my ($hdr, $val) = split(/:/, $_, 2);
1309
1310							# remove user-specified headers here, after Received, in case they
1311	194	50				406	# want to ignore that too
1312							next if exists $user_ignore{lc $hdr};
1313
1314	194		50			303	# Prep the header value
1315	194					267	$val \|\|= '';
1316							chomp($val);
1317
1318	194	100				1255	# special tokenization for some headers:
		100
		100
		100
		100
		50
1319	16					90	if ($hdr =~ /^(?:\|X-\|Resent-)Message-Id$/i) {
1320							$val = $self->_pre_chew_message_id ($val);
1321							}
1322							elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:\|X-\|Resent-)
1323							(?:Return-Path\|From\|To\|Cc\|Reply-To\|Errors-To\|Mail-Followup-To\|Sender)$/ix)
1324	60					157	{
1325							$val = $self->_pre_chew_addr_header ($val);
1326							}
1327	28					96	elsif ($hdr eq 'Received') {
1328							$val = $self->_pre_chew_received ($val);
1329							}
1330	4					23	elsif ($hdr eq 'Content-Type') {
1331							$val = $self->_pre_chew_content_type ($val);
1332							}
1333	2					10	elsif ($hdr eq 'MIME-Version') {
1334							$val =~ s/1\.0//; # totally innocuous
1335							}
1336	0					0	elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
1337							$val = "1"; # just mark the presence, they create lots of hapaxen
1338							}
1339	194					285
1340	194	100				435	if (MAP_HEADERS_MID) {
1341	16					50	if ($hdr =~ /^(?:In-Reply-To\|References\|Message-ID)$/i) {
1342							$parsed{"*MI"} = $val;
1343							}
1344	194					216	}
1345	194	100				430	if (MAP_HEADERS_FROMTOCC) {
1346	32					73	if ($hdr =~ /^(?:From\|To\|Cc)$/i) {
1347							$parsed{"*Ad"} = $val;
1348							}
1349	194					220	}
1350	194	50				440	if (MAP_HEADERS_USERAGENT) {
1351	0					0	if ($hdr =~ /^(?:X-Mailer\|User-Agent)$/i) {
1352							$parsed{"*UA"} = $val;
1353							}
1354							}
1355
1356	194	100				408	# replace hdr name with "compressed" version if possible
1357	126					232	if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
1358							$hdr = $HEADER_NAME_COMPRESSION{$hdr};
1359							}
1360	194	100				336
1361	26					72	if (exists $parsed{$hdr}) {
1362							$parsed{$hdr} .= " ".$val;
1363	168					369	} else {
1364							$parsed{$hdr} = $val;
1365	194	50				395	}
1366	0					0	if (would_log('dbg', 'bayes') > 1) {
1367							dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
1368							}
1369							}
1370	16					204
1371							return %parsed;
1372							}
1373
1374	4			4		14	sub _pre_chew_content_type {
1375							my ($self, $val) = @_;
1376
1377	4	50				30	# hopefully this will retain good bits without too many hapaxen
1378	0					0	if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
1379	0	0				0	my $boundary = $1;
1380	0					0	$boundary = '' if !defined $boundary; # avoid a warning
1381							$boundary =~ s/[a-fA-F0-9]/H/gs;
1382	0					0	# break up blocks of separator chars so they become their own tokens
1383	0					0	$boundary =~ s/([-_\.=]+)/ $1 /gs;
1384							$val .= $boundary;
1385							}
1386
1387	4					36	# stop-list words for Content-Type header: these wind up totally gray
1388							$val =~ s/\b(?:text\|charset)\b//;
1389	4					13
1390							$val;
1391							}
1392
1393	16			16		47	sub _pre_chew_message_id {
1394							my ($self, $val) = @_;
1395							# we can (a) get rid of a lot of hapaxen and (b) increase the token
1396							# specificity by pre-parsing some common formats.
1397
1398	16					52	# Outlook Express format:
1399							$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
1400							([0-9a-f]{4})[0-9a-f]{4}\$
1401							([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
1402
1403	16					30	# Exim:
1404							$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
1405
1406	16					39	# Sendmail:
1407							$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
1408							[A-F0-9]{10,12}\@//gx;
1409
1410							# try to split Message-ID segments on probable ID boundaries. Note that
1411							# Outlook message-ids seem to contain a server identifier ID in the last
1412							# 8 bytes before the @. Make sure this becomes its own token, it's a
1413	16					90	# great spam-sign for a learning system! Be sure to split on ".".
1414	16					44	$val =~ s/[^_A-Za-z0-9]/ /g;
1415							$val;
1416							}
1417
1418	28			28		59	sub _pre_chew_received {
1419							my ($self, $val) = @_;
1420
1421							# Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
1422							# and valid-format RFC-822/2822 dates
1423	28					106
1424	28					88	$val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
1425	28					185	$val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
1426	28					70	$val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
1427							$val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
1428	28					211
1429							$val =~ s/(?:(?:Mon\|Tue\|Wed\|Thu\|Fri\|Sat\|Sun),\s)?
1430							[0-3\s]?[0-9]\s
1431							(?:Jan\|Feb\|Ma[ry]\|Apr\|Ju[nl]\|Aug\|Sep\|Oct\|Nov\|Dec)\s
1432							(?:19\|20)?[0-9]{2}\s
1433							[0-2][0-9](?:\:[0-5][0-9]){1,2}\s
1434							(?:\s$\|$\|\s(?:[+-][0-9]{4})\|\s(?:UT\|[A-Z]{2,3}T))
1435							//gx;
1436
1437							# IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
1438							# IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
1439							# (on both sides)
1440							# also make a dup with the full IP, as fodder for
1441	28					214	# bayes_dump_to_trusted_networks: "Hr:ipaaa.bbb.ccc.ddd"
1442	30	50	33			218	$val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
			33
1443	0					0	if ($2 eq '10' \|\| ($2 eq '192' && $3 eq '168')) {
1444							$1.$2.$3.$4.
1445							" ip*".$1.$2.$3.$4." ";
1446	30					287	} else {
1447							$1.$2.$3.
1448							" ip*".$1.$2.$3.$4." ";
1449							}
1450							}gex;
1451
1452							# trim these: they turn out as the most common tokens, but with a
1453	28					294	# prob of about .5. waste of space!
1454							$val =~ s/\b(?:with\|from\|for\|SMTP\|ESMTP)\b/ /g;
1455	28					70
1456							$val;
1457							}
1458
1459	60			60		114	sub _pre_chew_addr_header {
1460	60					98	my ($self, $val) = @_;
1461							local ($_);
1462	60					191
1463	60					86	my @addrs = $self->{main}->find_all_addrs_in_line ($val);
1464	60					111	my @toks;
1465	48					110	foreach (@addrs) {
1466							push (@toks, $self->_tokenize_mail_addrs ($_));
1467	60					212	}
1468							return join (' ', @toks);
1469							}
1470
1471	92			92		179	sub _tokenize_mail_addrs {
1472							my ($self, $addr) = @_;
1473	92	50				399
1474	92					131	($addr =~ /(.+)\@(.+)$/) or return ();
1475	92					334	my @toks;
1476	92					169	push(@toks, "U".$1, "D".$2);
	92					379
	84					303
1477	92					305	$_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
1478							return @toks;
1479							}
1480
1481
1482							###########################################################################
1483
1484							# compute the probability that a token is spammish for each token
1485	4			4		22	sub _compute_prob_for_all_tokens {
1486	4					9	my ($self, $tokensdata, $ns, $nn) = @_;
1487							my @probabilities;
1488	4	50	33			47
1489							return if !$ns \|\| !$nn;
1490	4					27
1491	4					11	my $threshold = 1; # ignore low-freq tokens below this s+n threshold
1492							if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1493							$threshold = 10;
1494	4	50				27	}
1495	0					0	if (!$self->{use_hapaxes}) {
1496							$threshold = 2;
1497							}
1498	4					10
	4					15
1499	1104					1326	foreach my $tokendata (@{$tokensdata}) {
1500	1104					1184	my $s = $tokendata->[1]; # spam count
1501	1104					1193	my $n = $tokendata->[2]; # ham count
1502							my $prob;
1503	22			22		225
	22					55
	22					25589
1504	1104	100				1589	no warnings 'uninitialized'; # treat undef as zero in addition
1505							if ($s + $n >= $threshold) {
1506							# ignoring low-freq tokens, also covers the (!$s && !$n) case
1507
1508							# my $ratios = $s / $ns;
1509							# my $ration = $n / $nn;
1510							# $prob = $ratios / ($ration + $ratios);
1511	502					685	#
1512							$prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
1513	502					526
1514							if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
1515							# use Robinson's f(x) equation for low-n tokens, instead of just
1516	502					590	# ignoring them
1517	502					707	my $robn = $s + $n;
1518							$prob =
1519							($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
1520							/
1521							($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
1522							}
1523							}
1524
1525							# 'log_raw_counts' is used to log the raw data for the Bayes equations
1526							# during a mass-check, allowing the S and X constants to be optimized
1527							# quickly without requiring re-tokenization of the messages for each
1528							# attempt. There's really no need for this code to be uncommented in
1529							# normal use, however. It has never been publicly documented, so
1530							# commenting it out is fine. ;)
1531							#
1532							## if ($self->{log_raw_counts}) {
1533							## $self->{raw_counts} .= " s=$s,n=$n ";
1534							## }
1535	1104					1622
1536							push(@probabilities, $prob);
1537	4					19	}
1538							return \@probabilities;
1539							}
1540
1541							# compute the probability that a token is spammish
1542	0			0		0	sub _compute_prob_for_token {
1543							my ($self, $token, $ns, $nn, $s, $n) = @_;
1544
1545							# we allow the caller to give us the token information, just
1546	0	0	0			0	# to save a potentially expensive lookup
1547	0					0	if (!defined($s) \|\| !defined($n)) {
1548							($s, $n, undef) = $self->{store}->tok_get($token);
1549	0	0	0			0	}
1550							return if !$s && !$n;
1551	0					0
1552							my $probabilities_ref =
1553							$self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
1554	0					0
1555							return $probabilities_ref->[0];
1556							}
1557
1558							###########################################################################
1559							# If a token is neither hammy nor spammy, return 0.
1560							# For a spammy token, return the minimum number of additional ham messages
1561							# it would have had to appear in to no longer be spammy. Hammy tokens
1562							# are handled similarly. That's what the function does (at the time
1563							# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
1564							# more useful if it returned the number of /additional/ ham messages
1565							# a spammy token would have to appear in to no longer be spammy but I
1566							# fear that might require the solution to a cubic equation, and I
1567							# just don't have the time for that now.
1568
1569	0			0		0	sub _compute_declassification_distance {
1570							my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
1571	0	0	0			0
1572							return 0 if $ns == 0 && $nn == 0;
1573	0					0
1574	0	0				0	if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
	0	0				0
1575							if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
1576	0	0	0			0
1577	0	0				0	return 0 if $Ns == 0 \|\| $Nn == 0;
1578							return 0 if abs( $prob - 0.5 ) <
1579							$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1580	0	0				0
1581	0					0	my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
1582							my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
1583	0					0
1584							return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
1585							unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
1586	0					0
1587	0					0	my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
1588	0					0	my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
1589	0					0	my $a = $Nb * ( 1 - $p );
1590	0					0	my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
1591	0					0	my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
1592	0	0				0	my $discrim = $b * $b - 4 * $a * $c;
1593	0					0	my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
1594							my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
1595
1596	0	0				0	# This shouldn't be necessary. Should not be < 1
1597							return $dd_exact < 1 ? 1 : int($dd_exact);
1598							}
1599
1600							###########################################################################
1601
1602	10			10		30	sub _opportunistic_calls {
1603							my($self, $journal_only) = @_;
1604
1605	10	50				133	# If we're not already tied, abort.
1606	0					0	if (!$self->{store}->db_readable()) {
1607	0					0	dbg("bayes: opportunistic call attempt failed, DB not readable");
1608							return;
1609							}
1610
1611	10					60	# Is an expire or sync running?
1612	10	50	33			50	my $running_expire = $self->{store}->get_running_expire_tok();
1613	0					0	if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
1614	0					0	dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
1615							return;
1616							}
1617
1618	10	50	66			223	# handle expiry and syncing
		50
1619	0					0	if (!$journal_only && $self->{store}->expiry_due()) {
1620							dbg("bayes: opportunistic call found expiry due");
1621
1622							# sync will bring the DB R/W as necessary, and the expire will remove
1623	0					0	# the running_expire token, may untie as well.
1624							$self->{main}->{bayes_scanner}->sync(1,1);
1625							}
1626	0					0	elsif ( $self->{store}->sync_due() ) {
1627							dbg("bayes: opportunistic call found journal sync due");
1628
1629	0					0	# sync will bring the DB R/W as necessary, may untie as well
1630							$self->{main}->{bayes_scanner}->sync(1,0);
1631
1632	0	0				0	# We can only remove the running_expire token if we're doing R/W
1633	0					0	if ($self->{store}->db_writable()) {
1634							$self->{store}->remove_running_expire_tok();
1635							}
1636							}
1637	10					30
1638							return;
1639							}
1640
1641							###########################################################################
1642
1643	63			63	1	206	sub learner_new {
1644							my ($self) = @_;
1645	63					142
1646	63					332	my $store;
1647	63	100				300	my $module = untaint_var($self->{conf}->{bayes_store_module});
1648							$module = 'Mail::SpamAssassin::BayesStore::DBM' if !$module;
1649	63					266
1650	63					206	dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
1651							undef $self->{store}; # DESTROYs previous object, if any
1652							eval '
1653							require '.$module.';
1654							$store = '.$module.'->new($self);
1655	63	50				6736	1;
1656	0	0				0	' or do {
	0					0
1657	0					0	my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
1658							die "bayes: learner_new $module new() failed: $eval_stat\n";
1659							};
1660	63					485
1661	63					174	dbg("bayes: learner_new: got store=%s", $store);
1662							$self->{store} = $store;
1663	63					270
1664							$self;
1665							}
1666
1667							###########################################################################
1668
1669	0			0	0		sub bayes_report_make_list {
1670	0	0					my ($self, $pms, $info, $param) = @_;
1671							return "Tokens not available." unless defined $info;
1672	0		0
1673							my ($limit,$fmt_arg,$more) = split /,/, ($param \|\| '5');
1674	0
1675							my %formats = (
1676							short => '$t',
1677							Short => 'Token: \"$t\"',
1678							compact => '$p-$D--$t',
1679							Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
1680							medium => '$p-$D-$N--$t',
1681							long => '$p-$d--${h}h-${s}s--${a}d--$t',
1682							Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
1683							);
1684	0	0
1685							my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
1686	0	0
1687							return "Invalid format, must be one of: ".join(",",keys %formats)
1688							unless defined $raw_fmt;
1689	0
1690	0	0					my $fmt = '"'.$raw_fmt.'"';
1691	0	0					my $amt = $limit < @$info ? $limit : @$info;
1692							return "" unless $amt;
1693	0
1694	0						my $ns = $pms->{bayes_nspam};
1695	0	0		0			my $nh = $pms->{bayes_nham};
	0
1696	0						my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
1697							my $now = time;
1698
1699	0						join ', ', map {
1700	0						my($t,$prob,$s,$h,$u) = @$_;
1701	0						my $a = int(($now - $u)/(3600 * 24));
1702	0						my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
1703	0						my $p = sprintf "%.3f", $prob;
1704	0	0					my $n = $s + $h;
1705	0						my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
1706	0						my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
1707	0						eval $fmt; ## no critic
	0
1708							} @{$info}[0..$amt-1];
1709							}
1710
1711							1;