File Coverage

blib/lib/Mail/SpamAssassin/Bayes/CombineChi.pm

Criterion	Covered	Total	%
statement	50	54	92.5
branch	6	10	60.0
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	64	72	88.8

line	stmt	bran	sub	time	code
1					# Chi-square probability combining and related constants.
2					#
3					# <@LICENSE>
4					# Licensed to the Apache Software Foundation (ASF) under one or more
5					# contributor license agreements. See the NOTICE file distributed with
6					# this work for additional information regarding copyright ownership.
7					# The ASF licenses this file to you under the Apache License, Version 2.0
8					# (the "License"); you may not use this file except in compliance with
9					# the License. You may obtain a copy of the License at:
10					#
11					# http://www.apache.org/licenses/LICENSE-2.0
12					#
13					# Unless required by applicable law or agreed to in writing, software
14					# distributed under the License is distributed on an "AS IS" BASIS,
15					# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16					# See the License for the specific language governing permissions and
17					# limitations under the License.
18					# </@LICENSE>
19
20	21		21	149	use strict; # make Test::Perl::Critic happy
	21			47
	21			1359
21
22					# this package is a no-op; the real impl code is in another pkg.
23					package Mail::SpamAssassin::Bayes::CombineChi; 1;
24
25					# Force into another package, so our symbols will appear in that namespace with
26					# no indirection, for speed. Other combiners must do the same, since Bayes.pm
27					# uses this namespace directly. This means only one combiner can be loaded at
28					# any time.
29					package Mail::SpamAssassin::Bayes::Combine;
30
31	21		21	145	use strict;
	21			51
	21			490
32	21		21	108	use warnings;
	21			65
	21			721
33					# use bytes;
34	21		21	133	use re 'taint';
	21			52
	21			908
35
36	21		21	141	use POSIX qw(frexp);
	21			42
	21			462
37	21		21	2776	use constant LN2 => log(2);
	21			46
	21			9119
38
39					# Value for 'x' in Gary Robinson's f(w) equation.
40					# "Let x = the number used when n [hits] is 0."
41					our $FW_X_CONSTANT = 0.538;
42
43					# Value for 's' in the f(w) equation. "We can see s as the "strength" (hence
44					# the use of "s") of an original assumed expectation ... relative to how
45					# strongly we want to consider our actual collected data." Low 's' means
46					# trust collected data more strongly.
47					our $FW_S_CONSTANT = 0.030;
48
49					# (s . x) for the f(w) equation.
50					our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
51
52					# Should we ignore tokens with probs very close to the middle ground (.5)?
53					# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
54					our $MIN_PROB_STRENGTH = 0.346;
55
56					###########################################################################
57
58					# Chi-Squared method. Produces mostly boolean $result,
59					# but with a grey area.
60					sub combine {
61	4		4	15	my ($ns, $nn, $sortedref) = @_;
62
63					# @$sortedref contains an array of the probabilities
64	4			17	my $wc = scalar @$sortedref;
65	4	50		13	return unless $wc;
66
67	4			15	my ($H, $S);
68	4			0	my ($Hexp, $Sexp);
69	4			13	$Hexp = $Sexp = 0;
70
71					# see bug 3118
72	4			18	my $totmsgs = ($ns + $nn);
73	4	50		12	if ($totmsgs == 0) { return; }
	0			0
74	4			10	$S = ($ns / $totmsgs);
75	4			10	$H = ($nn / $totmsgs);
76
77	4			20	foreach my $prob (@$sortedref) {
78	282			352	$S *= 1.0 - $prob;
79	282			339	$H *= $prob;
80	282	50		491	if ($S < 1e-200) {
81	0			0	my $e;
82	0			0	($S, $e) = frexp($S);
83	0			0	$Sexp += $e;
84					}
85	282	100		584	if ($H < 1e-200) {
86	2			14	my $e;
87	2			17	($H, $e) = frexp($H);
88	2			7	$Hexp += $e;
89					}
90					}
91
92	4			24	$S = log($S) + $Sexp * LN2;
93	4			12	$H = log($H) + $Hexp * LN2;
94
95					# note: previous versions used (2 * $wc) as second arg ($v), but the chi2q()
96					# fn then just used ($v/2) internally! changed to simply supply $wc as
97					# ($halfv) directly instead to avoid redundant doubling and halving. The
98					# side-effect is that chi2q() uses a different API now, but it's only used
99					# here anyway.
100
101	4			22	$S = 1.0 - chi2q(-2.0 * $S, $wc);
102	4			30	$H = 1.0 - chi2q(-2.0 * $H, $wc);
103	4			20	return (($S - $H) + 1.0) / 2.0;
104					}
105
106					# Chi-squared function (API changed; see comment above)
107					sub chi2q {
108	8		8	21	my ($x2, $halfv) = @_;
109
110	8			19	my $m = $x2 / 2.0;
111	8			17	my ($sum, $term);
112	8			59	$sum = $term = exp(0 - $m);
113
114					# replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp
115					# array, with a plain C-style for loop
116	8			26	my $i;
117	8			27	for ($i = 1; $i < $halfv; $i++) {
118	556			729	$term *= $m / $i;
119	556			992	$sum += $term;
120					}
121	8	50		29	return $sum < 1.0 ? $sum : 1.0;
122					}
123
124					1;