File Coverage

blib/lib/Mail/SpamAssassin/Bayes/CombineChi.pm

Criterion	Covered	Total	%
statement	50	54	92.5
branch	7	10	70.0
condition			n/a
subroutine	8	8	100.0
pod			n/a
total	65	72	90.2

line	stmt	bran	sub	time	code
1					# Chi-square probability combining and related constants.
2					#
3					# <@LICENSE>
4					# Licensed to the Apache Software Foundation (ASF) under one or more
5					# contributor license agreements. See the NOTICE file distributed with
6					# this work for additional information regarding copyright ownership.
7					# The ASF licenses this file to you under the Apache License, Version 2.0
8					# (the "License"); you may not use this file except in compliance with
9					# the License. You may obtain a copy of the License at:
10					#
11					# http://www.apache.org/licenses/LICENSE-2.0
12					#
13					# Unless required by applicable law or agreed to in writing, software
14					# distributed under the License is distributed on an "AS IS" BASIS,
15					# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16					# See the License for the specific language governing permissions and
17					# limitations under the License.
18					# </@LICENSE>
19
20	22		22	140	use strict; # make Test::Perl::Critic happy
	22			50
	22			1158
21
22					# this package is a no-op; the real impl code is in another pkg.
23
24					# Force into another package, so our symbols will appear in that namespace with
25					# no indirection, for speed. Other combiners must do the same, since Bayes.pm
26					# uses this namespace directly. This means only one combiner can be loaded at
27					# any time.
28
29					use strict;
30					use warnings;
31	22		22	122	# use bytes;
	22			47
	22			476
32	22		22	128	use re 'taint';
	22			42
	22			721
33
34	22		22	117	use POSIX qw(frexp);
	22			44
	22			868
35					use constant LN2 => log(2);
36	22		22	155
	22			51
	22			439
37	22		22	2641	# Value for 'x' in Gary Robinson's f(w) equation.
	22			41
	22			8709
38					# "Let x = the number used when n [hits] is 0."
39					our $FW_X_CONSTANT = 0.538;
40
41					# Value for 's' in the f(w) equation. "We can see s as the 'strength' (hence
42					# the use of 's') of an original assumed expectation ... relative to how
43					# strongly we want to consider our actual collected data." A low 's' means
44					# the collected data is trusted more strongly.
45					our $FW_S_CONSTANT = 0.030;
46
47					# (s . x) for the f(w) equation.
48					our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);
49
50					# Should we ignore tokens with probs very close to the middle ground (.5)?
51					# Tokens must lie outside [ .5-MPS, .5+MPS ] (MPS = $MIN_PROB_STRENGTH) to be used.
52					our $MIN_PROB_STRENGTH = 0.346;
53
54					###########################################################################
55
56					# Chi-Squared method. Produces mostly boolean $result,
57					# but with a grey area.
58					my ($ns, $nn, $sortedref) = @_;
59
60					# @$sortedref contains an array of the probabilities
61	4		4	16	my $wc = scalar @$sortedref;
62					return unless $wc;
63
64	4			8	my ($H, $S);
65	4	50		10	my ($Hexp, $Sexp);
66					$Hexp = $Sexp = 0;
67	4			12
68	4			0	# see bug 3118
69	4			10	my $totmsgs = ($ns + $nn);
70					if ($totmsgs == 0) { return; }
71					$S = ($ns / $totmsgs);
72	4			10	$H = ($nn / $totmsgs);
73	4	50		12
	0			0
74	4			9	foreach my $prob (@$sortedref) {
75	4			8	$S *= 1.0 - $prob;
76					$H *= $prob;
77	4			19	if ($S < 1e-200) {
78	282			283	my $e;
79	282			265	($S, $e) = frexp($S);
80	282	50		362	$Sexp += $e;
81	0			0	}
82	0			0	if ($H < 1e-200) {
83	0			0	my $e;
84					($H, $e) = frexp($H);
85	282	100		632	$Hexp += $e;
86	2			7	}
87	2			23	}
88	2			7
89					$S = log($S) + $Sexp * LN2;
90					$H = log($H) + $Hexp * LN2;
91
92	4			33	# Note: previous versions passed (2 * $wc) as the second arg ($v), but the
93	4			10	# chi2q() function then only ever used ($v/2) internally. We now simply
94					# supply $wc as ($halfv) directly, avoiding the redundant doubling and
95					# halving. The side effect is that chi2q() has a different API now, but it
96					# is only used here anyway.
97
98					$S = 1.0 - chi2q(-2.0 * $S, $wc);
99					$H = 1.0 - chi2q(-2.0 * $H, $wc);
100					return (($S - $H) + 1.0) / 2.0;
101	4			24	}
102	4			13
103	4			19	# Chi-squared function (API changed; see comment above)
104					my ($x2, $halfv) = @_;
105
106					my $m = $x2 / 2.0;
107					my ($sum, $term);
108	8		8	14	$sum = $term = exp(0 - $m);
109
110	8			16	# replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp
111	8			10	# array, with a plain C-style for loop
112	8			48	my $i;
113					for ($i = 1; $i < $halfv; $i++) {
114					$term *= $m / $i;
115					$sum += $term;
116	8			19	}
117	8			282	return $sum < 1.0 ? $sum : 1.0;
118	556			536	}
119	556			754
120					1;