line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Chi-square probability combining and related constants. |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# <@LICENSE> |
4
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
5
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
6
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
7
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
8
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
9
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
12
|
|
|
|
|
|
|
# |
13
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
14
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
15
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
17
|
|
|
|
|
|
|
# limitations under the License. |
18
|
|
|
|
|
|
|
# </@LICENSE> |
19
|
|
|
|
|
|
|
|
20
|
22
|
|
|
22
|
|
161
|
use strict; # make Test::Perl::Critic happy |
|
22
|
|
|
|
|
41
|
|
|
22
|
|
|
|
|
1237
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# this package is a no-op; the real impl code is in another pkg. |
23
|
|
|
|
|
|
|
package Mail::SpamAssassin::Bayes::CombineChi; 1; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Force into another package, so our symbols will appear in that namespace with |
26
|
|
|
|
|
|
|
# no indirection, for speed. Other combiners must do the same, since Bayes.pm |
27
|
|
|
|
|
|
|
# uses this namespace directly. This means only one combiner can be loaded at |
28
|
|
|
|
|
|
|
# any time. |
29
|
|
|
|
|
|
|
package Mail::SpamAssassin::Bayes::Combine; |
30
|
|
|
|
|
|
|
|
31
|
22
|
|
|
22
|
|
131
|
use strict; |
|
22
|
|
|
|
|
51
|
|
|
22
|
|
|
|
|
482
|
|
32
|
22
|
|
|
22
|
|
174
|
use warnings; |
|
22
|
|
|
|
|
39
|
|
|
22
|
|
|
|
|
725
|
|
33
|
|
|
|
|
|
|
# use bytes; |
34
|
22
|
|
|
22
|
|
137
|
use re 'taint'; |
|
22
|
|
|
|
|
44
|
|
|
22
|
|
|
|
|
815
|
|
35
|
|
|
|
|
|
|
|
36
|
22
|
|
|
22
|
|
129
|
use POSIX qw(frexp); |
|
22
|
|
|
|
|
50
|
|
|
22
|
|
|
|
|
449
|
|
37
|
22
|
|
|
22
|
|
2679
|
use constant LN2 => log(2); |
|
22
|
|
|
|
|
41
|
|
|
22
|
|
|
|
|
9441
|
|
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Value for 'x' in Gary Robinson's f(w) equation. |
40
|
|
|
|
|
|
|
# "Let x = the number used when n [hits] is 0." |
41
|
|
|
|
|
|
|
our $FW_X_CONSTANT = 0.538; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# Value for 's' in the f(w) equation. "We can see s as the "strength" (hence |
44
|
|
|
|
|
|
|
# the use of "s") of an original assumed expectation ... relative to how |
45
|
|
|
|
|
|
|
# strongly we want to consider our actual collected data." Low 's' means |
46
|
|
|
|
|
|
|
# trust collected data more strongly. |
47
|
|
|
|
|
|
|
our $FW_S_CONSTANT = 0.030; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# (s . x) for the f(w) equation. |
50
|
|
|
|
|
|
|
our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Should we ignore tokens with probs very close to the middle ground (.5)? |
53
|
|
|
|
|
|
|
# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used. |
54
|
|
|
|
|
|
|
our $MIN_PROB_STRENGTH = 0.346; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
########################################################################### |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# Chi-Squared method. Produces mostly boolean $result, |
59
|
|
|
|
|
|
|
# but with a grey area. |
60
|
|
|
|
|
|
|
sub combine { |
61
|
4
|
|
|
4
|
|
15
|
my ($ns, $nn, $sortedref) = @_; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# @$sortedref contains an array of the probabilities |
64
|
4
|
|
|
|
|
9
|
my $wc = scalar @$sortedref; |
65
|
4
|
50
|
|
|
|
75
|
return unless $wc; |
66
|
|
|
|
|
|
|
|
67
|
4
|
|
|
|
|
20
|
my ($H, $S); |
68
|
4
|
|
|
|
|
0
|
my ($Hexp, $Sexp); |
69
|
4
|
|
|
|
|
11
|
$Hexp = $Sexp = 0; |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# see bug 3118 |
72
|
4
|
|
|
|
|
16
|
my $totmsgs = ($ns + $nn); |
73
|
4
|
50
|
|
|
|
24
|
if ($totmsgs == 0) { return; } |
|
0
|
|
|
|
|
0
|
|
74
|
4
|
|
|
|
|
11
|
$S = ($ns / $totmsgs); |
75
|
4
|
|
|
|
|
13
|
$H = ($nn / $totmsgs); |
76
|
|
|
|
|
|
|
|
77
|
4
|
|
|
|
|
29
|
foreach my $prob (@$sortedref) { |
78
|
272
|
|
|
|
|
302
|
$S *= 1.0 - $prob; |
79
|
272
|
|
|
|
|
282
|
$H *= $prob; |
80
|
272
|
50
|
|
|
|
422
|
if ($S < 1e-200) { |
81
|
0
|
|
|
|
|
0
|
my $e; |
82
|
0
|
|
|
|
|
0
|
($S, $e) = frexp($S); |
83
|
0
|
|
|
|
|
0
|
$Sexp += $e; |
84
|
|
|
|
|
|
|
} |
85
|
272
|
100
|
|
|
|
410
|
if ($H < 1e-200) { |
86
|
2
|
|
|
|
|
5
|
my $e; |
87
|
2
|
|
|
|
|
22
|
($H, $e) = frexp($H); |
88
|
2
|
|
|
|
|
6
|
$Hexp += $e; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
|
92
|
4
|
|
|
|
|
35
|
$S = log($S) + $Sexp * LN2; |
93
|
4
|
|
|
|
|
14
|
$H = log($H) + $Hexp * LN2; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# note: previous versions used (2 * $wc) as second arg ($v), but the chi2q() |
96
|
|
|
|
|
|
|
# fn then just used ($v/2) internally! changed to simply supply $wc as |
97
|
|
|
|
|
|
|
# ($halfv) directly instead to avoid redundant doubling and halving. The |
98
|
|
|
|
|
|
|
# side-effect is that chi2q() uses a different API now, but it's only used |
99
|
|
|
|
|
|
|
# here anyway. |
100
|
|
|
|
|
|
|
|
101
|
4
|
|
|
|
|
26
|
$S = 1.0 - chi2q(-2.0 * $S, $wc); |
102
|
4
|
|
|
|
|
20
|
$H = 1.0 - chi2q(-2.0 * $H, $wc); |
103
|
4
|
|
|
|
|
22
|
return (($S - $H) + 1.0) / 2.0; |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Chi-squared function (API changed; see comment above) |
107
|
|
|
|
|
|
|
sub chi2q { |
108
|
8
|
|
|
8
|
|
20
|
my ($x2, $halfv) = @_; |
109
|
|
|
|
|
|
|
|
110
|
8
|
|
|
|
|
30
|
my $m = $x2 / 2.0; |
111
|
8
|
|
|
|
|
17
|
my ($sum, $term); |
112
|
8
|
|
|
|
|
51
|
$sum = $term = exp(0 - $m); |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp |
115
|
|
|
|
|
|
|
# array, with a plain C-style for loop |
116
|
8
|
|
|
|
|
21
|
my $i; |
117
|
8
|
|
|
|
|
30
|
for ($i = 1; $i < $halfv; $i++) { |
118
|
536
|
|
|
|
|
612
|
$term *= $m / $i; |
119
|
536
|
|
|
|
|
859
|
$sum += $term; |
120
|
|
|
|
|
|
|
} |
121
|
8
|
50
|
|
|
|
27
|
return $sum < 1.0 ? $sum : 1.0; |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
1; |