line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Chi-square probability combining and related constants. |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# <@LICENSE> |
4
|
|
|
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more |
5
|
|
|
|
|
|
|
# contributor license agreements. See the NOTICE file distributed with |
6
|
|
|
|
|
|
|
# this work for additional information regarding copyright ownership. |
7
|
|
|
|
|
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0 |
8
|
|
|
|
|
|
|
# (the "License"); you may not use this file except in compliance with |
9
|
|
|
|
|
|
|
# the License. You may obtain a copy of the License at: |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0 |
12
|
|
|
|
|
|
|
# |
13
|
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software |
14
|
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS, |
15
|
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16
|
|
|
|
|
|
|
# See the License for the specific language governing permissions and |
17
|
|
|
|
|
|
|
# limitations under the License. |
18
|
|
|
|
|
|
|
# </@LICENSE> |
19
|
|
|
|
|
|
|
|
20
|
22
|
|
|
22
|
|
140
|
use strict; # make Test::Perl::Critic happy |
|
22
|
|
|
|
|
50
|
|
|
22
|
|
|
|
|
1158
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# this package is a no-op; the real impl code is in another pkg. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Force into another package, so our symbols will appear in that namespace with |
25
|
|
|
|
|
|
|
# no indirection, for speed. Other combiners must do the same, since Bayes.pm |
26
|
|
|
|
|
|
|
# uses this namespace directly. This means only one combiner can be loaded at |
27
|
|
|
|
|
|
|
# any time. |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
use strict; |
30
|
|
|
|
|
|
|
use warnings; |
31
|
22
|
|
|
22
|
|
122
|
# use bytes; |
|
22
|
|
|
|
|
47
|
|
|
22
|
|
|
|
|
476
|
|
32
|
22
|
|
|
22
|
|
128
|
use re 'taint'; |
|
22
|
|
|
|
|
42
|
|
|
22
|
|
|
|
|
721
|
|
33
|
|
|
|
|
|
|
|
34
|
22
|
|
|
22
|
|
117
|
use POSIX qw(frexp); |
|
22
|
|
|
|
|
44
|
|
|
22
|
|
|
|
|
868
|
|
35
|
|
|
|
|
|
|
use constant LN2 => log(2); |
36
|
22
|
|
|
22
|
|
155
|
|
|
22
|
|
|
|
|
51
|
|
|
22
|
|
|
|
|
439
|
|
37
|
22
|
|
|
22
|
|
2641
|
# Tuning constants for Gary Robinson's f(w) token-probability equation.
# They are package variables (`our`) so that code outside this file can
# read them through the package namespace.

# Value for 'x' in Gary Robinson's f(w) equation.
# "Let x = the number used when n [hits] is 0."
our $FW_X_CONSTANT = 0.538;

# Value for 's' in the f(w) equation.  "We can see s as the "strength" (hence
# the use of "s") of an original assumed expectation ... relative to how
# strongly we want to consider our actual collected data."  Low 's' means
# trust collected data more strongly.
our $FW_S_CONSTANT = 0.030;

# (s . x) for the f(w) equation, precomputed once at load time.
our $FW_S_DOT_X = ($FW_X_CONSTANT * $FW_S_CONSTANT);

# Should we ignore tokens with probs very close to the middle ground (.5)?
# tokens need to be outside the [ .5-MPS, .5+MPS ] range to be used.
our $MIN_PROB_STRENGTH = 0.346;
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
########################################################################### |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Chi-Squared method. Produces mostly boolean $result,
# but with a grey area.
#
# NOTE(review): the `sub` line for this routine is absent from the text this
# was recovered from; the name `combine` is assumed to be the entry point the
# caller (Bayes.pm) invokes in this namespace -- confirm before merging.
#
# Arguments:
#   $ns        - number of spam messages learned
#   $nn        - number of ham (non-spam) messages learned
#   $sortedref - array ref of per-token probabilities
#
# Returns a combined score computed as (($S - $H) + 1) / 2, or nothing
# (bare return) when there are no token probabilities or no learned messages.
sub combine {
  my ($ns, $nn, $sortedref) = @_;

  # @$sortedref contains an array of the probabilities
  my $wc = scalar @$sortedref;
  return unless $wc;

  # see bug 3118
  my $totmsgs = ($ns + $nn);
  if ($totmsgs == 0) { return; }

  # Running products, seeded with the learned spam/ham proportions.
  my $S = ($ns / $totmsgs);
  my $H = ($nn / $totmsgs);

  # Binary exponents split off from $S and $H to keep them from underflowing.
  my ($Sexp, $Hexp) = (0, 0);

  foreach my $prob (@$sortedref) {
    $S *= 1.0 - $prob;
    $H *= $prob;

    # When a product gets very small, renormalise it with frexp() and
    # remember the exponent that was removed.
    if ($S < 1e-200) {
      my $e;
      ($S, $e) = frexp($S);
      $Sexp += $e;
    }
    if ($H < 1e-200) {
      my $e;
      ($H, $e) = frexp($H);
      $Hexp += $e;
    }
  }

  # note: previous versions used (2 * $wc) as second arg ($v), but the chi2q()
  # fn then just used ($v/2) internally!  changed to simply supply $wc as
  # ($halfv) directly instead to avoid redundant doubling and halving.  The
  # side-effect is that chi2q() uses a different API now, but it's only used
  # here anyway.
  #
  # A product of exactly 0 (e.g. $ns == 0, or a probability of exactly 0 or 1)
  # would make log() die with "Can't take log of 0".  Its log is -infinity,
  # for which chi2q() tends to 0, so use the limiting value 1.0 directly.
  $S = ($S == 0)
     ? 1.0
     : 1.0 - chi2q(-2.0 * (log($S) + $Sexp * LN2), $wc);
  $H = ($H == 0)
     ? 1.0
     : 1.0 - chi2q(-2.0 * (log($H) + $Hexp * LN2), $wc);

  return (($S - $H) + 1.0) / 2.0;
}
102
|
4
|
|
|
|
|
13
|
|
103
|
4
|
|
|
|
|
19
|
# Chi-squared function (API changed: the second argument is HALF the degrees
# of freedom, not the degrees of freedom themselves).
#
# Arguments:
#   $x2    - the chi-square statistic
#   $halfv - half the number of degrees of freedom
#
# Computes exp(-m) * sum_{i=0}^{halfv-1} m**i / i!  with m = $x2 / 2, and
# returns that sum capped at 1.0.
sub chi2q {
  my ($x2, $halfv) = @_;

  my $m = $x2 / 2.0;

  # The i = 0 term of the series; it seeds both the running term and the sum.
  my $term = exp(0 - $m);
  my $sum  = $term;

  # replace 'for my $i (1 .. (($v/2)-1))' idiom, which creates a temp
  # array, with a plain C-style for loop
  for (my $i = 1; $i < $halfv; $i++) {
    $term *= $m / $i;    # term_i = term_{i-1} * m / i
    $sum  += $term;
  }

  # Rounding can push the sum marginally above 1; clamp it.
  return $sum < 1.0 ? $sum : 1.0;
}
119
|
556
|
|
|
|
|
754
|
|
120
|
|
|
|
|
|
|
1; |