| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Statistics::Descriptive::PDL::SampleWeighted;
|
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
1796
|
use 5.010;
|
|
|
3
|
|
|
|
|
12
|
|
|
4
|
3
|
|
|
3
|
|
42
|
use strict;
|
|
|
3
|
|
|
|
|
7
|
|
|
|
3
|
|
|
|
|
75
|
|
|
5
|
3
|
|
|
3
|
|
52
|
use warnings;
|
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
158
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
# avoid loading too much, especially into our name space
|
|
8
|
3
|
|
|
3
|
|
14
|
use PDL::Lite '2.012';
|
|
|
3
|
|
|
|
|
7
|
|
|
|
3
|
|
|
|
|
35
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# this is otherwise not loaded due to oddities with multiple loading of PDL::Lite
|
|
11
|
|
|
|
|
|
|
#*pdl = \&PDL::Core::pdl;
|
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# We could inherit from PDL::Objects, but in this case we want
|
|
14
|
|
|
|
|
|
|
# to hide the piddle from the caller to avoid arbitrary changes
|
|
15
|
|
|
|
|
|
|
# being applied to it.
|
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
## no critic (ProhibitExplicitReturnUndef)
|
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
our $VERSION = '0.17';
|
|
20
|
|
|
|
|
|
|
|
|
21
|
3
|
|
|
3
|
|
1638
|
use parent 'Statistics::Descriptive::PDL::Weighted';
|
|
|
3
|
|
|
|
|
702
|
|
|
|
3
|
|
|
|
|
17
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
my @cache_methods = qw /
|
|
24
|
|
|
|
|
|
|
median
|
|
25
|
|
|
|
|
|
|
standard_deviation skewness kurtosis
|
|
26
|
|
|
|
|
|
|
/;
|
|
27
|
|
|
|
|
|
|
__PACKAGE__->_make_caching_accessors( \@cache_methods );
|
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
|
31
|
15
|
|
|
15
|
|
57
|
sub _wt_type{PDL::long}
|
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub _standard_deviation {
|
|
35
|
4
|
|
|
4
|
|
10
|
my $self = shift;
|
|
36
|
|
|
|
|
|
|
|
|
37
|
4
|
|
|
|
|
13
|
my $data = $self->_get_piddle;
|
|
38
|
|
|
|
|
|
|
|
|
39
|
4
|
|
|
|
|
16
|
my $wts = $self->_get_weights_piddle;
|
|
40
|
4
|
|
|
|
|
18
|
my $n = $wts->sum;
|
|
41
|
|
|
|
|
|
|
|
|
42
|
4
|
50
|
|
|
|
82
|
return 0 if $n == 1;
|
|
43
|
|
|
|
|
|
|
|
|
44
|
4
|
|
|
|
|
493
|
my $var = ((($data ** 2) * $wts)->sum - $n * $self->mean ** 2);
|
|
45
|
|
|
|
|
|
|
|
|
46
|
4
|
50
|
|
|
|
198
|
return $var > 0 ? sqrt ($var / ($n - 1)) : 0;
|
|
47
|
|
|
|
|
|
|
}
|
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub _median {
|
|
51
|
9
|
|
|
9
|
|
21
|
my $self = shift;
|
|
52
|
|
|
|
|
|
|
|
|
53
|
9
|
|
|
|
|
38
|
my $piddle = $self->_sort_piddle;
|
|
54
|
9
|
|
|
|
|
42
|
my $cumsum = $self->_get_cumsum_weight_vector;
|
|
55
|
|
|
|
|
|
|
|
|
56
|
9
|
|
|
|
|
106
|
my $target_wt = $self->sum_weights * 0.5;
|
|
57
|
|
|
|
|
|
|
# vsearch should be faster since it uses a binary search
|
|
58
|
9
|
|
|
|
|
54
|
my $idx = PDL->pdl($target_wt)->vsearch_insert_leftmost($cumsum);
|
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# if the target weight is "on a boundary" between
|
|
61
|
|
|
|
|
|
|
# two values then we need to interpolate
|
|
62
|
9
|
100
|
|
|
|
724
|
my $median
|
|
63
|
|
|
|
|
|
|
= $target_wt == $cumsum->at($idx)
|
|
64
|
|
|
|
|
|
|
? ($piddle->at($idx) + $piddle->at($idx+1)) / 2
|
|
65
|
|
|
|
|
|
|
: $piddle->at($idx);
|
|
66
|
|
|
|
|
|
|
|
|
67
|
9
|
|
|
|
|
1503
|
return $median;
|
|
68
|
|
|
|
|
|
|
}
|
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub _skewness {
|
|
72
|
4
|
|
|
4
|
|
9
|
my $self = shift;
|
|
73
|
|
|
|
|
|
|
|
|
74
|
4
|
|
|
|
|
22
|
my $n = $self->sum_weights;
|
|
75
|
4
|
100
|
|
|
|
16
|
return undef if $n < 3;
|
|
76
|
|
|
|
|
|
|
|
|
77
|
3
|
|
|
|
|
12
|
my $data = $self->_get_piddle;
|
|
78
|
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
# long winded approach
|
|
80
|
3
|
|
|
|
|
20
|
my $mean = $self->mean;
|
|
81
|
3
|
|
|
|
|
13
|
my $sd = $self->standard_deviation;
|
|
82
|
3
|
|
|
|
|
14
|
my $wts = $self->_get_weights_piddle;
|
|
83
|
3
|
|
|
|
|
16
|
my $sumpow3 = ($wts * ((($data - $mean) / $sd) ** 3))->sum;
|
|
84
|
|
|
|
|
|
|
# inplace seems not to be faster here.
|
|
85
|
|
|
|
|
|
|
# Possibly PDL is smart enough to do it by default
|
|
86
|
|
|
|
|
|
|
# in such cases
|
|
87
|
|
|
|
|
|
|
#my $sumpow3 = ($data - $mean)->inplace->divide($sd, 0)->pow(3)->mult($wts, 0)->sum;
|
|
88
|
3
|
|
|
|
|
406
|
my $correction = $n / ( ($n-1) * ($n-2) );
|
|
89
|
3
|
|
|
|
|
42
|
my $skew = $correction * $sumpow3;
|
|
90
|
3
|
|
|
|
|
83
|
return $skew;
|
|
91
|
|
|
|
|
|
|
}
|
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub _kurtosis {
|
|
94
|
4
|
|
|
4
|
|
12
|
my $self = shift;
|
|
95
|
|
|
|
|
|
|
|
|
96
|
4
|
|
|
|
|
16
|
my $n = $self->sum_weights;
|
|
97
|
4
|
100
|
|
|
|
20
|
return undef if $n <= 3;
|
|
98
|
|
|
|
|
|
|
|
|
99
|
3
|
|
|
|
|
12
|
my $data = $self->_get_piddle;
|
|
100
|
3
|
|
|
|
|
70
|
my $mean = $self->mean;
|
|
101
|
3
|
|
|
|
|
48
|
my $sd = $self->standard_deviation;
|
|
102
|
3
|
|
|
|
|
42
|
my $wts = $self->_get_weights_piddle;
|
|
103
|
|
|
|
|
|
|
|
|
104
|
3
|
|
|
|
|
12
|
my $sumpow4 = ($wts * ((($data - $mean) / $sd) ** 4))->sum;
|
|
105
|
|
|
|
|
|
|
|
|
106
|
3
|
|
|
|
|
317
|
my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
|
|
107
|
3
|
|
|
|
|
15
|
my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
|
|
108
|
|
|
|
|
|
|
|
|
109
|
3
|
|
|
|
|
9
|
return ( $correction1 * $sumpow4 ) - $correction2;
|
|
110
|
|
|
|
|
|
|
}
|
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
# crude memoisation - would be nice to use
|
|
113
|
|
|
|
|
|
|
# state but it has issues with lists on older perls
|
|
114
|
|
|
|
|
|
|
my %k_piddle_cache;
|
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# Uses same basic algorithm as PDL::pctl.
|
|
117
|
|
|
|
|
|
|
sub _percentile {
|
|
118
|
25
|
|
|
25
|
|
70
|
my ($self, $p) = @_;
|
|
119
|
|
|
|
|
|
|
|
|
120
|
25
|
|
|
|
|
73
|
my $piddle = $self->_get_piddle;
|
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
return undef
|
|
123
|
25
|
50
|
33
|
|
|
110
|
if !defined $piddle or $piddle->isempty;
|
|
124
|
|
|
|
|
|
|
|
|
125
|
25
|
|
|
|
|
263
|
$self->_sort_piddle;
|
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# possible slowdown here - users need to dedup before calling to avoid
|
|
128
|
|
|
|
|
|
|
# my $deduped = $self->_deduplicate;
|
|
129
|
|
|
|
|
|
|
# there is actually no need to dedup
|
|
130
|
25
|
|
|
|
|
43
|
my $deduped = $self;
|
|
131
|
25
|
|
|
|
|
74
|
$piddle = $deduped->_get_piddle;
|
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# my $wt_piddle = $self->_get_weights_piddle;
|
|
134
|
|
|
|
|
|
|
|
|
135
|
25
|
|
|
|
|
115
|
my $cumsum = $deduped->_get_cumsum_weight_vector;
|
|
136
|
25
|
|
|
|
|
337
|
my $wt_sum = $deduped->sum_weights;
|
|
137
|
|
|
|
|
|
|
|
|
138
|
3
|
|
|
3
|
|
2014
|
use POSIX qw /floor/;
|
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
28
|
|
|
139
|
|
|
|
|
|
|
|
|
140
|
25
|
|
|
|
|
142
|
my $target_wt = ($p / 100) * ($wt_sum - 1) + 1;
|
|
141
|
25
|
|
|
|
|
99
|
my $k = floor $target_wt;
|
|
142
|
25
|
|
|
|
|
55
|
my $d = $target_wt - $k;
|
|
143
|
|
|
|
|
|
|
|
|
144
|
25
|
|
66
|
|
|
556
|
my $idx = ($k_piddle_cache{$k} //= PDL->pdl(PDL::indx(), [$k]))->vsearch_insert_leftmost($cumsum)->at(0);
|
|
145
|
|
|
|
|
|
|
|
|
146
|
25
|
50
|
|
|
|
2136
|
if (scalar keys %k_piddle_cache > 10000) {
|
|
147
|
|
|
|
|
|
|
# Reset if we get too many
|
|
148
|
|
|
|
|
|
|
# - could be more nuanced based on frequency
|
|
149
|
|
|
|
|
|
|
# but then we would have to track it
|
|
150
|
0
|
|
|
|
|
0
|
%k_piddle_cache = ();
|
|
151
|
|
|
|
|
|
|
}
|
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# we need to interpolate if our target weight falls between two sets of weights
|
|
154
|
|
|
|
|
|
|
# e.g. target is 1.3, but the cumulative weights are [1,2] or [1,5]
|
|
155
|
25
|
|
|
|
|
77
|
my $fraction = $target_wt - ($cumsum->at($idx));
|
|
156
|
25
|
50
|
33
|
|
|
321
|
if ($fraction > 0 && $fraction < 1) {
|
|
157
|
25
|
|
|
|
|
67
|
my $lower_val = $piddle->at($idx);
|
|
158
|
25
|
|
|
|
|
243
|
my $upper_val = $piddle->at($idx+1);
|
|
159
|
25
|
|
|
|
|
205
|
my $val = $lower_val + $d * ($upper_val - $lower_val);
|
|
160
|
25
|
|
|
|
|
95
|
return $val;
|
|
161
|
|
|
|
|
|
|
}
|
|
162
|
|
|
|
|
|
|
|
|
163
|
0
|
|
|
|
|
0
|
return $piddle->at($idx);
|
|
164
|
|
|
|
|
|
|
}
|
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# weight for each sample is 1
|
|
167
|
|
|
|
|
|
|
sub _sum_sqr_sample_weights {
|
|
168
|
2
|
|
|
2
|
|
7
|
my $self = shift;
|
|
169
|
2
|
|
|
|
|
12
|
return $self->sum_weights;
|
|
170
|
|
|
|
|
|
|
}
|
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
1;
|
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
__END__
|