File Coverage

lib/Statistics/Descriptive/PDL/SampleWeighted.pm
Criterion Covered Total %
statement 75 77 97.4
branch 11 16 68.7
condition 4 9 44.4
subroutine 13 13 100.0
pod n/a
total 103 115 89.5


line stmt bran cond sub pod time code
1             package Statistics::Descriptive::PDL::SampleWeighted;
2            
3 3     3   1796 use 5.010;
  3         12  
4 3     3   42 use strict;
  3         7  
  3         75  
5 3     3   52 use warnings;
  3         5  
  3         158  
6            
7             # avoid loading too much, especially into our name space
8 3     3   14 use PDL::Lite '2.012';
  3         7  
  3         35  
9            
10             # this is otherwise not loaded due to oddities with multiple loading of PDL::Lite
11             #*pdl = \&PDL::Core::pdl;
12            
13             # We could inherit from PDL::Objects, but in this case we want
14             # to hide the piddle from the caller to avoid arbitrary changes
15             # being applied to it.
16            
17             ## no critic (ProhibitExplicitReturnUndef)
18            
19             our $VERSION = '0.17';
20            
21 3     3   1638 use parent 'Statistics::Descriptive::PDL::Weighted';
  3         702  
  3         17  
22            
23             my @cache_methods = qw /
24             median
25             standard_deviation skewness kurtosis
26             /;
27             __PACKAGE__->_make_caching_accessors( \@cache_methods );
28            
29            
30            
31 15     15   57 sub _wt_type{PDL::long}
32            
33            
34             sub _standard_deviation {
35 4     4   10 my $self = shift;
36            
37 4         13 my $data = $self->_get_piddle;
38            
39 4         16 my $wts = $self->_get_weights_piddle;
40 4         18 my $n = $wts->sum;
41            
42 4 50       82 return 0 if $n == 1;
43            
44 4         493 my $var = ((($data ** 2) * $wts)->sum - $n * $self->mean ** 2);
45            
46 4 50       198 return $var > 0 ? sqrt ($var / ($n - 1)) : 0;
47             }
48            
49            
50             sub _median {
51 9     9   21 my $self = shift;
52            
53 9         38 my $piddle = $self->_sort_piddle;
54 9         42 my $cumsum = $self->_get_cumsum_weight_vector;
55            
56 9         106 my $target_wt = $self->sum_weights * 0.5;
57             # vsearch should be faster since it uses a binary search
58 9         54 my $idx = PDL->pdl($target_wt)->vsearch_insert_leftmost($cumsum);
59            
60             # if the target weight is "on a boundary" between
61             # two values then we need to interpolate
62 9 100       724 my $median
63             = $target_wt == $cumsum->at($idx)
64             ? ($piddle->at($idx) + $piddle->at($idx+1)) / 2
65             : $piddle->at($idx);
66            
67 9         1503 return $median;
68             }
69            
70            
71             sub _skewness {
72 4     4   9 my $self = shift;
73            
74 4         22 my $n = $self->sum_weights;
75 4 100       16 return undef if $n < 3;
76            
77 3         12 my $data = $self->_get_piddle;
78            
79             # long winded approach
80 3         20 my $mean = $self->mean;
81 3         13 my $sd = $self->standard_deviation;
82 3         14 my $wts = $self->_get_weights_piddle;
83 3         16 my $sumpow3 = ($wts * ((($data - $mean) / $sd) ** 3))->sum;
84             # inplace seems not to be faster here.
85             # Possibly PDL is smart enough to do it by default
86             # in such cases
87             #my $sumpow3 = ($data - $mean)->inplace->divide($sd, 0)->pow(3)->mult($wts, 0)->sum;
88 3         406 my $correction = $n / ( ($n-1) * ($n-2) );
89 3         42 my $skew = $correction * $sumpow3;
90 3         83 return $skew;
91             }
92            
93             sub _kurtosis {
94 4     4   12 my $self = shift;
95            
96 4         16 my $n = $self->sum_weights;
97 4 100       20 return undef if $n <= 3;
98            
99 3         12 my $data = $self->_get_piddle;
100 3         70 my $mean = $self->mean;
101 3         48 my $sd = $self->standard_deviation;
102 3         42 my $wts = $self->_get_weights_piddle;
103            
104 3         12 my $sumpow4 = ($wts * ((($data - $mean) / $sd) ** 4))->sum;
105            
106 3         317 my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
107 3         15 my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
108            
109 3         9 return ( $correction1 * $sumpow4 ) - $correction2;
110             }
111            
112             # crude memoisation - would be nice to use
113             # state but it has issues with lists on older perls
114             my %k_piddle_cache;
115            
116             # Uses same basic algorithm as PDL::pctl.
117             sub _percentile {
118 25     25   70 my ($self, $p) = @_;
119            
120 25         73 my $piddle = $self->_get_piddle;
121            
122             return undef
123 25 50 33     110 if !defined $piddle or $piddle->isempty;
124            
125 25         263 $self->_sort_piddle;
126            
127             # possible slowdown here - users need to dedup before calling to avoid
128             # my $deduped = $self->_deduplicate;
129             # there is actually no need to dedup
130 25         43 my $deduped = $self;
131 25         74 $piddle = $deduped->_get_piddle;
132            
133             # my $wt_piddle = $self->_get_weights_piddle;
134            
135 25         115 my $cumsum = $deduped->_get_cumsum_weight_vector;
136 25         337 my $wt_sum = $deduped->sum_weights;
137            
138 3     3   2014 use POSIX qw /floor/;
  3         5  
  3         28  
139            
140 25         142 my $target_wt = ($p / 100) * ($wt_sum - 1) + 1;
141 25         99 my $k = floor $target_wt;
142 25         55 my $d = $target_wt - $k;
143            
144 25   66     556 my $idx = ($k_piddle_cache{$k} //= PDL->pdl(PDL::indx(), [$k]))->vsearch_insert_leftmost($cumsum)->at(0);
145            
146 25 50       2136 if (scalar keys %k_piddle_cache > 10000) {
147             # Reset if we get too many
148             # - could be more nuanced based on frequency
149             # but then we would have to track it
150 0         0 %k_piddle_cache = ();
151             }
152            
153             # we need to interpolate if our target weight falls between two sets of weights
154             # e.g. target is 1.3, but the cumulative weights are [1,2] or [1,5]
155 25         77 my $fraction = $target_wt - ($cumsum->at($idx));
156 25 50 33     321 if ($fraction > 0 && $fraction < 1) {
157 25         67 my $lower_val = $piddle->at($idx);
158 25         243 my $upper_val = $piddle->at($idx+1);
159 25         205 my $val = $lower_val + $d * ($upper_val - $lower_val);
160 25         95 return $val;
161             }
162            
163 0         0 return $piddle->at($idx);
164             }
165            
166             # weight for each sample is 1
167             sub _sum_sqr_sample_weights {
168 2     2   7 my $self = shift;
169 2         12 return $self->sum_weights;
170             }
171            
172            
173            
174             1;
175            
176             __END__