File Coverage

lib/Statistics/Descriptive/PDL/Weighted.pm
Criterion Covered Total %
statement 197 198 99.4
branch 47 62 75.8
condition 8 12 66.6
subroutine 30 30 100.0
pod 6 6 100.0
total 288 308 93.5


line stmt bran cond sub pod time code
1             package Statistics::Descriptive::PDL::Weighted;
2            
3 5     5   209392 use 5.010;
  5         19  
4 5     5   36 use strict;
  5         9  
  5         140  
5 5     5   48 use warnings;
  5         9  
  5         346  
6            
7             # avoid loading too much, especially into our name space
8 5     5   883 use PDL::Lite '2.012';
  5         214667  
  5         45  
9            
10             # We could inherit from PDL::Objects, but in this case we want
11             # to hide the piddle from the caller to avoid arbitrary changes
12             # being applied to it.
13            
14             ## no critic (ProhibitExplicitReturnUndef)
15            
16             our $VERSION = '0.17';
17            
18 5     5   2424 use parent 'Statistics::Descriptive::PDL';
  5         968  
  5         32  
19            
20             my @cache_methods = qw /
21             count sum mode median
22             mean standard_deviation skewness kurtosis
23             geometric_mean harmonic_mean
24             sum_weights sum_sqr_weights
25             /;
26             __PACKAGE__->_make_caching_accessors( \@cache_methods );
27            
28            
29             sub new {
30 41     41 1 1452707 my $proto = shift;
31 41   66     237 my $data_type = shift // PDL::double();
32 41   66     894 my $class = ref($proto) || $proto;
33            
34 41         218 my $self = {
35             piddle => undef,
36             weights_piddle => undef,
37             data_type => $data_type,
38             };
39 41         103 bless $self, $class;
40            
41 41         144 return $self;
42             }
43            
44            
45 32     32   68 sub _wt_type{PDL::double}
46            
47             sub add_data {
48 47     47 1 4378 my ($self, $data, $weights) = @_;
49            
50 47         363 my ($data_piddle, $weights_piddle);
51            
52 47         0 my $data_from_hash;
53            
54 47 100       154 if (ref $data eq 'HASH') {
55 9         103 $data_piddle = PDL->pdl ($self->{data_type}, [keys %$data])->flat;
56 9         431 $weights_piddle = PDL->pdl ($self->_wt_type, [values %$data])->flat;
57 9         289 $data_from_hash = 1;
58             }
59             else {
60 38         182 $data_piddle = PDL->pdl ($self->{data_type}, $data)->flat;
61 38         1434 $weights_piddle = PDL->pdl ($self->_wt_type, $weights)->flat;
62 38 100       1161 die "data and weight vectors not of same length"
63             if scalar $data_piddle->nelem != $weights_piddle->nelem;
64 37 100       294 die "Cannot pass zero or negative weights"
65             if PDL::any($weights_piddle <= 0);
66             }
67            
68 44 50       5451 return if !$data_piddle->nelem;
69            
70 44         190 my $has_existing_data = $self->count;
71            
72             # Take care of appending to an existing data set
73 44 100       143 if ($has_existing_data) {
74 12         57 my $d_piddle = $self->_get_piddle;
75 12         86 $d_piddle = $d_piddle->append ($data_piddle);
76 12         1068 $self->_set_piddle ($d_piddle);
77 12         796 my $w_piddle = $self->_get_weights_piddle;
78 12         32 $w_piddle = $w_piddle->append ($weights_piddle);
79 12         740 $self->_set_weights_piddle ($w_piddle);
80            
81 12         716 delete $self->{sorted};
82             }
83             else {
84 32         139 $self->_set_piddle ($data_piddle);
85 32         2851 $self->_set_weights_piddle ($weights_piddle);
86             }
87            
88             # need to clear late because count is cached
89 44         2582 $self->clear_cache;
90            
91             # somewhat awkward but needs to be set after clearing the cache
92 44 100 100     191 if ($data_from_hash && !$has_existing_data) {
93 5         28 $self->values_are_unique(1);
94             }
95            
96 44         131 return $self->count;
97             }
98            
99             sub get_data {
100 1     1 1 12 my $self = shift;
101 1         5 my $piddle = $self->_get_piddle;
102 1         4 my $wts_piddle = $self->_get_weights_piddle;
103            
104 1 50       11 my $data = defined $piddle ? $piddle->unpdl : [];
105 1 50       7 my $wts = defined $wts_piddle ? $wts_piddle->unpdl : [];
106            
107 1 50       5 return wantarray ? ($data, $wts) : [$data, $wts];
108             }
109            
110             sub get_data_as_hash {
111 3     3 1 2192 my $self = shift;
112            
113 3         11 my $piddle = $self->_get_piddle;
114            
115 3 0       10 return wantarray ? () : {}
    50          
116             if !defined $piddle;
117            
118 3         12 my $deduped = $self->_deduplicate;
119            
120 3         12 my $data = $deduped->_get_piddle->unpdl;
121 3         26 my $wts = $deduped->_get_weights_piddle->unpdl;
122 3         7 my %h;
123 3         100 @h{@$data} = @$wts;
124            
125 3 50       41 return wantarray ? (%h) : \%h;
126             }
127            
128            
129             sub values_are_unique {
130 26     26 1 83 my $self = shift;
131 26 100       85 if (@_) {
132 14         34 my $flag = shift;
133 14         87 $self->{_cache}{deduplicated} = !!$flag;
134             }
135 26         108 return $self->{_cache}{deduplicated};
136             }
137            
138             sub _set_weights_piddle {
139 72     72   174 my ($self, $data) = @_;
140 72         227 $self->{weights_piddle} = PDL->pdl ($data);
141             }
142            
143             sub _get_weights_piddle {
144 245     245   390 my $self = shift;
145 245         962 return $self->{weights_piddle};
146             }
147            
148             sub _count {
149 44     44   81 my $self = shift;
150 44         109 my $piddle = $self->_get_weights_piddle;
151 44 50       131 return undef if !defined $piddle;
152 44         184 return $piddle->sum;
153             }
154            
155             sub _sum {
156 3     3   8 my $self = shift;
157 3         13 return ($self->_get_piddle * $self->_get_weights_piddle)->sum;
158             }
159            
160             sub _sum_weights {
161 34     34   57 my $self = shift;
162            
163 34         105 return $self->_get_weights_piddle->sum;
164             }
165            
166             sub _sum_sqr_weights {
167 3     3   7 my $self = shift;
168 3         12 return $self->_get_weights_piddle->power(2)->sum;
169             }
170            
171             #sub min_weight {
172             # my $self = shift;
173             # return $self->_get_weights_piddle->min;
174             #}
175            
176            
177             sub _mean {
178 25     25   54 my $self = shift;
179            
180 25         59 my $data = $self->_get_piddle;
181 25         54 my $wts = $self->_get_weights_piddle;
182 25         162 return ($data * $wts)->sum / $wts->sum;
183             }
184            
185            
186             sub _standard_deviation {
187 22     22   42 my $self = shift;
188            
189 22         61 my $data = $self->_get_piddle;
190 22         41 my $sd;
191 22         60 my $n = $data->nelem;
192 22 100       79 if ($n > 1) {
    50          
193             # long winded approach
194 21         42 my $wts = $self->_get_weights_piddle;
195 21         57 my $mean = $self->mean;
196 21         456 my $sumsqr = ($wts * (($data - $mean) ** 2))->sum;
197 21         1576 my $var = $sumsqr / $self->sum_weights;
198 21         343 $sd = sqrt $var;
199             }
200             elsif ($n == 1){
201 1         3 $sd = 0;
202             }
203            
204 22         390 return $sd;
205             }
206            
207             sub variance {
208 5     5 1 3893 my $self = shift;
209 5         19 my $sd = $self->standard_deviation;
210 5 100       33 return defined $sd ? $sd ** 2 : undef;
211             }
212            
213             sub _median {
214 10     10   22 my $self = shift;
215            
216 10         31 my $data = $self->_sort_piddle;
217 10         52 my $cumsum = $self->_get_cumsum_weight_vector;
218            
219 10         272 my $target_wt = $self->sum_weights * 0.5;
220             # vsearch should be faster since it uses a binary search
221 10         31 my $idx = PDL->pdl($target_wt)->vsearch_insert_leftmost($cumsum->reshape);
222            
223 10         895 return $data->at($idx);
224             }
225            
226             sub _sort_piddle {
227 55     55   123 my $self = shift;
228 55         1534 my $data = $self->_get_piddle;
229            
230 55 50       164 return undef if !defined $data;
231            
232 55 100       193 return $data if $self->{sorted};
233            
234 22         64 my $wts = $self->_get_weights_piddle;
235 22         580 my $s = $data->qsorti->reshape;
236 22         909 my $sorted_data = $data->slice($s);
237 22         1234 my $sorted_wts = $wts->slice($s);
238            
239 22         862 $self->_set_piddle($sorted_data);
240 22         2215 $self->_set_weights_piddle($sorted_wts);
241            
242 22         1686 $self->{sorted} = 1;
243 22         56 delete $self->{_cache}{cumsum_weight_vector};
244            
245 22         238 return $sorted_data;
246             }
247            
248             # de-duplicate if needed, aggregating weights
249             # there should be a sumover or which approach that will work better
250             # maybe yvals related
251             sub _deduplicate {
252 10     10   3448 my ($self, %args) = @_;
253 10         37 my $piddle = $self->_get_piddle;
254            
255             return undef
256 10 50       34 if !defined $piddle;
257            
258 10 100       35 return $self
259             if $self->values_are_unique;
260            
261 9         52 my $unique = $piddle->uniq;
262            
263 9 100       5250 if ($unique->nelem == $piddle->nelem) {
264 3         16 $self->values_are_unique(1);
265 3         15 return $self
266             }
267            
268 6 100       29 if (!$self->{sorted}) {
269 3         38 $unique = $unique->qsort;
270             }
271            
272 6         22 $piddle = $self->_sort_piddle;
273 6         19 my $wts_piddle = $self->_get_weights_piddle;
274            
275             # could use a map with a hash, but this avoids
276             # stringification and loss of precision
277             # (not that that should cause too many issues for most data)
278             # Should try to reduce looping when there are
279             # not many dups in large data sets
280 6         29 my $j = 0; # index into deduplicated piddle
281 6         24 my $last_val = $piddle->at(0);
282 6         69 my @wts;
283 6         31 foreach my $i (0..$piddle->nelem-1) {
284 204         1335 my $val = $piddle->at($i);
285 204 100       1325 if ($val != $last_val) {
286 168         229 $j++;
287 168         337 $last_val = $val;
288             }
289 204         416 $wts[$j] += $wts_piddle->at($i);
290             }
291            
292 6 100       46 if ($args{inplace}) {
293 2         7 $self->_set_piddle($unique);
294 2         178 $self->_set_weights_piddle(\@wts);
295 2         79 $self->clear_cache;
296 2         6 $self->values_are_unique (1);
297 2         12 return $self;
298             }
299            
300 4         17 my $new = $self->new($self->{data_type});
301 4         17 $new->_set_piddle($unique);
302 4         300 $new->_set_weights_piddle(\@wts);
303 4         228 $new->values_are_unique (1);
304            
305 4         43 return $new;
306             }
307            
308             sub _get_cumsum_weight_vector {
309 49     49   87 my $self = shift;
310            
311             return $self->{_cache}{cumsum_weight_vector}
312 49 100       295 if defined $self->{_cache}{cumsum_weight_vector};
313             return $self->{_cache}{cumsum_weight_vector}
314 19         53 = $self->_get_weights_piddle->cumusumover->reshape;
315             }
316            
317             sub _skewness {
318 13     13   23 my $self = shift;
319            
320 13         54 my $data = $self->_get_piddle;
321            
322             # long winded approach
323 13         36 my $mean = $self->mean;
324 13         33 my $sd = $self->standard_deviation;
325 13         44 my $wts = $self->_get_weights_piddle;
326 13         47 my $sumpow3 = ($wts * ((($data - $mean) / $sd) ** 3))->sum;
327 13         1049 my $skew = $sumpow3 / $self->sum_weights;
328 13         186 return $skew;
329             }
330            
331             sub _kurtosis {
332 13     13   22 my $self = shift;
333            
334 13         31 my $data = $self->_get_piddle;
335            
336             # long winded approach
337 13         49 my $mean = $self->mean;
338 13         38 my $sd = $self->standard_deviation;
339 13         40 my $wts = $self->_get_weights_piddle;
340 13         50 my $sumpow4 = ($wts * ((($data - $mean) / $sd) ** 4))->sum;
341 13         1200 my $kurt = $sumpow4 / $self->sum_weights - 3;
342 13         368 return $kurt;
343             }
344            
345            
346             sub _harmonic_mean {
347 6     6   16 my $self = shift;
348            
349 6         19 my $data = $self->_get_piddle;
350            
351             # not sure about this...
352 6 100       31 return undef if $data->which->nelem != $data->nelem;
353            
354 5         2411 my $wts = $self->_get_weights_piddle;
355            
356 5         28 my $hs = ((1 / $data) * $wts)->sum;
357            
358 5 100       318 return $hs ? $self->count / $hs : undef;
359             }
360            
361             sub _geometric_mean {
362 5     5   13 my $self = shift;
363            
364 5         21 my $data = $self->_get_piddle;
365            
366             # should add a sorted status check, as we can use vsearch in such cases
367 5 50       29 return undef if $data->where($data < 0)->nelem;
368            
369 5         920 my $wts = $self->_get_weights_piddle;
370            
371             # formula from https://en.wikipedia.org/wiki/Weighted_geometric_mean
372 5         254 return exp (($data->log * $wts)->sum / $wts->sum);
373             }
374            
375             sub _mode {
376 4     4   9 my $self = shift;
377            
378             # de-duplicate and aggregate weights if needed
379 4         23 my $deduped = $self->_deduplicate;
380            
381 4         19 my $data = $deduped->_get_piddle;
382            
383            
384 4         13 my $wts = $deduped->_get_weights_piddle;
385 4         135 my $mode = $data->at($wts->maximum_ind);
386 4 50       272 if ($mode > $data->max) {
387             # PDL returns strange numbers when distributions are flat
388 0         0 $mode = undef;
389             }
390            
391 4         526 return $mode;
392             }
393            
394             # need to convert $p to fraction, or perhaps die if it is between 0 and 1
395             sub _percentile {
396 5     5   15 my ($self, $p) = @_;
397            
398 5         17 my $data = $self->_get_piddle;
399             return undef
400 5 50 33     26 if !defined $data or $data->isempty;
401            
402 5         44 $data = $self->_sort_piddle;
403            
404 5         16 my $cumsum = $self->_get_cumsum_weight_vector;
405            
406 5         52 my $target_wt = $self->sum_weights * ($p / 100);
407            
408 5         19 my $idx = PDL->pdl($target_wt)->vsearch_insert_leftmost($cumsum->reshape);
409            
410 5         449 return $data->at($idx);
411             }
412            
413            
414             1;
415            
416             __END__