File Coverage

lib/Statistics/Descriptive/PDL/SampleWeighted.pm

Criterion	Covered	Total	%
statement	75	77	97.4
branch	11	16	68.7
condition	4	9	44.4
subroutine	13	13	100.0
pod			n/a
total	103	115	89.5

line	stmt	bran	cond	sub	time	code
1						package Statistics::Descriptive::PDL::SampleWeighted;
2
3	3			3	1796	use 5.010;
	3				12
4	3			3	42	use strict;
	3				7
	3				75
5	3			3	52	use warnings;
	3				5
	3				158
6
7						# avoid loading too much, especially into our name space
8	3			3	14	use PDL::Lite '2.012';
	3				7
	3				35
9
10						# this is otherwise not loaded due to oddities with multiple loading of PDL::Lite
11						#*pdl = \&PDL::Core::pdl;
12
13						# We could inherit from PDL::Objects, but in this case we want
14						# to hide the piddle from the caller to avoid arbitrary changes
15						# being applied to it.
16
17						## no critic (ProhibitExplicitReturnUndef)
18
19						our $VERSION = '0.17';
20
21	3			3	1638	use parent 'Statistics::Descriptive::PDL::Weighted';
	3				702
	3				17
22
23						my @cache_methods = qw /
24						median
25						standard_deviation skewness kurtosis
26						/;
27						__PACKAGE__->_make_caching_accessors( \@cache_methods );
28
29
30
31	15			15	57	sub _wt_type{PDL::long}
32
33
34						sub _standard_deviation {
35	4			4	10	my $self = shift;
36
37	4				13	my $data = $self->_get_piddle;
38
39	4				16	my $wts = $self->_get_weights_piddle;
40	4				18	my $n = $wts->sum;
41
42	4	50			82	return 0 if $n == 1;
43
44	4				493	my $var = ((($data ** 2) * $wts)->sum - $n * $self->mean ** 2);
45
46	4	50			198	return $var > 0 ? sqrt ($var / ($n - 1)) : 0;
47						}
48
49
50						sub _median {
51	9			9	21	my $self = shift;
52
53	9				38	my $piddle = $self->_sort_piddle;
54	9				42	my $cumsum = $self->_get_cumsum_weight_vector;
55
56	9				106	my $target_wt = $self->sum_weights * 0.5;
57						# vsearch should be faster since it uses a binary search
58	9				54	my $idx = PDL->pdl($target_wt)->vsearch_insert_leftmost($cumsum);
59
60						# if the target weight is "on a boundary" between
61						# two values then we need to interpolate
62	9	100			724	my $median
63						= $target_wt == $cumsum->at($idx)
64						? ($piddle->at($idx) + $piddle->at($idx+1)) / 2
65						: $piddle->at($idx);
66
67	9				1503	return $median;
68						}
69
70
71						sub _skewness {
72	4			4	9	my $self = shift;
73
74	4				22	my $n = $self->sum_weights;
75	4	100			16	return undef if $n < 3;
76
77	3				12	my $data = $self->_get_piddle;
78
79						# long winded approach
80	3				20	my $mean = $self->mean;
81	3				13	my $sd = $self->standard_deviation;
82	3				14	my $wts = $self->_get_weights_piddle;
83	3				16	my $sumpow3 = ($wts * ((($data - $mean) / $sd) ** 3))->sum;
84						# inplace seems not to be faster here.
85						# Possibly PDL is smart enough to do it by default
86						# in such cases
87						#my $sumpow3 = ($data - $mean)->inplace->divide($sd, 0)->pow(3)->mult($wts, 0)->sum;
88	3				406	my $correction = $n / ( ($n-1) * ($n-2) );
89	3				42	my $skew = $correction * $sumpow3;
90	3				83	return $skew;
91						}
92
93						sub _kurtosis {
94	4			4	12	my $self = shift;
95
96	4				16	my $n = $self->sum_weights;
97	4	100			20	return undef if $n <= 3;
98
99	3				12	my $data = $self->_get_piddle;
100	3				70	my $mean = $self->mean;
101	3				48	my $sd = $self->standard_deviation;
102	3				42	my $wts = $self->_get_weights_piddle;
103
104	3				12	my $sumpow4 = ($wts * ((($data - $mean) / $sd) ** 4))->sum;
105
106	3				317	my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) );
107	3				15	my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) );
108
109	3				9	return ( $correction1 * $sumpow4 ) - $correction2;
110						}
111
112						# crude memoisation - would be nice to use
113						# state but it has issues with lists on older perls
114						my %k_piddle_cache;
115
116						# Uses same basic algorithm as PDL::pctl.
117						sub _percentile {
118	25			25	70	my ($self, $p) = @_;
119
120	25				73	my $piddle = $self->_get_piddle;
121
122						return undef
123	25	50	33		110	if !defined $piddle or $piddle->isempty;
124
125	25				263	$self->_sort_piddle;
126
127						# possible slowdown here - users need to dedup before calling to avoid
128						# my $deduped = $self->_deduplicate;
129						# there is actually no need to dedup
130	25				43	my $deduped = $self;
131	25				74	$piddle = $deduped->_get_piddle;
132
133						# my $wt_piddle = $self->_get_weights_piddle;
134
135	25				115	my $cumsum = $deduped->_get_cumsum_weight_vector;
136	25				337	my $wt_sum = $deduped->sum_weights;
137
138	3			3	2014	use POSIX qw /floor/;
	3				5
	3				28
139
140	25				142	my $target_wt = ($p / 100) * ($wt_sum - 1) + 1;
141	25				99	my $k = floor $target_wt;
142	25				55	my $d = $target_wt - $k;
143
144	25		66		556	my $idx = ($k_piddle_cache{$k} //= PDL->pdl(PDL::indx(), [$k]))->vsearch_insert_leftmost($cumsum)->at(0);
145
146	25	50			2136	if (scalar keys %k_piddle_cache > 10000) {
147						# Reset if we get too many
148						# - could be more nuanced based on frequency
149						# but then we would have to track it
150	0				0	%k_piddle_cache = ();
151						}
152
153						# we need to interpolate if our target weight falls between two sets of weights
154						# e.g. target is 1.3, but the cumulative weights are [1,2] or [1,5]
155	25				77	my $fraction = $target_wt - ($cumsum->at($idx));
156	25	50	33		321	if ($fraction > 0 && $fraction < 1) {
157	25				67	my $lower_val = $piddle->at($idx);
158	25				243	my $upper_val = $piddle->at($idx+1);
159	25				205	my $val = $lower_val + $d * ($upper_val - $lower_val);
160	25				95	return $val;
161						}
162
163	0				0	return $piddle->at($idx);
164						}
165
166						# weight for each sample is 1
167						sub _sum_sqr_sample_weights {
168	2			2	7	my $self = shift;
169	2				12	return $self->sum_weights;
170						}
171
172
173
174						1;
175
176						__END__