File Coverage

blib/lib/Statistics/KernelEstimation.pm

Criterion	Covered	Total	%
statement	198	266	74.4
branch	51	104	49.0
condition	17	30	56.6
subroutine	33	38	86.8
pod	17	17	100.0
total	316	455	69.4

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::KernelEstimation;
2
3	5			5		115873	use 5.008008;
	5					21
	5					197
4	5			5		28	use strict;
	5					9
	5					213
5	5			5		43	use warnings;
	5					20
	5					168
6
7	5			5		25	use Carp;
	5					8
	5					24834
8
9							our $VERSION = '0.05';
10
11							# =================================================================
12							# TO DOs
13							#
14							# - More unit tests
15							# - bandidth from data
16							# - math function
17							# - optimization
18							#
19							# - opt broken for epanechnikov
20							# - max number of integration steps in stiffness integral
21
22							# =================================================================
23							# Ctors
24
25							sub new {
26	2			2	1	29	return new_gauss( @_ );
27							}
28
29							sub new_gauss {
30	2			2	1	9	my $self = _new( @_ );
31
32	2					16	$self->{pdf} = \&_gauss_pdf;
33	2					6	$self->{cdf} = \&_gauss_cdf;
34
35	2					7	$self->{curvature} = \&_gauss_curvature;
36	2					4	$self->{extension} = 3;
37
38	2					7	return $self;
39							}
40
41							sub new_box {
42	1			1	1	18	my $self = _new( @_ );
43
44	1					10	$self->{pdf} = \&_box_pdf;
45	1					3	$self->{cdf} = \&_box_cdf;
46
47	1					3	$self->{curvature} = \&_box_curvature;
48	1					2	$self->{extension} = 0;
49
50	1					3	$self->{optimizable} = 0;
51
52	1					4	return $self;
53
54							}
55
56							sub new_epanechnikov {
57	1			1	1	14	my $self = _new( @_ );
58
59	1					6	$self->{pdf} = \&_epanechnikov_pdf;
60	1					3	$self->{cdf} = \&_epanechnikov_cdf;
61
62	1					3	$self->{curvature} = \&_epanechnikov_curvature;
63	1					1	$self->{extension} = 0;
64
65	1					2	$self->{optimizable} = 0;
66
67	1					5	return $self;
68							}
69
70							sub _new {
71	4			4		13	my ( $class ) = @_;
72	4					40	bless { data => [],
73							sum_x => 0,
74							sum_x2 => 0,
75							sum_cnt => 0,
76							min => undef,
77							max => undef,
78							optimizable => 1 }, $class;
79							}
80
81							# =================================================================
82							# Accessors
83
84							sub count {
85	15831			15831	1	29249	my ( $self ) = @_;
86	15831					34683	return $self->{sum_cnt};
87							}
88
89							sub range {
90	37			37	1	1177	my ( $self ) = @_;
91
92	37	100				89	if( wantarray ) {
93	31					89	return ( $self->{min}, $self->{max} );
94							}
95
96	6					25	return $self->{max};
97							}
98
99							sub extended_range {
100	27			27	1	27	my ( $self ) = @_;
101
102	27					50	my ( $min, $max ) = $self->range();
103	27					62	my $w = $self->{extension}*$self->default_bandwidth();
104
105	27	50				50	if( wantarray ) {
106	27					60	return ( $min - $w, $max + $w );
107							}
108
109	0					0	return $max + $w;
110							}
111
112							sub default_bandwidth {
113	34			34	1	1117	my ( $self ) = @_;
114
115	34	100				86	if( $self->{sum_cnt} == 0 ) { return undef; }
	3					13
116
117	31					53	my $x = $self->{sum_x}/$self->{sum_cnt};
118	31					52	my $x2 = $self->{sum_x2}/$self->{sum_cnt};
119	31					52	my $sigma = sqrt( $x2 - $x**2 );
120
121							# This is the optimal bandwidth if the point distribution is Gaussian.
122							# (Applied Smoothing Techniques for Data Analysis
123							# by Adrian W, Bowman & Adelchi Azzalini (1997)) */
124	31					146	return $sigma * ( (3.0$self->{sum_cnt}/4.0)*(-1.0/5.0) );
125							}
126
127							# =================================================================
128							# Adding Data
129
130							sub add_data {
131	18			18	1	1511	my ( $self, $x, $y, $w ) = @_;
132
133	18	50				41	unless( _isNumber( $x ) ) { croak "Input ,$x, is not numeric."; }
	0					0
134
135	18	100				64	if( !defined( $y ) ) {
		50
136	12					522	$y = 1;
137							} elsif( !_notNegative( $y ) ) {
138	0					0	croak "Weight ,$y, must be non-negative number.";
139							}
140
141	18	50	33			1124	if( defined( $w ) && !_isPositive( $w ) ) {
142	0					0	croak "Bandwidth ,$w, must be strictly positive number in add_data.";
143							}
144
145							# If no bandwidth has been specified, $w will be undef!
146	18					33	push @{ $self->{data} }, { pos => $x, cnt => $y, wid => $w };
	18					170
147
148							# Update summary statistics as we go along:
149	18					410	$self->{sum_x} += $y*$x;
150	18					33	$self->{sum_x2} += $y$x$x;
151	18					26	$self->{sum_cnt} += $y;
152
153	18	100				28	if( scalar @{ $self->{data} } == 1 ) {
	18					770
154	4					9	$self->{min} = $x;
155	4					28	$self->{max} = $x;
156							} else {
157	14	100				46	$self->{min} = $x < $self->{min} ? $x : $self->{min};
158	14	100				43	$self->{max} = $x > $self->{max} ? $x : $self->{max};
159							}
160
161	18					687	return;
162							}
163
164							# =================================================================
165							# Kernel Estimate
166
167							sub pdf {
168	14005			14005	1	69321	my $self = shift;
169	14005					27996	return $self->_impl( 'pdf', 'default', @_ );
170							}
171
172							sub pdf_width_from_data {
173	0			0	1	0	my $self = shift;
174	0					0	return $self->_impl( 'pdf', 'fromdata', @_ );
175							}
176
177							# sub pdf_optimal {
178							# my $self = shift;
179							# return $self->_impl( 'pdf', 'optimal', @_ );
180							# }
181
182
183							sub cdf {
184	3			3	1	1220	my $self = shift;
185	3					17	return $self->_impl( 'cdf', 'default', @_ );
186							}
187
188							sub cdf_width_from_data {
189	0			0	1	0	my $self = shift;
190	0					0	return $self->_impl( 'cdf', 'fromdata', @_ );
191							}
192
193							# sub cdf_optimal {
194							# my $self = shift;
195							# return $self->_impl( 'cdf', 'optimal', @_ );
196							# }
197
198							sub _curvature {
199	1787			1787		1946	my $self = shift;
200	1787					3223	return $self->_impl( 'curvature', 'default', @_ );
201							}
202
203							sub _impl {
204	15795			15795		20915	my ( $self, $mode, $bandwidth_mode, $x, $w ) = @_;
205
206	15795	50	100			42656	unless( $mode eq 'pdf' \|\| $mode eq 'cdf'
			66
207	0					0	\|\| $mode eq 'curvature' ) { die "Illegal mode: ,$mode,"; }
208	15795	50				22498	unless( _isNumber( $x ) ) { croak "Position ,$x, must be numeric."; }
	0					0
209
210							# If no data (or only data w/ weight zero), return immediately
211	15795					28220	my $count = $self->count();
212	15795	50				29471	if( $count == 0 ) { return 0; }
	0					0
213
214							# If bandwidth is from data, calculate result and return immediately
215	15795	50				26413	if( $bandwidth_mode eq 'fromdata' ) {
216	0					0	my $y = 0;
217	0					0	for my $p ( @{ $self->{data} } ) {
	0					0
218	0	0				0	unless( defined $p->{wid} ) {
219	0					0	carp "Undefined bandwidth in data at position " . $p->{pos}
220							. ". Using default bandwidth.";
221	0					0	$w = $self->default_bandwidth();
222							} else {
223	0					0	$w = $p->{wid};
224							}
225
226	0					0	$y += $p->{cnt} * $self->{$mode}( $x, $p->{pos}, $w );
227							}
228	0					0	return $y/$count;
229							}
230
231							# ... otherwise, determine bandwidth
232	15795	50				35796	if( $bandwidth_mode eq 'default' ) {
233	15795	50				38210	if( !defined( $w ) ) {
		50
234	0					0	$w = $self->default_bandwidth();
235							} elsif( !_notNegative( $w ) ) {
236	0					0	croak "Bandwidth ,$w, must be strictly positive number.";
237							}
238
239							# } elsif( $bandwidth_mode eq 'optimal' ) {
240							# $w = $self->optimal_bandwidth();
241							}
242
243							# ... now use bandwidth from above to find result
244	15795					17421	my $y = 0;
245	15795					15095	for my $q ( @{ $self->{data} } ) {
	15795					31954
246	45171					98838	$y += $q->{cnt} * $self->{$mode}( $x, $q->{pos}, $w );
247							}
248
249	15795					68327	return $y/$count;
250							}
251
252							# =================================================================
253							# Classical Histograms
254
255							sub histogram {
256	2			2	1	12	my ( $self, $bins ) = @_;
257
258	2	50	33			8	unless( _isPositive( $bins ) && $bins==int($bins) ) {
259	0					0	croak "Number of bins must be strictly positive integer.";
260							}
261
262	2	100				7	if( $self->count() == 0 ) { return []; }
	1					7
263
264	1					5	my ( $min, $max ) = $self->range();
265
266	1	50				4	if( $bins == 1 ) {
267	0					0	return [ { pos => 0.5*($max-$min), cnt => $self->count() } ];
268							}
269
270	1					4	my $w = ($max - $min)/($bins - 1);
271
272	1					3	my @histo = ();
273	1					3	for my $k ( 0..$bins-1 ) {
274	9					28	push @histo, { pos => $min + $k*$w, cnt => 0 };
275							}
276
277	1					3	for my $p ( @{ $self->{data} } ) {
	1					3
278	6					14	my $i = int( ($p->{pos} - ( $min - 0.5*$w ) )/$w );
279	6					12	my ( $lo, $hi ) = ( $min + ($i-0.5)$w, $min + ($i+0.5)$w );
280
281	6					11	my $x = $p->{pos};
282
283	6	50	33			30	if( $x < $lo ) { $i -= 1; }
	0	50				0
		0
284	6					8	elsif( $lo <= $x && $x < $hi ) { $i = $i; }
285	0					0	elsif( $hi <= $x ) { $i += 1; }
286
287	6					15	$histo[ $i ]->{cnt} += $p->{cnt};
288							}
289
290	1					4	return \@histo;
291							}
292
293							sub distribution_function {
294	2			2	1	8	my ( $self ) = @_;
295
296	2					5	my @sorted = sort { $a->{pos} <=> $b->{pos} } @{ $self->{data} };
	10					529
	2					17
297
298	2					5	my @dist = ();
299	2					6	my $cumul = 0;
300	2					10	for my $p ( @sorted ) {
301	6					241	$cumul += $p->{cnt};
302	6					26	push @dist, { pos => $p->{pos}, cnt => $cumul };
303							}
304
305	2					29	return \@dist;
306							}
307
308							# =================================================================
309							# Input validation
310
311							# In general: undef evaluates to invalid input!
312
313							sub _isNumber {
314	31616			31616		33308	my ( $in ) = @_;
315	31616	50	33			213859	if( defined( $in ) &&
316	31616					103730	$in =~ /^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$/ ) { return 1; }
317	0					0	return 0;
318							}
319
320							sub _isPositive {
321	2			2		4	my ( $in ) = @_;
322	2	50	33			8	if( _isNumber( $in ) && $in > 0 ) { return 1; }
	2					15
323	0					0	return 0;
324							}
325
326							sub _notNegative {
327	15801			15801		15833	my ( $in ) = @_;
328	15801	50	33			24310	if( _isNumber( $in ) && $in >= 0 ) { return 1; }
	15801					37519
329	0					0	return 0;
330							}
331
332							# =================================================================
333							# Optimal Bandwidth
334
335							# Development history:
336							#
337							# Equation solver:
338							# 1) Straight iteration
339							# 2) Newton's method
340							# 3) Sekant method
341							# 4) Bisection
342							#
343							# Integration:
344							# 1) Numerical differentiation
345							# 2) Numerical differentiation, with equal step width as integration
346							# 3) Symbolic differentiation
347							# 4) Adaptive step size integration
348							# 5) Romberg integration (not implemented)
349
350							# This routine solves the equation encoded in _optimal_bandwidth_equation
351							# using the secant method.
352
353							sub optimal_bandwidth {
354	2			2	1	2	my $self = shift;
355	2	50				6	my $n = @_ ? shift : 25;
356	2	50				5	my $eps = @_ ? shift : 1e-3;
357
358	2	50				7	unless( $self->{optimizable} ) {
359	0					0	croak "Bandwidth Optimization not available for this type of kernel.";
360							}
361
362	2	100				6	if( $self->{sum_cnt} == 0 ) { return undef; }
	1					4
363
364	1					3	my $x0 = $self->default_bandwidth();
365	1					4	my $y0 = $self->_optimal_bandwidth_equation( $x0 );
366
367	1					3	my $x = 0.8*$x0;
368							# my $x = $x0 * ( 1 - 1e-6 );
369	1					4	my $y = $self->_optimal_bandwidth_equation( $x );
370
371	1					3	my $dx = 0;
372
373	1					3	my $i = 0;
374	1					5	while( $i++ < $n ) {
375	25					44	$x -= $y*($x0-$x)/($y0-$y);
376	25					45	$y = $self->_optimal_bandwidth_equation( $x );
377
378	25	50				97	if( abs($y) < $eps*$y0 ) { last }
	0					0
379							}
380
381	1	50				6	if( wantarray ) { return ( $x, $i ); }
	0					0
382	1					9	return $x;
383							}
384
385							# This routine uses the secant method.
386
387							sub optimal_bandwidth_safe {
388	0			0	1	0	my $self = shift;
389	0	0				0	my $x0 = @_ ? shift : $self->default_bandwidth() / $self->count();
390	0	0				0	my $x1 = @_ ? shift : 2*$self->default_bandwidth();
391	0	0				0	my $eps = @_ ? shift : 1e-3;
392
393	0	0				0	unless( $self->{optimizable} ) {
394	0					0	croak "Bandwidth Optimization not available for this type of kernel.";
395							}
396
397	0	0				0	if( $self->{sum_cnt} == 0 ) { return undef; }
	0					0
398
399	0					0	my $y0 = $self->_optimal_bandwidth_equation( $x0 );
400	0					0	my $y1 = $self->_optimal_bandwidth_equation( $x1 );
401
402	0	0				0	unless( $y0 * $y1 < 0 ) {
403	0					0	croak "Interval [ f(x0=$x0)=$y0 : f(x1=$x1)=$y1 ] does not bracket root.";
404							}
405
406	0					0	my ( $x, $y, $i ) = ( 0, 0, 0 );
407	0					0	while( abs( $x0 - $x1 ) > $eps*$x1 ) {
408	0					0	$i += 1;
409
410	0					0	$x = ( $x0 + $x1 )/2;
411	0					0	$y = $self->_optimal_bandwidth_equation( $x );
412
413	0	0				0	if( abs( $y ) < $eps*$y0 ) { last }
	0					0
414
415	0	0				0	if( $y * $y0 < 0 ) {
416	0					0	( $x1, $y1 ) = ( $x, $y );
417							} else {
418	0					0	( $x0, $y0 ) = ( $x, $y );
419							}
420							}
421
422	0	0				0	if( wantarray ) { return ( $x, $i ); }
	0					0
423	0					0	return $x;
424							}
425
426							# This routine encodes the self-consistent equation that is fulfilled
427							# by the optimal bandwidth. Notation according to Bowman & Azzalini.
428
429							sub _optimal_bandwidth_equation {
430	27			27		38	my ( $self, $w ) = @_;
431
432	27					29	my $alpha = 1.0/(2.0*sqrt( 3.14159265358979323846 ) );
433	27					28	my $sigma = 1.0;
434	27					42	my $n = $self->count();
435	27					55	my $q = $self->_stiffness_integral( $w );
436
437	27					137	return $w - ( ($n$q$sigma4)/$alpha )(-1.0/5.0);
438							}
439
440							# This routine calculates the integral over the square of the curvature
441							# (it: Int (f'')**2 ) using the trapezoidal rule. The routine decreases
442							# the step size by half until the relative error in the last step is less
443							# than epsilon.
444
445							sub _stiffness_integral {
446	27			27		29	my ( $self, $w ) = @_;
447
448	27					54	my $eps = 1e-4;
449
450	27					64	my ( $mn, $mx ) = $self->extended_range();
451	27					33	my $n = 1;
452	27					36	my $dx = ($mx-$mn)/$n;
453
454	27					49	my $yy = 0.5($self->_curvature($mn,$w)2+$self->_curvature($mx,$w)2)$dx;
455
456							# The trapezoidal rule guarantees a relative error of better than eps
457							# for some number of steps less than maxn.
458	27					51	my $maxn = ($mx-$mn)/sqrt($eps);
459
460							# This is not ideal, but I want to cap the total computation spent here:
461	27	50				47	$maxn = ( $maxn > 2048 ? 2048 : $maxn );
462
463	27					71	for( my $n=2; $n<=$maxn; $n*=2 ) {
464	162					171	$dx /= 2.0;
465
466	162					153	my $y = 0;
467	162					333	for( my $i=1; $i<=$n-1; $i+=2 ) {
468	1733					3730	$y += $self->_curvature( $mn + $i$dx, $w )*2;
469							}
470	162					198	$yy = 0.5$yy + $y$dx;
471
472							# Make at least 8 steps, then evaluate the relative change between steps
473	162	100	100			689	if( $n > 8 && abs($y$dx-0.5$yy) < $eps*$yy ) { last }
	27					42
474							}
475
476	27					51	return $yy;
477							}
478
479							# =================================================================
480							# Kernels
481
482							sub _gauss_pdf {
483	17157			17157		18811	my ( $x, $m, $s ) = @_;
484	17157					18890	my $z = ($x - $m)/$s;
485	17157					47285	return exp(-0.5$z$z)/( $ssqrt( 2.03.14159265358979323846 ) );
486							}
487
488							# Abramowitz & Stegun, 26.2.17
489							sub _gauss_cdf {
490	4			4		7	my ( $x, $m, $s ) = @_;
491
492	4					6	my $z = abs( $x - $m)/$s;
493	4					7	my $t = 1.0/(1.0 + 0.2316419*$z);
494	4					10	my $y = $t*( 0.319381530
495							+ $t*( -0.356563782
496							+ $t*( 1.781477937
497							+ $t( -1.821255978 + $t1.330274429 ) ) ) );
498
499	4	50				7	if( $x >= $m ) {
500	4					9	return 1.0 - _gauss_pdf( $x, $m, $s )$y$s;
501							} else {
502	0					0	return _gauss_pdf( $x, $m, $s )$y$s;
503							}
504							}
505
506							sub _gauss_curvature {
507	7148			7148		8535	my ( $x, $m, $s ) = @_;
508	7148					8936	my $z = ($x - $m)/$s;
509	7148					12276	return ($z*2 - 1.0)_gauss_pdf( $x, $m, $s )/$s**2;
510							}
511
512							sub _box_pdf {
513	10005			10005		14258	my ( $x, $m, $s ) = @_;
514	10005	100	100			39502	if( $x < $m-0.5$s \|\| $x > $m+0.5$s ) { return 0.0; }
	9505					27942
515	500					1498	return 1.0/$s;
516							}
517
518							sub _box_cdf {
519	4			4		6	my ( $x, $m, $s ) = @_;
520	4	50				13	if( $x < $m-0.5*$s ) { return 0.0; }
	0					0
521	4	50				11	if( $x > $m+0.5*$s ) { return 1.0; }
	4					21
522	0					0	return ( $x-$m )/$s + 0.5;
523							}
524
525							sub _box_curvature {
526	0			0		0	return 0;
527							}
528
529							sub _epanechnikov_pdf {
530	18001			18001		18733	my ( $x, $m, $s ) = @_;
531	18001					18983	my $z = ($x-$m)/$s;
532	18001	100				30869	if( abs($z) > 1 ) { return 0.0; }
	16201					35040
533	1800					4465	return 0.75(1-$z*2)/$s;
534							}
535
536							sub _epanechnikov_cdf {
537	4			4		6	my ( $x, $m, $s ) = @_;
538	4					7	my $z = ($x-$m)/$s;
539	4	50				9	if( $z < -1 ) { return 0.0; }
	0					0
540	4	50				18	if( $z > 1 ) { return 1.0; }
	4					12
541	0						return 0.25(2.0 + 3.0$z - $z**3 );
542							}
543
544							sub _epanechnikov_curvature {
545	0			0			my ( $x, $m, $s ) = @_;
546	0						my $z = ($x-$m)/$s;
547	0	0					if( abs($z) > 1 ) { return 0; }
	0
548	0						return -1.5/$s**3;
549							}
550
551							1;
552
553							__END__