| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Statistics::KernelEstimation; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 5 |  |  | 5 |  | 115873 | use 5.008008; | 
|  | 5 |  |  |  |  | 21 |  | 
|  | 5 |  |  |  |  | 197 |  | 
| 4 | 5 |  |  | 5 |  | 28 | use strict; | 
|  | 5 |  |  |  |  | 9 |  | 
|  | 5 |  |  |  |  | 213 |  | 
| 5 | 5 |  |  | 5 |  | 43 | use warnings; | 
|  | 5 |  |  |  |  | 20 |  | 
|  | 5 |  |  |  |  | 168 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 | 5 |  |  | 5 |  | 25 | use Carp; | 
|  | 5 |  |  |  |  | 8 |  | 
|  | 5 |  |  |  |  | 24834 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | our $VERSION = '0.05'; | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | # ================================================================= | 
| 12 |  |  |  |  |  |  | # TO DOs | 
| 13 |  |  |  |  |  |  | # | 
| 14 |  |  |  |  |  |  | # - More unit tests | 
| 15 |  |  |  |  |  |  | #   - bandidth from data | 
| 16 |  |  |  |  |  |  | #   - math function | 
| 17 |  |  |  |  |  |  | #   - optimization | 
| 18 |  |  |  |  |  |  | # | 
| 19 |  |  |  |  |  |  | # - opt broken for epanechnikov | 
| 20 |  |  |  |  |  |  | # - max number of integration steps in stiffness integral | 
| 21 |  |  |  |  |  |  |  | 
| 22 |  |  |  |  |  |  | # ================================================================= | 
| 23 |  |  |  |  |  |  | # Ctors | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | sub new { | 
| 26 | 2 |  |  | 2 | 1 | 29 | return new_gauss( @_ ); | 
| 27 |  |  |  |  |  |  | } | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | sub new_gauss { | 
| 30 | 2 |  |  | 2 | 1 | 9 | my $self = _new( @_ ); | 
| 31 |  |  |  |  |  |  |  | 
| 32 | 2 |  |  |  |  | 16 | $self->{pdf} = \&_gauss_pdf; | 
| 33 | 2 |  |  |  |  | 6 | $self->{cdf} = \&_gauss_cdf; | 
| 34 |  |  |  |  |  |  |  | 
| 35 | 2 |  |  |  |  | 7 | $self->{curvature} = \&_gauss_curvature; | 
| 36 | 2 |  |  |  |  | 4 | $self->{extension} = 3; | 
| 37 |  |  |  |  |  |  |  | 
| 38 | 2 |  |  |  |  | 7 | return $self; | 
| 39 |  |  |  |  |  |  | } | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | sub new_box { | 
| 42 | 1 |  |  | 1 | 1 | 18 | my $self = _new( @_ ); | 
| 43 |  |  |  |  |  |  |  | 
| 44 | 1 |  |  |  |  | 10 | $self->{pdf} = \&_box_pdf; | 
| 45 | 1 |  |  |  |  | 3 | $self->{cdf} = \&_box_cdf; | 
| 46 |  |  |  |  |  |  |  | 
| 47 | 1 |  |  |  |  | 3 | $self->{curvature} = \&_box_curvature; | 
| 48 | 1 |  |  |  |  | 2 | $self->{extension} = 0; | 
| 49 |  |  |  |  |  |  |  | 
| 50 | 1 |  |  |  |  | 3 | $self->{optimizable} = 0; | 
| 51 |  |  |  |  |  |  |  | 
| 52 | 1 |  |  |  |  | 4 | return $self; | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | } | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | sub new_epanechnikov { | 
| 57 | 1 |  |  | 1 | 1 | 14 | my $self = _new( @_ ); | 
| 58 |  |  |  |  |  |  |  | 
| 59 | 1 |  |  |  |  | 6 | $self->{pdf} = \&_epanechnikov_pdf; | 
| 60 | 1 |  |  |  |  | 3 | $self->{cdf} = \&_epanechnikov_cdf; | 
| 61 |  |  |  |  |  |  |  | 
| 62 | 1 |  |  |  |  | 3 | $self->{curvature} = \&_epanechnikov_curvature; | 
| 63 | 1 |  |  |  |  | 1 | $self->{extension} = 0; | 
| 64 |  |  |  |  |  |  |  | 
| 65 | 1 |  |  |  |  | 2 | $self->{optimizable} = 0; | 
| 66 |  |  |  |  |  |  |  | 
| 67 | 1 |  |  |  |  | 5 | return $self; | 
| 68 |  |  |  |  |  |  | } | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | sub _new { | 
| 71 | 4 |  |  | 4 |  | 13 | my ( $class ) = @_; | 
| 72 | 4 |  |  |  |  | 40 | bless { data => [], | 
| 73 |  |  |  |  |  |  | sum_x   => 0, | 
| 74 |  |  |  |  |  |  | sum_x2  => 0, | 
| 75 |  |  |  |  |  |  | sum_cnt => 0, | 
| 76 |  |  |  |  |  |  | min => undef, | 
| 77 |  |  |  |  |  |  | max => undef, | 
| 78 |  |  |  |  |  |  | optimizable => 1 }, $class; | 
| 79 |  |  |  |  |  |  | } | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | # ================================================================= | 
| 82 |  |  |  |  |  |  | # Accessors | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | sub count { | 
| 85 | 15831 |  |  | 15831 | 1 | 29249 | my ( $self ) = @_; | 
| 86 | 15831 |  |  |  |  | 34683 | return $self->{sum_cnt}; | 
| 87 |  |  |  |  |  |  | } | 
| 88 |  |  |  |  |  |  |  | 
| 89 |  |  |  |  |  |  | sub range { | 
| 90 | 37 |  |  | 37 | 1 | 1177 | my ( $self ) = @_; | 
| 91 |  |  |  |  |  |  |  | 
| 92 | 37 | 100 |  |  |  | 89 | if( wantarray ) { | 
| 93 | 31 |  |  |  |  | 89 | return ( $self->{min}, $self->{max} ); | 
| 94 |  |  |  |  |  |  | } | 
| 95 |  |  |  |  |  |  |  | 
| 96 | 6 |  |  |  |  | 25 | return $self->{max}; | 
| 97 |  |  |  |  |  |  | } | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | sub extended_range { | 
| 100 | 27 |  |  | 27 | 1 | 27 | my ( $self ) = @_; | 
| 101 |  |  |  |  |  |  |  | 
| 102 | 27 |  |  |  |  | 50 | my ( $min, $max ) = $self->range(); | 
| 103 | 27 |  |  |  |  | 62 | my $w = $self->{extension}*$self->default_bandwidth(); | 
| 104 |  |  |  |  |  |  |  | 
| 105 | 27 | 50 |  |  |  | 50 | if( wantarray ) { | 
| 106 | 27 |  |  |  |  | 60 | return ( $min - $w, $max + $w ); | 
| 107 |  |  |  |  |  |  | } | 
| 108 |  |  |  |  |  |  |  | 
| 109 | 0 |  |  |  |  | 0 | return $max + $w; | 
| 110 |  |  |  |  |  |  | } | 
| 111 |  |  |  |  |  |  |  | 
| 112 |  |  |  |  |  |  | sub default_bandwidth { | 
| 113 | 34 |  |  | 34 | 1 | 1117 | my ( $self ) = @_; | 
| 114 |  |  |  |  |  |  |  | 
| 115 | 34 | 100 |  |  |  | 86 | if( $self->{sum_cnt} == 0 ) { return undef; } | 
|  | 3 |  |  |  |  | 13 |  | 
| 116 |  |  |  |  |  |  |  | 
| 117 | 31 |  |  |  |  | 53 | my $x  = $self->{sum_x}/$self->{sum_cnt}; | 
| 118 | 31 |  |  |  |  | 52 | my $x2 = $self->{sum_x2}/$self->{sum_cnt}; | 
| 119 | 31 |  |  |  |  | 52 | my $sigma = sqrt( $x2 - $x**2 ); | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | # This is the optimal bandwidth if the point distribution is Gaussian. | 
| 122 |  |  |  |  |  |  | # (Applied Smoothing Techniques for Data Analysis | 
| 123 |  |  |  |  |  |  | # by Adrian W, Bowman & Adelchi Azzalini (1997)) */ | 
| 124 | 31 |  |  |  |  | 146 | return $sigma * ( (3.0*$self->{sum_cnt}/4.0)**(-1.0/5.0) ); | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  |  | 
| 127 |  |  |  |  |  |  | # ================================================================= | 
| 128 |  |  |  |  |  |  | # Adding Data | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | sub add_data { | 
| 131 | 18 |  |  | 18 | 1 | 1511 | my ( $self, $x, $y, $w ) = @_; | 
| 132 |  |  |  |  |  |  |  | 
| 133 | 18 | 50 |  |  |  | 41 | unless( _isNumber( $x ) ) { croak "Input ,$x, is not numeric."; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 134 |  |  |  |  |  |  |  | 
| 135 | 18 | 100 |  |  |  | 64 | if( !defined( $y ) ) { | 
|  |  | 50 |  |  |  |  |  | 
| 136 | 12 |  |  |  |  | 522 | $y = 1; | 
| 137 |  |  |  |  |  |  | } elsif( !_notNegative( $y ) ) { | 
| 138 | 0 |  |  |  |  | 0 | croak "Weight ,$y, must be non-negative number."; | 
| 139 |  |  |  |  |  |  | } | 
| 140 |  |  |  |  |  |  |  | 
| 141 | 18 | 50 | 33 |  |  | 1124 | if( defined( $w ) && !_isPositive( $w ) ) { | 
| 142 | 0 |  |  |  |  | 0 | croak "Bandwidth ,$w, must be strictly positive number in add_data."; | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  |  | 
| 145 |  |  |  |  |  |  | # If no bandwidth has been specified, $w will be undef! | 
| 146 | 18 |  |  |  |  | 33 | push @{ $self->{data} }, { pos => $x, cnt => $y, wid => $w }; | 
|  | 18 |  |  |  |  | 170 |  | 
| 147 |  |  |  |  |  |  |  | 
| 148 |  |  |  |  |  |  | # Update summary statistics as we go along: | 
| 149 | 18 |  |  |  |  | 410 | $self->{sum_x}   += $y*$x; | 
| 150 | 18 |  |  |  |  | 33 | $self->{sum_x2}  += $y*$x*$x; | 
| 151 | 18 |  |  |  |  | 26 | $self->{sum_cnt} += $y; | 
| 152 |  |  |  |  |  |  |  | 
| 153 | 18 | 100 |  |  |  | 28 | if( scalar @{ $self->{data} } == 1 ) { | 
|  | 18 |  |  |  |  | 770 |  | 
| 154 | 4 |  |  |  |  | 9 | $self->{min} = $x; | 
| 155 | 4 |  |  |  |  | 28 | $self->{max} = $x; | 
| 156 |  |  |  |  |  |  | } else { | 
| 157 | 14 | 100 |  |  |  | 46 | $self->{min} = $x < $self->{min} ? $x : $self->{min}; | 
| 158 | 14 | 100 |  |  |  | 43 | $self->{max} = $x > $self->{max} ? $x : $self->{max}; | 
| 159 |  |  |  |  |  |  | } | 
| 160 |  |  |  |  |  |  |  | 
| 161 | 18 |  |  |  |  | 687 | return; | 
| 162 |  |  |  |  |  |  | } | 
| 163 |  |  |  |  |  |  |  | 
| 164 |  |  |  |  |  |  | # ================================================================= | 
| 165 |  |  |  |  |  |  | # Kernel Estimate | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | sub pdf { | 
| 168 | 14005 |  |  | 14005 | 1 | 69321 | my $self = shift; | 
| 169 | 14005 |  |  |  |  | 27996 | return $self->_impl( 'pdf', 'default', @_ ); | 
| 170 |  |  |  |  |  |  | } | 
| 171 |  |  |  |  |  |  |  | 
| 172 |  |  |  |  |  |  | sub pdf_width_from_data { | 
| 173 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 174 | 0 |  |  |  |  | 0 | return $self->_impl( 'pdf', 'fromdata', @_ ); | 
| 175 |  |  |  |  |  |  | } | 
| 176 |  |  |  |  |  |  |  | 
| 177 |  |  |  |  |  |  | # sub pdf_optimal { | 
| 178 |  |  |  |  |  |  | #   my $self = shift; | 
| 179 |  |  |  |  |  |  | #   return $self->_impl( 'pdf', 'optimal', @_ ); | 
| 180 |  |  |  |  |  |  | # } | 
| 181 |  |  |  |  |  |  |  | 
| 182 |  |  |  |  |  |  |  | 
| 183 |  |  |  |  |  |  | sub cdf { | 
| 184 | 3 |  |  | 3 | 1 | 1220 | my $self = shift; | 
| 185 | 3 |  |  |  |  | 17 | return $self->_impl( 'cdf', 'default', @_ ); | 
| 186 |  |  |  |  |  |  | } | 
| 187 |  |  |  |  |  |  |  | 
| 188 |  |  |  |  |  |  | sub cdf_width_from_data { | 
| 189 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 190 | 0 |  |  |  |  | 0 | return $self->_impl( 'cdf', 'fromdata', @_ ); | 
| 191 |  |  |  |  |  |  | } | 
| 192 |  |  |  |  |  |  |  | 
| 193 |  |  |  |  |  |  | # sub cdf_optimal { | 
| 194 |  |  |  |  |  |  | #   my $self = shift; | 
| 195 |  |  |  |  |  |  | #   return $self->_impl( 'cdf', 'optimal', @_ ); | 
| 196 |  |  |  |  |  |  | # } | 
| 197 |  |  |  |  |  |  |  | 
| 198 |  |  |  |  |  |  | sub _curvature { | 
| 199 | 1787 |  |  | 1787 |  | 1946 | my $self = shift; | 
| 200 | 1787 |  |  |  |  | 3223 | return $self->_impl( 'curvature', 'default', @_ ); | 
| 201 |  |  |  |  |  |  | } | 
| 202 |  |  |  |  |  |  |  | 
| 203 |  |  |  |  |  |  | sub _impl { | 
| 204 | 15795 |  |  | 15795 |  | 20915 | my ( $self, $mode, $bandwidth_mode, $x, $w ) = @_; | 
| 205 |  |  |  |  |  |  |  | 
| 206 | 15795 | 50 | 100 |  |  | 42656 | unless( $mode eq 'pdf' || $mode eq 'cdf' | 
|  |  |  | 66 |  |  |  |  | 
| 207 | 0 |  |  |  |  | 0 | || $mode eq 'curvature' ) { die "Illegal mode: ,$mode,"; } | 
| 208 | 15795 | 50 |  |  |  | 22498 | unless( _isNumber( $x ) ) { croak "Position ,$x, must be numeric."; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 209 |  |  |  |  |  |  |  | 
| 210 |  |  |  |  |  |  | # If no data (or only data w/ weight zero), return immediately | 
| 211 | 15795 |  |  |  |  | 28220 | my $count = $self->count(); | 
| 212 | 15795 | 50 |  |  |  | 29471 | if( $count == 0 ) { return 0; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | # If bandwidth is from data, calculate result and return immediately | 
| 215 | 15795 | 50 |  |  |  | 26413 | if( $bandwidth_mode eq 'fromdata' ) { | 
| 216 | 0 |  |  |  |  | 0 | my $y = 0; | 
| 217 | 0 |  |  |  |  | 0 | for my $p ( @{ $self->{data} } ) { | 
|  | 0 |  |  |  |  | 0 |  | 
| 218 | 0 | 0 |  |  |  | 0 | unless( defined $p->{wid} ) { | 
| 219 | 0 |  |  |  |  | 0 | carp "Undefined bandwidth in data at position " . $p->{pos} | 
| 220 |  |  |  |  |  |  | . ". Using default bandwidth."; | 
| 221 | 0 |  |  |  |  | 0 | $w = $self->default_bandwidth(); | 
| 222 |  |  |  |  |  |  | } else { | 
| 223 | 0 |  |  |  |  | 0 | $w = $p->{wid}; | 
| 224 |  |  |  |  |  |  | } | 
| 225 |  |  |  |  |  |  |  | 
| 226 | 0 |  |  |  |  | 0 | $y += $p->{cnt} * $self->{$mode}( $x, $p->{pos}, $w ); | 
| 227 |  |  |  |  |  |  | } | 
| 228 | 0 |  |  |  |  | 0 | return $y/$count; | 
| 229 |  |  |  |  |  |  | } | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | # ... otherwise, determine bandwidth | 
| 232 | 15795 | 50 |  |  |  | 35796 | if( $bandwidth_mode eq 'default' ) { | 
| 233 | 15795 | 50 |  |  |  | 38210 | if( !defined( $w ) ) { | 
|  |  | 50 |  |  |  |  |  | 
| 234 | 0 |  |  |  |  | 0 | $w = $self->default_bandwidth(); | 
| 235 |  |  |  |  |  |  | } elsif( !_notNegative( $w ) ) { | 
| 236 | 0 |  |  |  |  | 0 | croak "Bandwidth ,$w, must be strictly positive number."; | 
| 237 |  |  |  |  |  |  | } | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | # } elsif( $bandwidth_mode eq 'optimal' ) { | 
| 240 |  |  |  |  |  |  | #   $w = $self->optimal_bandwidth(); | 
| 241 |  |  |  |  |  |  | } | 
| 242 |  |  |  |  |  |  |  | 
| 243 |  |  |  |  |  |  | # ... now use bandwidth from above to find result | 
| 244 | 15795 |  |  |  |  | 17421 | my $y = 0; | 
| 245 | 15795 |  |  |  |  | 15095 | for my $q ( @{ $self->{data} } ) { | 
|  | 15795 |  |  |  |  | 31954 |  | 
| 246 | 45171 |  |  |  |  | 98838 | $y += $q->{cnt} * $self->{$mode}( $x, $q->{pos}, $w ); | 
| 247 |  |  |  |  |  |  | } | 
| 248 |  |  |  |  |  |  |  | 
| 249 | 15795 |  |  |  |  | 68327 | return $y/$count; | 
| 250 |  |  |  |  |  |  | } | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | # ================================================================= | 
| 253 |  |  |  |  |  |  | # Classical Histograms | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | sub histogram { | 
| 256 | 2 |  |  | 2 | 1 | 12 | my ( $self, $bins ) = @_; | 
| 257 |  |  |  |  |  |  |  | 
| 258 | 2 | 50 | 33 |  |  | 8 | unless( _isPositive( $bins ) && $bins==int($bins) ) { | 
| 259 | 0 |  |  |  |  | 0 | croak "Number of bins must be strictly positive integer."; | 
| 260 |  |  |  |  |  |  | } | 
| 261 |  |  |  |  |  |  |  | 
| 262 | 2 | 100 |  |  |  | 7 | if( $self->count() == 0 ) { return []; } | 
|  | 1 |  |  |  |  | 7 |  | 
| 263 |  |  |  |  |  |  |  | 
| 264 | 1 |  |  |  |  | 5 | my ( $min, $max ) = $self->range(); | 
| 265 |  |  |  |  |  |  |  | 
| 266 | 1 | 50 |  |  |  | 4 | if( $bins == 1 ) { | 
| 267 | 0 |  |  |  |  | 0 | return [ { pos => 0.5*($max-$min), cnt => $self->count() } ]; | 
| 268 |  |  |  |  |  |  | } | 
| 269 |  |  |  |  |  |  |  | 
| 270 | 1 |  |  |  |  | 4 | my $w = ($max - $min)/($bins - 1); | 
| 271 |  |  |  |  |  |  |  | 
| 272 | 1 |  |  |  |  | 3 | my @histo = (); | 
| 273 | 1 |  |  |  |  | 3 | for my $k ( 0..$bins-1 ) { | 
| 274 | 9 |  |  |  |  | 28 | push @histo, { pos => $min + $k*$w, cnt => 0 }; | 
| 275 |  |  |  |  |  |  | } | 
| 276 |  |  |  |  |  |  |  | 
| 277 | 1 |  |  |  |  | 3 | for my $p ( @{ $self->{data} } ) { | 
|  | 1 |  |  |  |  | 3 |  | 
| 278 | 6 |  |  |  |  | 14 | my $i = int( ($p->{pos} - ( $min - 0.5*$w ) )/$w ); | 
| 279 | 6 |  |  |  |  | 12 | my ( $lo, $hi ) = ( $min + ($i-0.5)*$w, $min + ($i+0.5)*$w ); | 
| 280 |  |  |  |  |  |  |  | 
| 281 | 6 |  |  |  |  | 11 | my $x = $p->{pos}; | 
| 282 |  |  |  |  |  |  |  | 
| 283 | 6 | 50 | 33 |  |  | 30 | if(     $x < $lo )             { $i -= 1; } | 
|  | 0 | 50 |  |  |  | 0 |  | 
|  |  | 0 |  |  |  |  |  | 
| 284 | 6 |  |  |  |  | 8 | elsif( $lo <= $x && $x < $hi ) { $i = $i; } | 
| 285 | 0 |  |  |  |  | 0 | elsif( $hi <= $x )             { $i += 1; } | 
| 286 |  |  |  |  |  |  |  | 
| 287 | 6 |  |  |  |  | 15 | $histo[ $i ]->{cnt} += $p->{cnt}; | 
| 288 |  |  |  |  |  |  | } | 
| 289 |  |  |  |  |  |  |  | 
| 290 | 1 |  |  |  |  | 4 | return \@histo; | 
| 291 |  |  |  |  |  |  | } | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | sub distribution_function { | 
| 294 | 2 |  |  | 2 | 1 | 8 | my ( $self ) = @_; | 
| 295 |  |  |  |  |  |  |  | 
| 296 | 2 |  |  |  |  | 5 | my @sorted = sort { $a->{pos} <=> $b->{pos} } @{ $self->{data} }; | 
|  | 10 |  |  |  |  | 529 |  | 
|  | 2 |  |  |  |  | 17 |  | 
| 297 |  |  |  |  |  |  |  | 
| 298 | 2 |  |  |  |  | 5 | my @dist = (); | 
| 299 | 2 |  |  |  |  | 6 | my $cumul = 0; | 
| 300 | 2 |  |  |  |  | 10 | for my $p ( @sorted ) { | 
| 301 | 6 |  |  |  |  | 241 | $cumul += $p->{cnt}; | 
| 302 | 6 |  |  |  |  | 26 | push @dist, { pos => $p->{pos}, cnt => $cumul }; | 
| 303 |  |  |  |  |  |  | } | 
| 304 |  |  |  |  |  |  |  | 
| 305 | 2 |  |  |  |  | 29 | return \@dist; | 
| 306 |  |  |  |  |  |  | } | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  | # ================================================================= | 
| 309 |  |  |  |  |  |  | # Input validation | 
| 310 |  |  |  |  |  |  |  | 
| 311 |  |  |  |  |  |  | # In general: undef evaluates to invalid input! | 
| 312 |  |  |  |  |  |  |  | 
| 313 |  |  |  |  |  |  | sub _isNumber { | 
| 314 | 31616 |  |  | 31616 |  | 33308 | my ( $in ) = @_; | 
| 315 | 31616 | 50 | 33 |  |  | 213859 | if( defined( $in ) && | 
| 316 | 31616 |  |  |  |  | 103730 | $in =~ /^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$/ ) { return 1; } | 
| 317 | 0 |  |  |  |  | 0 | return 0; | 
| 318 |  |  |  |  |  |  | } | 
| 319 |  |  |  |  |  |  |  | 
| 320 |  |  |  |  |  |  | sub _isPositive { | 
| 321 | 2 |  |  | 2 |  | 4 | my ( $in ) = @_; | 
| 322 | 2 | 50 | 33 |  |  | 8 | if( _isNumber( $in ) && $in > 0 ) { return 1; } | 
|  | 2 |  |  |  |  | 15 |  | 
| 323 | 0 |  |  |  |  | 0 | return 0; | 
| 324 |  |  |  |  |  |  | } | 
| 325 |  |  |  |  |  |  |  | 
| 326 |  |  |  |  |  |  | sub _notNegative { | 
| 327 | 15801 |  |  | 15801 |  | 15833 | my ( $in ) = @_; | 
| 328 | 15801 | 50 | 33 |  |  | 24310 | if( _isNumber( $in ) && $in >= 0 ) { return 1; } | 
|  | 15801 |  |  |  |  | 37519 |  | 
| 329 | 0 |  |  |  |  | 0 | return 0; | 
| 330 |  |  |  |  |  |  | } | 
| 331 |  |  |  |  |  |  |  | 
| 332 |  |  |  |  |  |  | # ================================================================= | 
| 333 |  |  |  |  |  |  | # Optimal Bandwidth | 
| 334 |  |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  | # Development history: | 
| 336 |  |  |  |  |  |  | # | 
| 337 |  |  |  |  |  |  | # Equation solver: | 
| 338 |  |  |  |  |  |  | # 1) Straight iteration | 
| 339 |  |  |  |  |  |  | # 2) Newton's method | 
| 340 |  |  |  |  |  |  | # 3) Sekant method | 
| 341 |  |  |  |  |  |  | # 4) Bisection | 
| 342 |  |  |  |  |  |  | # | 
| 343 |  |  |  |  |  |  | # Integration: | 
| 344 |  |  |  |  |  |  | # 1) Numerical differentiation | 
| 345 |  |  |  |  |  |  | # 2) Numerical differentiation, with equal step width as integration | 
| 346 |  |  |  |  |  |  | # 3) Symbolic differentiation | 
| 347 |  |  |  |  |  |  | # 4) Adaptive step size integration | 
| 348 |  |  |  |  |  |  | # 5) Romberg integration (not implemented) | 
| 349 |  |  |  |  |  |  |  | 
| 350 |  |  |  |  |  |  | # This routine solves the equation encoded in _optimal_bandwidth_equation | 
| 351 |  |  |  |  |  |  | # using the secant method. | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | sub optimal_bandwidth { | 
| 354 | 2 |  |  | 2 | 1 | 2 | my $self = shift; | 
| 355 | 2 | 50 |  |  |  | 6 | my $n    = @_ ? shift : 25; | 
| 356 | 2 | 50 |  |  |  | 5 | my $eps  = @_ ? shift : 1e-3; | 
| 357 |  |  |  |  |  |  |  | 
| 358 | 2 | 50 |  |  |  | 7 | unless( $self->{optimizable} ) { | 
| 359 | 0 |  |  |  |  | 0 | croak "Bandwidth Optimization not available for this type of kernel."; | 
| 360 |  |  |  |  |  |  | } | 
| 361 |  |  |  |  |  |  |  | 
| 362 | 2 | 100 |  |  |  | 6 | if( $self->{sum_cnt} == 0 ) { return undef; } | 
|  | 1 |  |  |  |  | 4 |  | 
| 363 |  |  |  |  |  |  |  | 
| 364 | 1 |  |  |  |  | 3 | my $x0 = $self->default_bandwidth(); | 
| 365 | 1 |  |  |  |  | 4 | my $y0 = $self->_optimal_bandwidth_equation( $x0 ); | 
| 366 |  |  |  |  |  |  |  | 
| 367 | 1 |  |  |  |  | 3 | my $x = 0.8*$x0; | 
| 368 |  |  |  |  |  |  | # my $x = $x0 * ( 1 - 1e-6 ); | 
| 369 | 1 |  |  |  |  | 4 | my $y = $self->_optimal_bandwidth_equation( $x ); | 
| 370 |  |  |  |  |  |  |  | 
| 371 | 1 |  |  |  |  | 3 | my $dx = 0; | 
| 372 |  |  |  |  |  |  |  | 
| 373 | 1 |  |  |  |  | 3 | my $i = 0; | 
| 374 | 1 |  |  |  |  | 5 | while( $i++ < $n ) { | 
| 375 | 25 |  |  |  |  | 44 | $x -= $y*($x0-$x)/($y0-$y); | 
| 376 | 25 |  |  |  |  | 45 | $y = $self->_optimal_bandwidth_equation( $x ); | 
| 377 |  |  |  |  |  |  |  | 
| 378 | 25 | 50 |  |  |  | 97 | if( abs($y) < $eps*$y0 ) { last } | 
|  | 0 |  |  |  |  | 0 |  | 
| 379 |  |  |  |  |  |  | } | 
| 380 |  |  |  |  |  |  |  | 
| 381 | 1 | 50 |  |  |  | 6 | if( wantarray ) { return ( $x, $i ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 382 | 1 |  |  |  |  | 9 | return $x; | 
| 383 |  |  |  |  |  |  | } | 
| 384 |  |  |  |  |  |  |  | 
| 385 |  |  |  |  |  |  | # This routine uses the secant method. | 
| 386 |  |  |  |  |  |  |  | 
| 387 |  |  |  |  |  |  | sub optimal_bandwidth_safe { | 
| 388 | 0 |  |  | 0 | 1 | 0 | my $self = shift; | 
| 389 | 0 | 0 |  |  |  | 0 | my $x0 = @_ ? shift : $self->default_bandwidth() / $self->count(); | 
| 390 | 0 | 0 |  |  |  | 0 | my $x1 = @_ ? shift : 2*$self->default_bandwidth(); | 
| 391 | 0 | 0 |  |  |  | 0 | my $eps  = @_ ? shift : 1e-3; | 
| 392 |  |  |  |  |  |  |  | 
| 393 | 0 | 0 |  |  |  | 0 | unless( $self->{optimizable} ) { | 
| 394 | 0 |  |  |  |  | 0 | croak "Bandwidth Optimization not available for this type of kernel."; | 
| 395 |  |  |  |  |  |  | } | 
| 396 |  |  |  |  |  |  |  | 
| 397 | 0 | 0 |  |  |  | 0 | if( $self->{sum_cnt} == 0 ) { return undef; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 398 |  |  |  |  |  |  |  | 
| 399 | 0 |  |  |  |  | 0 | my $y0 = $self->_optimal_bandwidth_equation( $x0 ); | 
| 400 | 0 |  |  |  |  | 0 | my $y1 = $self->_optimal_bandwidth_equation( $x1 ); | 
| 401 |  |  |  |  |  |  |  | 
| 402 | 0 | 0 |  |  |  | 0 | unless( $y0 * $y1 < 0 ) { | 
| 403 | 0 |  |  |  |  | 0 | croak "Interval [ f(x0=$x0)=$y0 : f(x1=$x1)=$y1 ] does not bracket root."; | 
| 404 |  |  |  |  |  |  | } | 
| 405 |  |  |  |  |  |  |  | 
| 406 | 0 |  |  |  |  | 0 | my ( $x, $y, $i ) = ( 0, 0, 0 ); | 
| 407 | 0 |  |  |  |  | 0 | while( abs( $x0 - $x1 ) > $eps*$x1 ) { | 
| 408 | 0 |  |  |  |  | 0 | $i += 1; | 
| 409 |  |  |  |  |  |  |  | 
| 410 | 0 |  |  |  |  | 0 | $x = ( $x0 + $x1 )/2; | 
| 411 | 0 |  |  |  |  | 0 | $y = $self->_optimal_bandwidth_equation( $x ); | 
| 412 |  |  |  |  |  |  |  | 
| 413 | 0 | 0 |  |  |  | 0 | if( abs( $y ) < $eps*$y0 ) { last } | 
|  | 0 |  |  |  |  | 0 |  | 
| 414 |  |  |  |  |  |  |  | 
| 415 | 0 | 0 |  |  |  | 0 | if( $y * $y0 < 0 ) { | 
| 416 | 0 |  |  |  |  | 0 | ( $x1, $y1 ) = ( $x, $y ); | 
| 417 |  |  |  |  |  |  | } else { | 
| 418 | 0 |  |  |  |  | 0 | ( $x0, $y0 ) = ( $x, $y ); | 
| 419 |  |  |  |  |  |  | } | 
| 420 |  |  |  |  |  |  | } | 
| 421 |  |  |  |  |  |  |  | 
| 422 | 0 | 0 |  |  |  | 0 | if( wantarray ) { return ( $x, $i ); } | 
|  | 0 |  |  |  |  | 0 |  | 
| 423 | 0 |  |  |  |  | 0 | return $x; | 
| 424 |  |  |  |  |  |  | } | 
| 425 |  |  |  |  |  |  |  | 
| 426 |  |  |  |  |  |  | # This routine encodes the self-consistent equation that is fulfilled | 
| 427 |  |  |  |  |  |  | # by the optimal bandwidth. Notation according to Bowman & Azzalini. | 
| 428 |  |  |  |  |  |  |  | 
| 429 |  |  |  |  |  |  | sub _optimal_bandwidth_equation { | 
| 430 | 27 |  |  | 27 |  | 38 | my ( $self, $w ) = @_; | 
| 431 |  |  |  |  |  |  |  | 
| 432 | 27 |  |  |  |  | 29 | my $alpha = 1.0/(2.0*sqrt( 3.14159265358979323846 ) ); | 
| 433 | 27 |  |  |  |  | 28 | my $sigma = 1.0; | 
| 434 | 27 |  |  |  |  | 42 | my $n = $self->count(); | 
| 435 | 27 |  |  |  |  | 55 | my $q = $self->_stiffness_integral( $w ); | 
| 436 |  |  |  |  |  |  |  | 
| 437 | 27 |  |  |  |  | 137 | return $w - ( ($n*$q*$sigma**4)/$alpha )**(-1.0/5.0); | 
| 438 |  |  |  |  |  |  | } | 
| 439 |  |  |  |  |  |  |  | 
| 440 |  |  |  |  |  |  | # This routine calculates the integral over the square of the curvature | 
| 441 |  |  |  |  |  |  | # (it: Int (f'')**2 ) using the trapezoidal rule. The routine decreases | 
| 442 |  |  |  |  |  |  | # the step size by half until the relative error in the last step is less | 
| 443 |  |  |  |  |  |  | # than epsilon. | 
| 444 |  |  |  |  |  |  |  | 
| 445 |  |  |  |  |  |  | sub _stiffness_integral { | 
| 446 | 27 |  |  | 27 |  | 29 | my ( $self, $w ) = @_; | 
| 447 |  |  |  |  |  |  |  | 
| 448 | 27 |  |  |  |  | 54 | my $eps = 1e-4; | 
| 449 |  |  |  |  |  |  |  | 
| 450 | 27 |  |  |  |  | 64 | my ( $mn, $mx ) = $self->extended_range(); | 
| 451 | 27 |  |  |  |  | 33 | my $n = 1; | 
| 452 | 27 |  |  |  |  | 36 | my $dx = ($mx-$mn)/$n; | 
| 453 |  |  |  |  |  |  |  | 
| 454 | 27 |  |  |  |  | 49 | my $yy = 0.5*($self->_curvature($mn,$w)**2+$self->_curvature($mx,$w)**2)*$dx; | 
| 455 |  |  |  |  |  |  |  | 
| 456 |  |  |  |  |  |  | # The trapezoidal rule guarantees a relative error of better than eps | 
| 457 |  |  |  |  |  |  | # for some number of steps less than maxn. | 
| 458 | 27 |  |  |  |  | 51 | my $maxn = ($mx-$mn)/sqrt($eps); | 
| 459 |  |  |  |  |  |  |  | 
| 460 |  |  |  |  |  |  | # This is not ideal, but I want to cap the total computation spent here: | 
| 461 | 27 | 50 |  |  |  | 47 | $maxn = ( $maxn > 2048 ? 2048 : $maxn ); | 
| 462 |  |  |  |  |  |  |  | 
| 463 | 27 |  |  |  |  | 71 | for( my $n=2; $n<=$maxn; $n*=2 ) { | 
| 464 | 162 |  |  |  |  | 171 | $dx /= 2.0; | 
| 465 |  |  |  |  |  |  |  | 
| 466 | 162 |  |  |  |  | 153 | my $y = 0; | 
| 467 | 162 |  |  |  |  | 333 | for( my $i=1; $i<=$n-1; $i+=2 ) { | 
| 468 | 1733 |  |  |  |  | 3730 | $y += $self->_curvature( $mn + $i*$dx, $w )**2; | 
| 469 |  |  |  |  |  |  | } | 
| 470 | 162 |  |  |  |  | 198 | $yy = 0.5*$yy + $y*$dx; | 
| 471 |  |  |  |  |  |  |  | 
| 472 |  |  |  |  |  |  | # Make at least 8 steps, then evaluate the relative change between steps | 
| 473 | 162 | 100 | 100 |  |  | 689 | if( $n > 8 && abs($y*$dx-0.5*$yy) < $eps*$yy ) { last } | 
|  | 27 |  |  |  |  | 42 |  | 
| 474 |  |  |  |  |  |  | } | 
| 475 |  |  |  |  |  |  |  | 
| 476 | 27 |  |  |  |  | 51 | return $yy; | 
| 477 |  |  |  |  |  |  | } | 
| 478 |  |  |  |  |  |  |  | 
| 479 |  |  |  |  |  |  | # ================================================================= | 
| 480 |  |  |  |  |  |  | # Kernels | 
| 481 |  |  |  |  |  |  |  | 
| 482 |  |  |  |  |  |  | sub _gauss_pdf { | 
| 483 | 17157 |  |  | 17157 |  | 18811 | my ( $x, $m, $s ) = @_; | 
| 484 | 17157 |  |  |  |  | 18890 | my $z = ($x - $m)/$s; | 
| 485 | 17157 |  |  |  |  | 47285 | return exp(-0.5*$z*$z)/( $s*sqrt( 2.0*3.14159265358979323846 ) ); | 
| 486 |  |  |  |  |  |  | } | 
| 487 |  |  |  |  |  |  |  | 
| 488 |  |  |  |  |  |  | # Abramowitz & Stegun, 26.2.17 | 
| 489 |  |  |  |  |  |  | sub _gauss_cdf { | 
| 490 | 4 |  |  | 4 |  | 7 | my ( $x, $m, $s ) = @_; | 
| 491 |  |  |  |  |  |  |  | 
| 492 | 4 |  |  |  |  | 6 | my $z = abs( $x - $m)/$s; | 
| 493 | 4 |  |  |  |  | 7 | my $t = 1.0/(1.0 + 0.2316419*$z); | 
| 494 | 4 |  |  |  |  | 10 | my $y = $t*( 0.319381530 | 
| 495 |  |  |  |  |  |  | + $t*( -0.356563782 | 
| 496 |  |  |  |  |  |  | + $t*( 1.781477937 | 
| 497 |  |  |  |  |  |  | + $t*( -1.821255978 + $t*1.330274429 ) ) ) ); | 
| 498 |  |  |  |  |  |  |  | 
| 499 | 4 | 50 |  |  |  | 7 | if( $x >= $m ) { | 
| 500 | 4 |  |  |  |  | 9 | return 1.0 - _gauss_pdf( $x, $m, $s )*$y*$s; | 
| 501 |  |  |  |  |  |  | } else { | 
| 502 | 0 |  |  |  |  | 0 | return _gauss_pdf( $x, $m, $s )*$y*$s; | 
| 503 |  |  |  |  |  |  | } | 
| 504 |  |  |  |  |  |  | } | 
| 505 |  |  |  |  |  |  |  | 
| 506 |  |  |  |  |  |  | sub _gauss_curvature { | 
| 507 | 7148 |  |  | 7148 |  | 8535 | my ( $x, $m, $s ) = @_; | 
| 508 | 7148 |  |  |  |  | 8936 | my $z = ($x - $m)/$s; | 
| 509 | 7148 |  |  |  |  | 12276 | return ($z**2 - 1.0)*_gauss_pdf( $x, $m, $s )/$s**2; | 
| 510 |  |  |  |  |  |  | } | 
| 511 |  |  |  |  |  |  |  | 
| 512 |  |  |  |  |  |  | sub _box_pdf { | 
| 513 | 10005 |  |  | 10005 |  | 14258 | my ( $x, $m, $s ) = @_; | 
| 514 | 10005 | 100 | 100 |  |  | 39502 | if( $x < $m-0.5*$s || $x > $m+0.5*$s ) { return 0.0; } | 
|  | 9505 |  |  |  |  | 27942 |  | 
| 515 | 500 |  |  |  |  | 1498 | return 1.0/$s; | 
| 516 |  |  |  |  |  |  | } | 
| 517 |  |  |  |  |  |  |  | 
| 518 |  |  |  |  |  |  | sub _box_cdf { | 
| 519 | 4 |  |  | 4 |  | 6 | my ( $x, $m, $s ) = @_; | 
| 520 | 4 | 50 |  |  |  | 13 | if( $x < $m-0.5*$s ) { return 0.0; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 521 | 4 | 50 |  |  |  | 11 | if( $x > $m+0.5*$s ) { return 1.0; } | 
|  | 4 |  |  |  |  | 21 |  | 
| 522 | 0 |  |  |  |  | 0 | return ( $x-$m )/$s + 0.5; | 
| 523 |  |  |  |  |  |  | } | 
| 524 |  |  |  |  |  |  |  | 
| 525 |  |  |  |  |  |  | sub _box_curvature { | 
| 526 | 0 |  |  | 0 |  | 0 | return 0; | 
| 527 |  |  |  |  |  |  | } | 
| 528 |  |  |  |  |  |  |  | 
| 529 |  |  |  |  |  |  | sub _epanechnikov_pdf { | 
| 530 | 18001 |  |  | 18001 |  | 18733 | my ( $x, $m, $s ) = @_; | 
| 531 | 18001 |  |  |  |  | 18983 | my $z = ($x-$m)/$s; | 
| 532 | 18001 | 100 |  |  |  | 30869 | if( abs($z) > 1 ) { return 0.0; } | 
|  | 16201 |  |  |  |  | 35040 |  | 
| 533 | 1800 |  |  |  |  | 4465 | return 0.75*(1-$z**2)/$s; | 
| 534 |  |  |  |  |  |  | } | 
| 535 |  |  |  |  |  |  |  | 
| 536 |  |  |  |  |  |  | sub _epanechnikov_cdf { | 
| 537 | 4 |  |  | 4 |  | 6 | my ( $x, $m, $s ) = @_; | 
| 538 | 4 |  |  |  |  | 7 | my $z = ($x-$m)/$s; | 
| 539 | 4 | 50 |  |  |  | 9 | if( $z < -1 ) { return 0.0; } | 
|  | 0 |  |  |  |  | 0 |  | 
| 540 | 4 | 50 |  |  |  | 18 | if( $z >  1 ) { return 1.0; } | 
|  | 4 |  |  |  |  | 12 |  | 
| 541 | 0 |  |  |  |  |  | return 0.25*(2.0 + 3.0*$z - $z**3 ); | 
| 542 |  |  |  |  |  |  | } | 
| 543 |  |  |  |  |  |  |  | 
| 544 |  |  |  |  |  |  | sub _epanechnikov_curvature { | 
| 545 | 0 |  |  | 0 |  |  | my ( $x, $m, $s ) = @_; | 
| 546 | 0 |  |  |  |  |  | my $z = ($x-$m)/$s; | 
| 547 | 0 | 0 |  |  |  |  | if( abs($z) > 1 ) { return 0; } | 
|  | 0 |  |  |  |  |  |  | 
| 548 | 0 |  |  |  |  |  | return -1.5/$s**3; | 
| 549 |  |  |  |  |  |  | } | 
| 550 |  |  |  |  |  |  |  | 
| 551 |  |  |  |  |  |  | 1; | 
| 552 |  |  |  |  |  |  |  | 
| 553 |  |  |  |  |  |  | __END__ |