line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
25
|
|
|
25
|
|
1456323
|
use 5.006; |
|
25
|
|
|
|
|
337
|
|
2
|
25
|
|
|
25
|
|
146
|
use strict; |
|
25
|
|
|
|
|
51
|
|
|
25
|
|
|
|
|
592
|
|
3
|
25
|
|
|
25
|
|
122
|
use warnings; |
|
25
|
|
|
|
|
57
|
|
|
25
|
|
|
|
|
1766
|
|
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
package Statistics::Descriptive::LogScale; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale - Memory-efficient approximate univariate |
10
|
|
|
|
|
|
|
descriptive statistics class. |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 VERSION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Version 0.10 |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=cut |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = 0.11; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 SYNOPSIS |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head2 Basic usage |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
The basic usage is roughly the same as that of L. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
27
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
while(<>) { |
30
|
|
|
|
|
|
|
chomp; |
31
|
|
|
|
|
|
|
$stat->add_data($_); |
32
|
|
|
|
|
|
|
}; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# This can also be done in O(1) memory, precisely |
35
|
|
|
|
|
|
|
printf "Mean: %f +- %f\n", $stat->mean, $stat->standard_deviation; |
36
|
|
|
|
|
|
|
# This requires storing actual data, or approximating |
37
|
|
|
|
|
|
|
printf "25%% : %f\n", $stat->percentile(25); |
38
|
|
|
|
|
|
|
printf "Median: %f\n", $stat->median; |
39
|
|
|
|
|
|
|
printf "75%% : %f\n", $stat->percentile(75); |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 Save/load |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
This is not present in L. |
44
|
|
|
|
|
|
|
The save/load interface is designed compatible with JSON::XS. |
45
|
|
|
|
|
|
|
However, any other serializer can be used. |
46
|
|
|
|
|
|
|
The C method is I to return unblessed hashref |
47
|
|
|
|
|
|
|
with enough information to restore the original object. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
50
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# ..... much later |
53
|
|
|
|
|
|
|
# Save |
54
|
|
|
|
|
|
|
print $fd encoder_of_choice( $stat->TO_JSON ) |
55
|
|
|
|
|
|
|
or die "Failed to save: $!"; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# ..... and even later |
58
|
|
|
|
|
|
|
# Load |
59
|
|
|
|
|
|
|
my $plain_hash = decoder_of_choice( $raw_data ); |
60
|
|
|
|
|
|
|
my $copy_of_stat = Statistics::Descriptive::LogScale->new( %$plain_hash ); |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# Import into existing LogScale instance |
63
|
|
|
|
|
|
|
my $plain_hash = decoder_of_choice( $more_raw_data ); |
64
|
|
|
|
|
|
|
$copy_of_stat->add_data_hash( $plain_hash->{data} ); |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 Histograms |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Both L and L |
69
|
|
|
|
|
|
|
offer C method for querying data point counts. |
70
|
|
|
|
|
|
|
However, there's also C method for making pretty pictures. |
71
|
|
|
|
|
|
|
Here's a simple text-based histogram. |
72
|
|
|
|
|
|
|
A proper GD example was too long to fit into this margin. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
use strict; |
75
|
|
|
|
|
|
|
use warnings; |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
use Statistics::Descriptive::LogScale; |
78
|
|
|
|
|
|
|
my $stat = Statistics::Descriptive::LogScale->new (); |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# collect/load data ... |
81
|
|
|
|
|
|
|
my $re_float = qr([-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?); |
82
|
|
|
|
|
|
|
while (<>) { |
83
|
|
|
|
|
|
|
$stat->add_data($_) for m/($re_float)/g; |
84
|
|
|
|
|
|
|
}; |
85
|
|
|
|
|
|
|
die "Empty set" |
86
|
|
|
|
|
|
|
unless $stat->count; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# get data in [ count, lower_bound, upper_bound ] format as arrayref |
89
|
|
|
|
|
|
|
my $hist = $stat->histogram( count => 20 ); |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# find maximum value to use as a scale factor |
92
|
|
|
|
|
|
|
my $scale = $hist->[0][0]; |
93
|
|
|
|
|
|
|
$scale < $_->[0] and $scale = $_->[0] for @$hist; |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
foreach (@$hist) { |
96
|
|
|
|
|
|
|
printf "%10f %s\n", $_->[1], '#' x ($_->[0] * 68 / $scale); |
97
|
|
|
|
|
|
|
}; |
98
|
|
|
|
|
|
|
printf "%10f\n", $hist->[-1][2]; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head1 DESCRIPTION |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
This module aims at providing some advanced statistical functions without |
103
|
|
|
|
|
|
|
storing all data in memory, at the cost of certain (predictable) precision loss. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
Data is represented by a set of bins that only store counts of fitting |
106
|
|
|
|
|
|
|
data points. |
107
|
|
|
|
|
|
|
Most bins are logarithmic, i.e. lower end / upper end ratio is constant. |
108
|
|
|
|
|
|
|
However, around zero linear approximation may be user instead |
109
|
|
|
|
|
|
|
(see "linear_width" and "linear_thresh" parameters in new()). |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
All operations are then performed on the bins, introducing relative error |
112
|
|
|
|
|
|
|
which does not, however, exceed the bins' relative width ("base"). |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head1 METHODS |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=cut |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
######################################################################## |
119
|
|
|
|
|
|
|
# == HOW IT WORKS == |
120
|
|
|
|
|
|
|
# Buckets are stored in a hash: { $value => $count, ... } |
121
|
|
|
|
|
|
|
# {base} is bin width, {logbase} == log {base} (cache) |
122
|
|
|
|
|
|
|
# {linear_thresh} is where we switch to equal bin approximation |
123
|
|
|
|
|
|
|
# {linear_width} is width of bin around zero (==linear_thresh if not given) |
124
|
|
|
|
|
|
|
# {floor} is lower bound of bin whose center is 1. {logfloor} = log {floor} |
125
|
|
|
|
|
|
|
# Nearly all meaningful subs have to scan all the bins, which is bad, |
126
|
|
|
|
|
|
|
# but anyway better than scanning full sample. |
127
|
|
|
|
|
|
|
|
128
|
25
|
|
|
25
|
|
175
|
use Carp; |
|
25
|
|
|
|
|
57
|
|
|
25
|
|
|
|
|
1538
|
|
129
|
25
|
|
|
25
|
|
8320
|
use POSIX qw(floor ceil); |
|
25
|
|
|
|
|
141281
|
|
|
25
|
|
|
|
|
170
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# Fields are NOT used internally for now, so this is just a declaration |
132
|
25
|
|
|
|
|
112
|
use fields qw( |
133
|
|
|
|
|
|
|
data count |
134
|
|
|
|
|
|
|
linear_width base logbase floor linear_thresh only_linear logfloor |
135
|
|
|
|
|
|
|
cache |
136
|
25
|
|
|
25
|
|
42260
|
); |
|
25
|
|
|
|
|
36235
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# Some internal constants |
139
|
|
|
|
|
|
|
# This is for infinite portability^W^W portable infinity |
140
|
|
|
|
|
|
|
my $INF = 9**9**9; |
141
|
|
|
|
|
|
|
my $re_num = qr/(?:[-+]?(?:\d+\.?\d*|\.\d+)(?:[Ee][-+]?\d+)?)/; |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head2 new( %options ) |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
%options may include: |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=over |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=item * base - ratio of adjacent bins. Default is 10^(1/232), which gives |
150
|
|
|
|
|
|
|
1% precision and exact decimal powers. |
151
|
|
|
|
|
|
|
This value represents acceptable relative error in analysis results. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
B Actual value may be slightly less than requested one. |
154
|
|
|
|
|
|
|
This is done so to avoid troubles with future rounding in (de)serialization. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item * linear_width - width of linear bins around zero. |
157
|
|
|
|
|
|
|
This value represents precision of incoming data. |
158
|
|
|
|
|
|
|
Default is zero, i.e. we assume that the measurement is precise. |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
B Actual value may be less (by no more than a factor of C) |
161
|
|
|
|
|
|
|
so that borders of linear and logarithmic bins fit nicely. |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=item * linear_thresh - where to switch to linear approximation. |
164
|
|
|
|
|
|
|
If only one of C and C is given, |
165
|
|
|
|
|
|
|
the other will be calculated. |
166
|
|
|
|
|
|
|
However, user may want to specify both in some cases. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
B Actual value may be less (by no more than a factor of C) |
169
|
|
|
|
|
|
|
so that borders of linear and logarithmic bins fit nicely. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=item * only_linear = 1 (B) - |
172
|
|
|
|
|
|
|
throw away log approximation and become a discrete statistics |
173
|
|
|
|
|
|
|
class with fixed precision. |
174
|
|
|
|
|
|
|
C must be given in this case. |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
B This obviously kills memory efficiency, unless one knows beforehand |
177
|
|
|
|
|
|
|
that all values come from a finite pool. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=item * data - hashref with C<{ value => weight }> for initializing data. |
180
|
|
|
|
|
|
|
Used for cloning. |
181
|
|
|
|
|
|
|
See C. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=item * zero_thresh - absolute value threshold below which everything is |
184
|
|
|
|
|
|
|
considered zero. |
185
|
|
|
|
|
|
|
DEPRECATED, C and C override this if given. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
=back |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
=cut |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
my @new_keys = qw( base linear_width linear_thresh data zero_thresh only_linear ); |
192
|
|
|
|
|
|
|
# TODO Throw if extra options given? |
193
|
|
|
|
|
|
|
# TODO Check args better |
194
|
|
|
|
|
|
|
sub new { |
195
|
46
|
|
|
46
|
1
|
23519
|
my $class = shift; |
196
|
46
|
|
|
|
|
196
|
my %opt = @_; |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# First, check for only_linear option |
199
|
46
|
100
|
|
|
|
179
|
if ($opt{only_linear}) { |
200
|
|
|
|
|
|
|
$opt{linear_width} |
201
|
4
|
50
|
|
|
|
15
|
or croak "only_linear option given, but no linear width"; |
202
|
4
|
|
|
|
|
12
|
$opt{only_linear} = $opt{linear_width}; |
203
|
4
|
|
|
|
|
24
|
delete $opt{$_} for qw(linear_width base linear_thresh zero_thresh); |
204
|
|
|
|
|
|
|
}; |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# base for logarithmic bins, sane default: +-1%, exact decimal powers |
207
|
|
|
|
|
|
|
# UGLY HACK number->string->number to avoid |
208
|
|
|
|
|
|
|
# future serialization inconsistencies |
209
|
46
|
|
100
|
|
|
236
|
$opt{base} ||= 10**(1/232); |
210
|
46
|
|
|
|
|
392
|
$opt{base} = 0 + "$opt{base}"; |
211
|
46
|
50
|
|
|
|
184
|
$opt{base} > 1 or croak __PACKAGE__.": new(): base must be >1"; |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
# calculate where to switch to linear approximation |
214
|
|
|
|
|
|
|
# the condition is: linear bin( thresh ) ~~ log bin( thresh ) |
215
|
|
|
|
|
|
|
# i.e. thresh * base - thresh ~~ absolute error * 2 |
216
|
|
|
|
|
|
|
# i.e. thresh ~~ absolute_error * 2 / (base - 1) |
217
|
|
|
|
|
|
|
# also support legacy API (zero_thresh) |
218
|
46
|
100
|
|
|
|
164
|
if (defined $opt{linear_thresh} ) { |
219
|
13
|
|
100
|
|
|
45
|
$opt{linear_width} ||= $opt{linear_thresh} * ($opt{base}-1); |
220
|
|
|
|
|
|
|
} else { |
221
|
33
|
|
|
|
|
97
|
$opt{linear_thresh} = $opt{zero_thresh}; |
222
|
|
|
|
|
|
|
}; |
223
|
|
|
|
|
|
|
$opt{linear_thresh} = abs($opt{linear_width}) / ($opt{base} - 1) |
224
|
46
|
100
|
100
|
|
|
212
|
if $opt{linear_width} and !$opt{linear_thresh}; |
225
|
46
|
|
100
|
|
|
244
|
$opt{linear_thresh} ||= 0; |
226
|
46
|
50
|
|
|
|
141
|
$opt{linear_thresh} >= 0 |
227
|
|
|
|
|
|
|
or croak __PACKAGE__.": new(): linear_thresh must be >= 0"; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Can't use fields::new anymore |
230
|
|
|
|
|
|
|
# due to JSON::XS incompatibility with restricted hashes |
231
|
46
|
|
|
|
|
130
|
my $self = bless {}, $class; |
232
|
|
|
|
|
|
|
|
233
|
46
|
|
|
|
|
191
|
$self->{base} = $opt{base}; |
234
|
|
|
|
|
|
|
# cache values to ease calculations |
235
|
|
|
|
|
|
|
# floor = (lower end of bin) / (center of bin) |
236
|
46
|
|
|
|
|
152
|
$self->{floor} = 2/(1+$opt{base}); |
237
|
46
|
|
|
|
|
177
|
$self->{logbase} = log $opt{base}; |
238
|
46
|
|
|
|
|
125
|
$self->{logfloor} = log $self->{floor}; |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
# bootstrap linear_thresh - make it fit bin edge |
241
|
46
|
|
|
|
|
116
|
$self->{linear_width} = $self->{linear_thresh} = 0; |
242
|
46
|
|
|
|
|
187
|
$self->{linear_thresh} = $self->_lower( $opt{linear_thresh} ); |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# divide anything below linear_thresh into odd number of bins |
245
|
|
|
|
|
|
|
# not exceeding requested linear_width |
246
|
46
|
100
|
|
|
|
184
|
if ($self->{linear_thresh}) { |
247
|
15
|
|
66
|
|
|
53
|
my $linear_width = $opt{linear_width} || 2 * $self->{linear_thresh}; |
248
|
15
|
|
|
|
|
57
|
my $n_linear = ceil(2 * $self->{linear_thresh} / abs($linear_width)); |
249
|
15
|
100
|
|
|
|
48
|
$n_linear++ unless $n_linear % 2; |
250
|
15
|
|
|
|
|
36
|
$self->{linear_width} = (2 * $self->{linear_thresh} / $n_linear); |
251
|
|
|
|
|
|
|
}; |
252
|
|
|
|
|
|
|
|
253
|
46
|
100
|
|
|
|
153
|
if ($opt{only_linear}) { |
254
|
4
|
|
|
|
|
10
|
$self->{linear_width} = $opt{only_linear}; |
255
|
4
|
|
|
|
|
7
|
$self->{linear_thresh} = $INF; |
256
|
4
|
|
|
|
|
7
|
$self->{only_linear} = 1; |
257
|
|
|
|
|
|
|
}; |
258
|
|
|
|
|
|
|
|
259
|
46
|
|
|
|
|
178
|
$self->clear; |
260
|
46
|
100
|
|
|
|
130
|
if ($opt{data}) { |
261
|
11
|
|
|
|
|
35
|
$self->add_data_hash($opt{data}); |
262
|
|
|
|
|
|
|
}; |
263
|
46
|
|
|
|
|
212
|
return $self; |
264
|
|
|
|
|
|
|
}; |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=head1 General statistical methods |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
These methods are used to query the distribution properties. They generally |
269
|
|
|
|
|
|
|
follow the interface of L and co, |
270
|
|
|
|
|
|
|
with minor additions. |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
All methods return C on empty data set, except for |
273
|
|
|
|
|
|
|
C, C, C, C and C |
274
|
|
|
|
|
|
|
which all return 0. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
B This module caches whatever it calculates very agressively. |
277
|
|
|
|
|
|
|
Don't hesitate to use statistical functions (except for sum_of/mean_of) |
278
|
|
|
|
|
|
|
more than once. The cache is deleted upon data entry. |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=head2 clear |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
Destroy all stored data. |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=cut |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
sub clear { |
287
|
60
|
|
|
60
|
1
|
11725
|
my $self = shift; |
288
|
60
|
|
|
|
|
2405
|
$self->{data} = {}; |
289
|
60
|
|
|
|
|
181
|
$self->{count} = 0; |
290
|
60
|
|
|
|
|
134
|
delete $self->{cache}; |
291
|
60
|
|
|
|
|
126
|
return $self; |
292
|
|
|
|
|
|
|
}; |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head2 add_data( @data ) |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Add numbers to the data pool. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
Returns self, so that methods can be chained. |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
If incorrect data is given (i.e. non-numeric, undef), |
301
|
|
|
|
|
|
|
an exception is thrown and only partial data gets inserted. |
302
|
|
|
|
|
|
|
The state of object is guaranteed to remain consistent in such case. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
B Cache is reset, even if no data was actually inserted. |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
B It is possible to add infinite values to data pool. |
307
|
|
|
|
|
|
|
The module will try and calculate whatever can still be calculated. |
308
|
|
|
|
|
|
|
However, no portable way of serializing such values is done yet. |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=cut |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
sub add_data { |
313
|
51217
|
|
|
51217
|
1
|
150318
|
my $self = shift; |
314
|
|
|
|
|
|
|
|
315
|
51217
|
|
|
|
|
64994
|
delete $self->{cache}; |
316
|
51217
|
|
|
|
|
73172
|
foreach (@_) { |
317
|
90167
|
|
|
|
|
143648
|
$self->{data}{ $self->_round($_) }++; |
318
|
90164
|
|
|
|
|
156371
|
$self->{count}++; |
319
|
|
|
|
|
|
|
}; |
320
|
51214
|
|
|
|
|
85781
|
$self; |
321
|
|
|
|
|
|
|
}; |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=head2 count |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
Returns number of data points. |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=cut |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
sub count { |
330
|
69
|
|
|
69
|
1
|
9123
|
my $self = shift; |
331
|
69
|
|
|
|
|
291
|
return $self->{count}; |
332
|
|
|
|
|
|
|
}; |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=head2 min |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
=head2 max |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
Values of minimal and maximal bins. |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
NOTE: Due to rounding, some of the actual inserted values may fall outside |
341
|
|
|
|
|
|
|
of the min..max range. This may change in the future. |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=cut |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
sub min { |
346
|
|
|
|
|
|
|
my $self = shift; |
347
|
|
|
|
|
|
|
return $self->_sort->[0]; |
348
|
|
|
|
|
|
|
}; |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
sub max { |
351
|
|
|
|
|
|
|
my $self = shift; |
352
|
|
|
|
|
|
|
return $self->_sort->[-1]; |
353
|
|
|
|
|
|
|
}; |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=head2 sample_range |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
Return sample range of the dataset, i.e. max() - min(). |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=cut |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
sub sample_range { |
362
|
5
|
|
|
5
|
1
|
934
|
my $self = shift; |
363
|
5
|
100
|
|
|
|
29
|
return $self->count ? $self->max - $self->min : undef; |
364
|
|
|
|
|
|
|
}; |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=head2 sum |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
Return sum of all data points. |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
=cut |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
sub sum { |
373
|
|
|
|
|
|
|
my $self = shift; |
374
|
|
|
|
|
|
|
return $self->sum_of(sub { $_[0] }); |
375
|
|
|
|
|
|
|
}; |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=head2 sumsq |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
Return sum of squares of all datapoints. |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=cut |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub sumsq { |
384
|
|
|
|
|
|
|
my $self = shift; |
385
|
|
|
|
|
|
|
return $self->sum_of(sub { $_[0] * $_[0] }); |
386
|
|
|
|
|
|
|
}; |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=head2 mean |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
Return mean, or average value, i.e. sum()/count(). |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=cut |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
sub mean { |
395
|
|
|
|
|
|
|
my $self = shift; |
396
|
|
|
|
|
|
|
return $self->{count} ? $self->sum / $self->{count} : undef; |
397
|
|
|
|
|
|
|
}; |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
=head2 variance |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=head2 variance( $correction ) |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
Return data variance, i.e. E((x - E(x)) ** 2). |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Bessel's correction (division by n-1 instead of n) is used by default. |
406
|
|
|
|
|
|
|
This may be changed by specifying $correction explicitly. |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
B The binning strategy used here should also introduce variance bias. |
409
|
|
|
|
|
|
|
This is not yet accounted for. |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=cut |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
sub variance { |
414
|
|
|
|
|
|
|
my $self = shift; |
415
|
|
|
|
|
|
|
my $correction = shift; |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
# in fact we'll receive correction='' because of how cache works |
418
|
|
|
|
|
|
|
$correction = 1 unless defined $correction and length $correction; |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
return 0 if ($self->{count} < 1 + $correction); |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
my $var = $self->sumsq - $self->sum**2 / $self->{count}; |
423
|
|
|
|
|
|
|
return $var <= 0 ? 0 : $var / ( $self->{count} - $correction ); |
424
|
|
|
|
|
|
|
}; |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=head2 standard_deviation |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=head2 standard_deviation( $correction ) |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=head2 std_dev |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=head2 stdev |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Return standard deviation, i.e. square root of variance. |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Bessel's correction (division by n-1 instead of n) is used by default. |
437
|
|
|
|
|
|
|
This may be changed by specifying $correction explicitly. |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
B The binning strategy used here should also introduce variance bias. |
440
|
|
|
|
|
|
|
This is not yet accounted for. |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=cut |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
sub standard_deviation { |
445
|
|
|
|
|
|
|
my $self = shift; |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
return sqrt($self->variance(@_)); |
448
|
|
|
|
|
|
|
}; |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head2 cdf ($x) |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
Cumulative distribution function. Returns estimated probability of |
453
|
|
|
|
|
|
|
random data point from the sample being less than C<$x>. |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
As a special case, C accounts for I of zeroth bin count (if any). |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full, but appears in |
458
|
|
|
|
|
|
|
L. |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=head2 cdf ($x, $y) |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
Returns probability of a value being between C<$x> and C<$y> ($x <= $y). |
463
|
|
|
|
|
|
|
This is essentially C. |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=cut |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
sub cdf { |
468
|
4
|
|
|
4
|
1
|
8
|
my $self = shift; |
469
|
4
|
50
|
|
|
|
12
|
return unless $self->{count}; |
470
|
4
|
|
|
|
|
9
|
return $self->_count(@_) / $self->{count}; |
471
|
|
|
|
|
|
|
}; |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=head2 percentile( $n ) |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
Find $n-th percentile, i.e. a value below which lies $n % of the data. |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
0-th percentile is by definition -inf and is returned as undef |
478
|
|
|
|
|
|
|
(see Statistics::Descriptive). |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
$n is a real number, not necessarily integer. |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
=cut |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
sub percentile { |
485
|
48
|
|
|
48
|
1
|
4949
|
my $self = shift; |
486
|
48
|
|
|
|
|
99
|
my $x = shift; |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
# assert 0<=$x<=100 |
489
|
48
|
50
|
33
|
|
|
270
|
croak __PACKAGE__.": percentile() argument must be between 0 and 100" |
490
|
|
|
|
|
|
|
unless 0<= $x and $x <= 100; |
491
|
|
|
|
|
|
|
|
492
|
48
|
|
|
|
|
158
|
my $need = $x * $self->{count} / 100; |
493
|
48
|
100
|
|
|
|
191
|
return if $need < 1; |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
# dichotomize |
496
|
|
|
|
|
|
|
# $i is lowest value >= needed |
497
|
|
|
|
|
|
|
# $need doesnt exceed last bin! |
498
|
42
|
|
|
|
|
154
|
my $i = _bin_search_ge( $self->_probability, $need ); |
499
|
42
|
|
|
|
|
123
|
return $self->_sort->[ $i ]; |
500
|
|
|
|
|
|
|
}; |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=head2 quantile( 0..4 ) |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
From Statistics::Descriptive manual: |
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
0 => zero quartile (Q0) : minimal value |
507
|
|
|
|
|
|
|
1 => first quartile (Q1) : lower quartile = lowest cut off (25%) of data = 25th percentile |
508
|
|
|
|
|
|
|
2 => second quartile (Q2) : median = it cuts data set in half = 50th percentile |
509
|
|
|
|
|
|
|
3 => third quartile (Q3) : upper quartile = highest cut off (25%) of data, or lowest 75% = 75th percentile |
510
|
|
|
|
|
|
|
4 => fourth quartile (Q4) : maximal value |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
=cut |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
sub quantile { |
515
|
|
|
|
|
|
|
my $self = shift; |
516
|
|
|
|
|
|
|
my $t = shift; |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
croak (__PACKAGE__.": quantile() argument must be one of 0..4") |
519
|
|
|
|
|
|
|
unless $t =~ /^[0-4]$/; |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
$t or return $self->min; |
522
|
|
|
|
|
|
|
return $self->percentile($t * 100 / 4); |
523
|
|
|
|
|
|
|
}; |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
=head2 median |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
Return median of data, a value that divides the sample in half. |
528
|
|
|
|
|
|
|
Same as percentile(50). |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=cut |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
sub median { |
533
|
16
|
|
|
16
|
1
|
1389
|
my $self = shift; |
534
|
16
|
|
|
|
|
40
|
return $self->percentile(50); |
535
|
|
|
|
|
|
|
}; |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head2 trimmed_mean( $ltrim, [ $utrim ] ) |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Return mean of sample with $ltrim and $utrim fraction of data points |
540
|
|
|
|
|
|
|
remover from lower and upper ends respectively. |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
ltrim defaults to 0, and rtrim to ltrim. |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=cut |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
sub trimmed_mean { |
547
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
548
|
1
|
|
|
|
|
4
|
my ($lower, $upper) = @_; |
549
|
1
|
|
50
|
|
|
3
|
$lower ||= 0; |
550
|
1
|
50
|
|
|
|
4
|
$upper = $lower unless defined $upper; |
551
|
|
|
|
|
|
|
|
552
|
1
|
|
|
|
|
6
|
my $min = $self->percentile($lower * 100); |
553
|
1
|
|
|
|
|
5
|
my $max = $self->percentile(100 - $upper * 100); |
554
|
|
|
|
|
|
|
|
555
|
1
|
50
|
|
|
|
4
|
return unless $min < $max; |
556
|
|
|
|
|
|
|
|
557
|
1
|
|
|
5
|
|
9
|
return $self->mean_of(sub{$_[0]}, $min, $max); |
|
5
|
|
|
|
|
8
|
|
558
|
|
|
|
|
|
|
}; |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=head2 harmonic_mean |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
Return harmonic mean of the data, i.e. 1/E(1/x). |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
Return undef if division by zero occurs (see Statistics::Descriptive). |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=cut |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
sub harmonic_mean { |
569
|
2
|
|
|
2
|
1
|
988
|
my $self = shift; |
570
|
|
|
|
|
|
|
|
571
|
2
|
|
|
|
|
4
|
my $ret; |
572
|
2
|
|
|
|
|
4
|
eval { |
573
|
2
|
|
|
5
|
|
6
|
$ret = $self->count / $self->sum_of(sub { 1/$_[0] }); |
|
5
|
|
|
|
|
10
|
|
574
|
|
|
|
|
|
|
}; |
575
|
2
|
50
|
66
|
|
|
33
|
if ($@ and $@ !~ /division.*zero/) { |
576
|
0
|
|
|
|
|
0
|
die $@; # rethrow ALL BUT 1/0 which yields undef |
577
|
|
|
|
|
|
|
}; |
578
|
2
|
|
|
|
|
13
|
return $ret; |
579
|
|
|
|
|
|
|
}; |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
=head2 geometric_mean |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
Return geometric mean of the data, that is, exp(E(log x)). |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
Dies unless all data points are of the same sign. |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=cut |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
sub geometric_mean { |
590
|
2
|
|
|
2
|
1
|
1050
|
my $self = shift; |
591
|
|
|
|
|
|
|
|
592
|
2
|
100
|
|
|
|
8
|
return unless $self->count; |
593
|
1
|
50
|
|
|
|
4
|
croak __PACKAGE__.": geometric_mean() called on mixed sign sample" |
594
|
|
|
|
|
|
|
if $self->min * $self->max < 0; |
595
|
|
|
|
|
|
|
|
596
|
1
|
50
|
|
|
|
4
|
return 0 if $self->{data}{0}; |
597
|
|
|
|
|
|
|
# this must be dog slow, but we already log() too much at this point. |
598
|
1
|
|
|
5
|
|
6
|
my $ret = exp( $self->sum_of( sub { log abs $_[0] } ) / $self->{count} ); |
|
5
|
|
|
|
|
13
|
|
599
|
1
|
50
|
|
|
|
5
|
return $self->min < 0 ? -$ret : $ret; |
600
|
|
|
|
|
|
|
}; |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
=head2 skewness |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
Return skewness of the distribution, calculated as |
605
|
|
|
|
|
|
|
n/(n-1)(n-2) * E((x-E(x))**3)/std_dev**3 (this is consistent with Excel). |
606
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=cut |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
sub skewness { |
610
|
20
|
|
|
20
|
1
|
2017
|
my $self = shift; |
611
|
|
|
|
|
|
|
|
612
|
20
|
|
|
|
|
38
|
my $n = $self->{count}; |
613
|
20
|
100
|
|
|
|
63
|
return unless $n > 2; |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# code stolen from Statistics::Descriptive |
616
|
18
|
|
|
|
|
57
|
my $skew = $n * $self->std_moment(3); |
617
|
18
|
|
|
|
|
44
|
my $correction = $n / ( ($n-1) * ($n-2) ); |
618
|
18
|
|
|
|
|
96
|
return $correction * $skew; |
619
|
|
|
|
|
|
|
}; |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
=head2 kurtosis |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
Return kurtosis of the distribution, that is 4-th standardized moment - 3. |
624
|
|
|
|
|
|
|
The exact formula used here is consistent with that of Excel and |
625
|
|
|
|
|
|
|
Statistics::Descriptive. |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=cut |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
sub kurtosis { |
630
|
22
|
|
|
22
|
1
|
1036
|
my $self = shift; |
631
|
|
|
|
|
|
|
|
632
|
22
|
|
|
|
|
47
|
my $n = $self->{count}; |
633
|
22
|
100
|
|
|
|
67
|
return unless $n > 3; |
634
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
# code stolen from Statistics::Descriptive |
636
|
20
|
|
|
|
|
55
|
my $kurt = $n * $self->std_moment(4); |
637
|
20
|
|
|
|
|
74
|
my $correction1 = ( $n * ($n+1) ) / ( ($n-1) * ($n-2) * ($n-3) ); |
638
|
20
|
|
|
|
|
56
|
my $correction2 = ( 3 * ($n-1) ** 2) / ( ($n-2) * ($n-3) ); |
639
|
|
|
|
|
|
|
|
640
|
20
|
|
|
|
|
99
|
return $correction1 * $kurt - $correction2; |
641
|
|
|
|
|
|
|
}; |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
=head2 central_moment( $n ) |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
Return $n-th central moment, that is, E((x - E(x))^$n). |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full. |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
=cut |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
sub central_moment { |
652
|
|
|
|
|
|
|
my $self = shift; |
653
|
|
|
|
|
|
|
my $n = shift; |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
my $mean = $self->mean; |
656
|
|
|
|
|
|
|
return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) / $self->{count}; |
657
|
|
|
|
|
|
|
}; |
658
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
=head2 std_moment( $n ) |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
Return $n-th standardized moment, that is, |
662
|
|
|
|
|
|
|
E((x - E(x))**$n) / std_dev(x)**$n. |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
Not present in Statistics::Descriptive::Full. |
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
=cut |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
sub std_moment { |
669
|
|
|
|
|
|
|
my $self = shift; |
670
|
|
|
|
|
|
|
my $n = shift; |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
my $mean = $self->mean; |
673
|
|
|
|
|
|
|
my $dev = $self->std_dev; |
674
|
|
|
|
|
|
|
return $self->sum_of(sub{ ($_[0] - $mean) ** $n }) |
675
|
|
|
|
|
|
|
/ ( $dev**$n * $self->{count} ); |
676
|
|
|
|
|
|
|
}; |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
=head2 abs_moment( $power, [$offset] ) |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
Return $n-th moment of absolute value, that is, C. |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
Default value for offset if E(x). |
683
|
|
|
|
|
|
|
Power may be fractional. |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
B Experimental. Not present in Statistics::Descriptive::Full. |
686
|
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
=cut |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
sub abs_moment { |
690
|
|
|
|
|
|
|
my ($self, $power, $offset) = @_; |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
$offset = $self->mean unless defined $offset; |
693
|
|
|
|
|
|
|
return $self->sum_of(sub{ return abs($_[0] - $offset) ** $power }) |
694
|
|
|
|
|
|
|
/ $self->{count}; |
695
|
|
|
|
|
|
|
}; |
696
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
=head2 std_abs_moment( $power, [$offset] ) |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
Returns standardized absolute moment - like above, but scaled |
700
|
|
|
|
|
|
|
down by a factor of to standard deviation to n-th power. |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
That is, C |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
Default value for offset if E(x). |
705
|
|
|
|
|
|
|
Power may be fractional. |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
B Experimental. Not present in Statistics::Descriptive::Full. |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
=cut |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
sub std_abs_moment { |
712
|
14
|
|
|
14
|
1
|
51
|
my ($self, $power, $offset) = @_; |
713
|
|
|
|
|
|
|
|
714
|
14
|
|
|
|
|
43
|
return $self->abs_moment($power, $offset) |
715
|
|
|
|
|
|
|
/ |
716
|
|
|
|
|
|
|
($self->abs_moment(2, $offset) ** ($power/2)); |
717
|
|
|
|
|
|
|
}; |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
=head2 mode |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
Mode of a distribution is the most common value for a discrete distribution, |
722
|
|
|
|
|
|
|
or maximum of probability density for continuous one. |
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
For now we assume that the distribution IS discrete, and return the bin with |
725
|
|
|
|
|
|
|
the biggest hit count. |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
NOTE A better algorithm is still wanted. Experimental. |
728
|
|
|
|
|
|
|
Behavior may change in the future. |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=cut |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
# Naive implementation |
733
|
|
|
|
|
|
|
# Find bin w/greatest count and return it |
734
|
|
|
|
|
|
|
sub mode { |
735
|
|
|
|
|
|
|
my $self = shift; |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
return if !$self->count; |
738
|
|
|
|
|
|
|
|
739
|
|
|
|
|
|
|
my $index = $self->_sort; |
740
|
|
|
|
|
|
|
return $index->[0] if @$index == 1; |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
my @count = map { $self->{data}{$_} } @$index; |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
my $max_index; |
745
|
|
|
|
|
|
|
my $max_growth = 0; |
746
|
|
|
|
|
|
|
for (my $i = 0; $i<@count; $i++) { |
747
|
|
|
|
|
|
|
$count[$i] > $max_growth or next; |
748
|
|
|
|
|
|
|
$max_index = $i; |
749
|
|
|
|
|
|
|
$max_growth = $count[$i]; |
750
|
|
|
|
|
|
|
}; |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
return $index->[$max_index]; |
753
|
|
|
|
|
|
|
}; |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
=head2 frequency_distribution_ref( \@index ) |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
=head2 frequency_distribution_ref( $n ) |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
=head2 frequency_distribution_ref |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
Return numbers of data point counts below each number in @index as hashref. |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
If a number is given instead of arrayref, @index is created |
764
|
|
|
|
|
|
|
by dividing [min, max] into $n intervals. |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
If no parameters are given, return previous result, if any. |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
=cut |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
sub frequency_distribution_ref { |
771
|
3
|
|
|
3
|
1
|
989
|
my $self = shift; |
772
|
3
|
|
|
|
|
6
|
my $index = shift; |
773
|
|
|
|
|
|
|
|
774
|
3
|
100
|
|
|
|
9
|
return unless $self->count; |
775
|
|
|
|
|
|
|
# ah, compatibility - return last value |
776
|
|
|
|
|
|
|
return $self->{cache}{frequency_distribution_ref} |
777
|
2
|
50
|
|
|
|
4
|
unless defined $index; |
778
|
|
|
|
|
|
|
# make index if number given |
779
|
2
|
100
|
|
|
|
5
|
if (!ref $index) { |
780
|
1
|
50
|
|
|
|
3
|
croak __PACKAGE__.": frequency_distribution_ref(): ". |
781
|
|
|
|
|
|
|
"argument must be array, of number > 2, not $index" |
782
|
|
|
|
|
|
|
unless $index > 2; |
783
|
1
|
|
|
|
|
4
|
my $min = $self->_lower($self->min); |
784
|
1
|
|
|
|
|
7
|
my $max = $self->_upper($self->max); |
785
|
1
|
|
|
|
|
3
|
my $step = ($max - $min) / $index; |
786
|
1
|
|
|
|
|
4
|
$index = [ map { $min + $_ * $step } 1..$index ]; |
|
5
|
|
|
|
|
9
|
|
787
|
|
|
|
|
|
|
}; |
788
|
|
|
|
|
|
|
|
789
|
2
|
|
|
|
|
10
|
@$index = (-$INF, sort { $a <=> $b } @$index); |
|
12
|
|
|
|
|
20
|
|
790
|
|
|
|
|
|
|
|
791
|
2
|
|
|
|
|
8
|
my @count; |
792
|
2
|
|
|
|
|
7
|
for (my $i = 0; $i<@$index-1; $i++) { |
793
|
9
|
|
|
|
|
22
|
push @count, $self->_count( $index->[$i], $index->[$i+1] ); |
794
|
|
|
|
|
|
|
}; |
795
|
2
|
|
|
|
|
4
|
shift @$index; # remove -inf |
796
|
|
|
|
|
|
|
|
797
|
2
|
|
|
|
|
3
|
my %hash; |
798
|
2
|
|
|
|
|
19
|
@hash{@$index} = @count; |
799
|
2
|
|
|
|
|
6
|
$self->{cache}{frequency_distribution_ref} = \%hash; |
800
|
2
|
|
|
|
|
11
|
return \%hash; |
801
|
|
|
|
|
|
|
}; |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
=head1 Specific methods |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
The folowing methods only apply to this module, or are experimental. |
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
=cut |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
=head2 bucket_width |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
Get bin width (relative to center of bin). Percentiles are off |
812
|
|
|
|
|
|
|
by no more than half of this. DEPRECATED. |
813
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
=cut |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
sub bucket_width { |
817
|
5
|
|
|
5
|
1
|
10
|
my $self = shift; |
818
|
5
|
|
|
|
|
30
|
return $self->{base} - 1; |
819
|
|
|
|
|
|
|
}; |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
=head2 log_base |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
Get upper/lower bound ratio for logarithmic bins. |
824
|
|
|
|
|
|
|
This represents relative precision of sample. |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
=head2 linear_width |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
Get width of linear buckets. |
829
|
|
|
|
|
|
|
This represents absolute precision of sample. |
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
=head2 linear_threshold |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
Get absolute value threshold below which interpolation is switched to linear. |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
=cut |
836
|
|
|
|
|
|
|
|
837
|
|
|
|
|
|
|
sub log_base { |
838
|
3
|
|
|
3
|
1
|
6
|
my $self = shift; |
839
|
3
|
|
|
|
|
11
|
return $self->{base}; |
840
|
|
|
|
|
|
|
}; |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
sub linear_width { |
843
|
3
|
|
|
3
|
1
|
7
|
my $self = shift; |
844
|
3
|
|
|
|
|
17
|
return $self->{linear_width}; |
845
|
|
|
|
|
|
|
}; |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
sub linear_threshold { |
848
|
8
|
|
|
8
|
1
|
27
|
my $self = shift; |
849
|
8
|
|
|
|
|
38
|
return $self->{linear_thresh}; |
850
|
|
|
|
|
|
|
}; |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
=head2 add_data_hash ( { value => weight, ... } ) |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
Add values with counts/weights. |
855
|
|
|
|
|
|
|
This can be used to import data from other |
856
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale object. |
857
|
|
|
|
|
|
|
|
858
|
|
|
|
|
|
|
Returns self, so that methods can be chained. |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
Negative counts are allowed and treated as "forgetting" data. |
861
|
|
|
|
|
|
|
If a bin count goes below zero, such bin is simply discarded. |
862
|
|
|
|
|
|
|
Minus infinity weight is allowed and has the same effect. |
863
|
|
|
|
|
|
|
Data is guaranteed to remain consistent. |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
If incorrect data is given (i.e. non-numeric, undef, or +infinity), |
866
|
|
|
|
|
|
|
an exception is thrown and nothing changes. |
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
B Cache may be reset, even if no data was actually inserted. |
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
B It is possible to add infinite values to data pool. |
871
|
|
|
|
|
|
|
The module will try and calculate whatever can still be calculated. |
872
|
|
|
|
|
|
|
However, no portable way of serializing such values is done yet. |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
=cut |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
sub add_data_hash { |
877
|
20
|
|
|
20
|
1
|
56
|
my $self = shift; |
878
|
20
|
|
|
|
|
30
|
my $hash = shift; |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
# check incoming data for being numeric, and no +inf values |
881
|
20
|
|
|
|
|
35
|
eval { |
882
|
25
|
|
|
25
|
|
57971
|
use warnings FATAL => qw(numeric); |
|
25
|
|
|
|
|
84
|
|
|
25
|
|
|
|
|
45637
|
|
883
|
20
|
|
|
|
|
100
|
while (my ($k, $v) = each %$hash) { |
884
|
294
|
100
|
33
|
|
|
1437
|
$k == 0+$k and $v == 0+$v and $v < $INF |
|
|
|
66
|
|
|
|
|
885
|
|
|
|
|
|
|
or die "Infinite count for $k\n"; |
886
|
|
|
|
|
|
|
} |
887
|
|
|
|
|
|
|
}; |
888
|
20
|
100
|
|
|
|
282
|
croak __PACKAGE__.": add_data_hash failed: $@" |
889
|
|
|
|
|
|
|
if $@; |
890
|
|
|
|
|
|
|
|
891
|
18
|
|
|
|
|
42
|
delete $self->{cache}; |
892
|
|
|
|
|
|
|
|
893
|
|
|
|
|
|
|
# update our counters |
894
|
18
|
|
|
|
|
70
|
foreach (keys %$hash) { |
895
|
291
|
50
|
|
|
|
453
|
next unless $hash->{$_}; |
896
|
291
|
|
|
|
|
454
|
my $key = $self->_round($_); |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
# Insert data. Make sure -Inf doesn't corrupt our counter. |
899
|
291
|
|
100
|
|
|
950
|
my $newcount = ($self->{data}{$key} || 0) + $hash->{$_}; |
900
|
291
|
100
|
|
|
|
444
|
if ($newcount > 0) { |
901
|
|
|
|
|
|
|
# normal insert |
902
|
279
|
|
|
|
|
650
|
$self->{data}{$key} = $newcount; |
903
|
279
|
|
|
|
|
469
|
$self->{count} += $hash->{$_}; |
904
|
|
|
|
|
|
|
} else { |
905
|
|
|
|
|
|
|
# We're "forgetting" data, AND the bin got empty |
906
|
12
|
|
100
|
|
|
50
|
$self->{count} -= delete $self->{data}{$key} || 0; |
907
|
|
|
|
|
|
|
}; |
908
|
|
|
|
|
|
|
}; |
909
|
18
|
|
|
|
|
49
|
$self; |
910
|
|
|
|
|
|
|
}; |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
=head2 get_data_hash( %options ) |
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
Return distribution hashref {value => number of occurances}. |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
This is inverse of add_data_hash. |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
Options may include: |
919
|
|
|
|
|
|
|
|
920
|
|
|
|
|
|
|
=over |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
=item * min - ignore values below this. (See find_boundaries) |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
=item * max - ignore values above this. (See find_boundaries) |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. (See find_boundaries) |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. (See find_boundaries) |
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
=item * noise_thresh - strip bins with count below this. |
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
=back |
933
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
=cut |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
sub get_data_hash { |
937
|
65
|
|
|
65
|
1
|
11750
|
my ($self, %opt) = @_; |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
# shallow copy of data if no options given |
940
|
65
|
100
|
|
|
|
168
|
return {%{ $self->{data} }} unless %opt; |
|
63
|
|
|
|
|
682
|
|
941
|
|
|
|
|
|
|
|
942
|
2
|
|
|
|
|
8
|
my ($min, $max) = $self->find_boundaries( %opt ); |
943
|
2
|
|
100
|
|
|
6
|
my $noize = $opt{noize_thresh} || 0; |
944
|
|
|
|
|
|
|
|
945
|
2
|
|
|
|
|
4
|
my $data = $self->{data}; |
946
|
2
|
|
|
|
|
4
|
my %hash; |
947
|
2
|
|
|
|
|
5
|
foreach (keys %$data ) { |
948
|
13
|
100
|
|
|
|
30
|
$_ < $min and next; |
949
|
8
|
50
|
|
|
|
12
|
$_ > $max and next; |
950
|
8
|
100
|
|
|
|
16
|
$data->{$_} < $noize and next; |
951
|
6
|
|
|
|
|
10
|
$hash{$_} = $data->{$_}; |
952
|
|
|
|
|
|
|
}; |
953
|
|
|
|
|
|
|
|
954
|
2
|
|
|
|
|
9
|
return \%hash; |
955
|
|
|
|
|
|
|
}; |
956
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
=head2 TO_JSON() |
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
Return enough data to recreate the whole object as an unblessed hashref. |
960
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
This routine conforms with C, hence the name. |
962
|
|
|
|
|
|
|
Can be called as |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
my $str = JSON::XS->new->allow_blessed->convert_blessed->encode( $this ); |
965
|
|
|
|
|
|
|
|
966
|
|
|
|
|
|
|
B This module DOES NOT require JSON::XS or serialize to JSON. |
967
|
|
|
|
|
|
|
It just deals with data. |
968
|
|
|
|
|
|
|
Use C, C, C or any serializer of choice. |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
my $raw_data = $stat->TO_JSON; |
971
|
|
|
|
|
|
|
Statistics::Descriptive::LogScale->new( %$raw_data ); |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
Would generate an exact copy of C<$stat> object |
974
|
|
|
|
|
|
|
(provided it's S::D::L and not a subclass). |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
=head2 clone( [ %options ] ) |
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
Copy constructor - returns copy of an existing object. |
979
|
|
|
|
|
|
|
Cache is not preserved. |
980
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
Constructor options may be given to override existing data. See new(). |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
Trim options may be given to get partial data. See get_data_hash(). |
984
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
=cut |
986
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
sub clone { |
988
|
8
|
|
|
8
|
1
|
981
|
my ($self, %opt) = @_; |
989
|
|
|
|
|
|
|
|
990
|
8
|
|
|
|
|
26
|
my $raw = $self->TO_JSON; |
991
|
8
|
100
|
|
|
|
28
|
if (%opt) { |
992
|
|
|
|
|
|
|
$raw->{data} = $self->get_data_hash( %opt ) |
993
|
4
|
100
|
|
|
|
12
|
unless exists $opt{data}; |
994
|
|
|
|
|
|
|
exists $opt{$_} and $raw->{$_} = $opt{$_} |
995
|
4
|
|
66
|
|
|
30
|
for @new_keys; |
996
|
|
|
|
|
|
|
}; |
997
|
|
|
|
|
|
|
|
998
|
8
|
|
|
|
|
41
|
return (ref $self)->new( %$raw ); |
999
|
|
|
|
|
|
|
}; |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
sub TO_JSON { |
1002
|
32
|
|
|
32
|
1
|
173
|
my $self = shift; |
1003
|
|
|
|
|
|
|
# UGLY HACK Increase linear_thresh by a factor of base ** 1/10 |
1004
|
|
|
|
|
|
|
# so that it's rounded down to present value |
1005
|
|
|
|
|
|
|
return { |
1006
|
|
|
|
|
|
|
CLASS => ref $self, |
1007
|
|
|
|
|
|
|
VERSION => $VERSION, |
1008
|
|
|
|
|
|
|
base => $self->{base}, |
1009
|
|
|
|
|
|
|
linear_width => $self->{linear_width}, |
1010
|
|
|
|
|
|
|
linear_thresh => $self->{linear_thresh} * ($self->{base}+9)/10, |
1011
|
|
|
|
|
|
|
only_linear => $self->{only_linear}, |
1012
|
32
|
|
|
|
|
132
|
data => $self->get_data_hash, |
1013
|
|
|
|
|
|
|
}; |
1014
|
|
|
|
|
|
|
}; |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
=head2 scale_sample( $scale ) |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
Multiply all bins' counts by given value. This can be used to adjust |
1019
|
|
|
|
|
|
|
significance of previous data before adding new data (e.g. gradually |
1020
|
|
|
|
|
|
|
"forgetting" past data in a long-running application). |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
=cut |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
sub scale_sample { |
1025
|
1
|
|
|
1
|
1
|
9
|
my $self = shift; |
1026
|
1
|
|
|
|
|
3
|
my $factor = shift; |
1027
|
1
|
50
|
|
|
|
7
|
$factor > 0 or croak (__PACKAGE__.": scale_sample: factor must be positive"); |
1028
|
|
|
|
|
|
|
|
1029
|
1
|
|
|
|
|
3
|
delete $self->{cache}; |
1030
|
1
|
|
|
|
|
3
|
foreach (@{ $self->_sort }) { |
|
1
|
|
|
|
|
5
|
|
1031
|
100
|
|
|
|
|
135
|
$self->{data}{$_} *= $factor; |
1032
|
|
|
|
|
|
|
}; |
1033
|
1
|
|
|
|
|
2
|
$self->{count} *= $factor; |
1034
|
1
|
|
|
|
|
3
|
return $self; |
1035
|
|
|
|
|
|
|
}; |
1036
|
|
|
|
|
|
|
|
1037
|
|
|
|
|
|
|
=head2 mean_of( $code, [$min, $max] ) |
1038
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
Return expectation of $code over sample within given range. |
1040
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
$code is expected to be a pure function (i.e. depending only on its input |
1042
|
|
|
|
|
|
|
value, and having no side effects). |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
The underlying integration mechanism only calculates $code once per bin, |
1045
|
|
|
|
|
|
|
so $code should be stable as in not vary wildly over small intervals. |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
=cut |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
sub mean_of { |
1050
|
24
|
|
|
24
|
1
|
97
|
my $self = shift; |
1051
|
24
|
|
|
|
|
82
|
my ($code, $min, $max) = @_; |
1052
|
|
|
|
|
|
|
|
1053
|
24
|
|
|
429
|
|
131
|
my $weight = $self->sum_of( sub {1}, $min, $max ); |
|
429
|
|
|
|
|
1091
|
|
1054
|
24
|
100
|
|
|
|
125
|
return unless $weight; |
1055
|
23
|
|
|
|
|
106
|
return $self->sum_of($code, $min, $max) / $weight; |
1056
|
|
|
|
|
|
|
}; |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
=head1 Experimental methods |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
These methods may be subject to change in the future, or stay, if they |
1061
|
|
|
|
|
|
|
are good. |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
=head2 sum_of ( $code, [ $min, $max ] ) |
1064
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
Integrate arbitrary function over the sample within the [ $min, $max ] interval. |
1066
|
|
|
|
|
|
|
Default values for both limits are infinities of appropriate sign. |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
Values in the edge bins are cut using interpolation if needed. |
1069
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
NOTE: sum_of(sub{1}, $a, $b) would return rough nubmer of data points |
1071
|
|
|
|
|
|
|
between $a and $b. |
1072
|
|
|
|
|
|
|
|
1073
|
|
|
|
|
|
|
EXPERIMENTAL. The method name may change in the future. |
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
=cut |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
sub sum_of { |
1078
|
189
|
|
|
189
|
1
|
1081
|
my $self = shift; |
1079
|
189
|
|
|
|
|
487
|
my ($code, $realmin, $realmax) = @_; |
1080
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
# Just app up stuff |
1082
|
189
|
100
|
100
|
|
|
853
|
if (!defined $realmin and !defined $realmax) { |
1083
|
166
|
|
|
|
|
338
|
my $sum = 0; |
1084
|
166
|
|
|
|
|
298
|
while (my ($val, $count) = each %{ $self->{data} }) { |
|
63889
|
|
|
|
|
149935
|
|
1085
|
63723
|
|
|
|
|
90845
|
$sum += $count * $code->( $val ); |
1086
|
|
|
|
|
|
|
}; |
1087
|
166
|
|
|
|
|
1035
|
return $sum; |
1088
|
|
|
|
|
|
|
}; |
1089
|
|
|
|
|
|
|
|
1090
|
23
|
100
|
|
|
|
58
|
$realmin = -$INF unless defined $realmin; |
1091
|
23
|
100
|
|
|
|
58
|
$realmax = $INF unless defined $realmax; |
1092
|
23
|
50
|
|
|
|
70
|
return 0 if( $realmin >= $realmax ); |
1093
|
|
|
|
|
|
|
|
1094
|
|
|
|
|
|
|
# correct limits. $min, $max are indices; $left, $right are limits |
1095
|
23
|
|
|
|
|
63
|
my $min = $self->_round($realmin); |
1096
|
23
|
|
|
|
|
62
|
my $max = $self->_round($realmax); |
1097
|
23
|
|
|
|
|
65
|
my $left = $self->_lower($realmin); |
1098
|
23
|
|
|
|
|
67
|
my $right = $self->_upper($realmax); |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
# find first bin that's above $left |
1101
|
23
|
|
|
|
|
60
|
my $keys = $self->_sort; |
1102
|
23
|
|
|
|
|
72
|
my $i = _bin_search_ge($keys, $left); |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
# warn "sum_of [$min, $max]"; |
1105
|
|
|
|
|
|
|
# add up bins |
1106
|
23
|
|
|
|
|
44
|
my $sum = 0; |
1107
|
23
|
|
|
|
|
53
|
for (; $i < @$keys; $i++) { |
1108
|
284
|
|
|
|
|
921
|
my $val = $keys->[$i]; |
1109
|
284
|
100
|
|
|
|
598
|
last if $val > $right; |
1110
|
264
|
|
|
|
|
564
|
$sum += $self->{data}{$val} * $code->( $val ); |
1111
|
|
|
|
|
|
|
}; |
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
# cut edges: the hard part |
1114
|
|
|
|
|
|
|
# min and max are now used as indices |
1115
|
|
|
|
|
|
|
# if min or max hits 0, we cut it in half (i.e. into equal 0+ and 0-) |
1116
|
|
|
|
|
|
|
# warn "Add up, sum_of = $sum"; |
1117
|
23
|
100
|
|
|
|
132
|
if ($self->{data}{$max}) { |
1118
|
20
|
|
|
|
|
53
|
my $width = $self->_upper($max) - $self->_lower($max); |
1119
|
20
|
100
|
|
|
|
67
|
my $part = $width |
1120
|
|
|
|
|
|
|
? ($self->_upper($max) - $realmax) / $width |
1121
|
|
|
|
|
|
|
: 0.5; |
1122
|
20
|
|
|
|
|
112
|
$sum -= $self->{data}{$max} * $code->($max) * $part; |
1123
|
|
|
|
|
|
|
}; |
1124
|
|
|
|
|
|
|
# warn "Cut R, sum_of = $sum"; |
1125
|
23
|
100
|
|
|
|
120
|
if ($self->{data}{$min}) { |
1126
|
20
|
|
|
|
|
50
|
my $width = $self->_upper($min) - $self->_lower($min); |
1127
|
20
|
100
|
|
|
|
60
|
my $part = $width |
1128
|
|
|
|
|
|
|
? ($realmin - $self->_lower($min)) / $width |
1129
|
|
|
|
|
|
|
: 0.5; |
1130
|
20
|
|
|
|
|
68
|
$sum -= $self->{data}{$min} * $code->($min) * $part; |
1131
|
|
|
|
|
|
|
}; |
1132
|
|
|
|
|
|
|
# warn "Cut L, sum_of = $sum"; |
1133
|
|
|
|
|
|
|
|
1134
|
23
|
|
|
|
|
150
|
return $sum; |
1135
|
|
|
|
|
|
|
}; # end sum_of |
1136
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
=head2 histogram ( %options ) |
1138
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
Returns array of form [ [ count0_1, x0, x1 ], [count1_2, x1, x2 ], ... ] |
1140
|
|
|
|
|
|
|
where countX_Y is number of data points between X and Y. |
1141
|
|
|
|
|
|
|
|
1142
|
|
|
|
|
|
|
Options may include: |
1143
|
|
|
|
|
|
|
|
1144
|
|
|
|
|
|
|
=over |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
=item * count (+) - number of intervals to divide sample into. |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
=item * index (+) - interval borders as array. Will be sorted before processing. |
1149
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
=item * min - ignore values below this. Default = $self->min - epsilon. |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
=item * max - ignore values above this. Default = $self->max + epsilon. |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
=item * normalize_to - adjust counts so that max number becomes nnn. |
1159
|
|
|
|
|
|
|
This may be useful if one intends to draw pictures. |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
=back |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
Either count or index must be present. |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
NOTE: this is equivalent to frequency_distribution_ref but better suited |
1166
|
|
|
|
|
|
|
for omitting sample tails and outputting pretty pictures. |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=cut |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
sub histogram { |
1171
|
5
|
|
|
5
|
1
|
2338
|
my ($self, %opt) = @_; |
1172
|
|
|
|
|
|
|
|
1173
|
5
|
100
|
|
|
|
15
|
return unless $self->count; |
1174
|
4
|
|
|
|
|
12
|
my ($min, $max) = $self->find_boundaries( %opt ); |
1175
|
|
|
|
|
|
|
# build/check index |
1176
|
4
|
100
|
|
|
|
7
|
my @index = @{ $opt{index} || [] }; |
|
4
|
|
|
|
|
19
|
|
1177
|
4
|
100
|
|
|
|
11
|
if (!@index) { |
1178
|
3
|
|
|
|
|
4
|
my $n = $opt{count}; |
1179
|
3
|
50
|
|
|
|
7
|
$n > 0 or croak (__PACKAGE__.": histogram: insufficient options (count < 1 )"); |
1180
|
3
|
|
|
|
|
7
|
my $step = ($max - $min) / $n; |
1181
|
3
|
|
|
|
|
9
|
for (my $x = $min; $x <= $max; $x += $step) { |
1182
|
16
|
|
|
|
|
31
|
push @index, $x; |
1183
|
|
|
|
|
|
|
}; |
1184
|
|
|
|
|
|
|
} else { |
1185
|
|
|
|
|
|
|
# sort & uniq raw index |
1186
|
1
|
|
|
|
|
3
|
my %known; |
1187
|
1
|
|
|
|
|
2
|
@index = grep { !$known{$_}++ } @index; |
|
5
|
|
|
|
|
14
|
|
1188
|
1
|
|
|
|
|
3
|
@index = sort { $a <=> $b } @index; |
|
8
|
|
|
|
|
12
|
|
1189
|
1
|
50
|
|
|
|
4
|
@index > 1 or croak (__PACKAGE__.": histogram: insufficient options (index < 2)"); |
1190
|
|
|
|
|
|
|
}; |
1191
|
|
|
|
|
|
|
|
1192
|
|
|
|
|
|
|
# finally: estimated counts between indices! |
1193
|
4
|
|
|
|
|
7
|
my @ret; |
1194
|
4
|
|
|
|
|
9
|
for ( my $i = 0; $i<@index-1; $i++) { |
1195
|
17
|
|
|
|
|
37
|
my $count = $self->_count( $index[$i], $index[$i+1] ); |
1196
|
17
|
|
|
|
|
61
|
push @ret, [ $count, $index[$i], $index[$i+1] ]; |
1197
|
|
|
|
|
|
|
}; |
1198
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
# if normalize - find maximum & divide by it |
1200
|
4
|
100
|
|
|
|
11
|
if (my $norm = $opt{normalize_to}) { |
1201
|
1
|
|
|
|
|
2
|
my $max = 0; |
1202
|
|
|
|
|
|
|
$max < $_->[0] and $max = $_->[0] |
1203
|
1
|
|
66
|
|
|
7
|
for @ret; |
1204
|
1
|
|
|
|
|
2
|
$norm /= $max; |
1205
|
1
|
|
|
|
|
4
|
$_->[0] *= $norm for @ret; |
1206
|
|
|
|
|
|
|
}; |
1207
|
|
|
|
|
|
|
|
1208
|
4
|
|
|
|
|
17
|
return \@ret; |
1209
|
|
|
|
|
|
|
}; |
1210
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
=head2 find_boundaries( %opt ) |
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
Return ($min, $max) of part of sample denoted by options. |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
Options may include: |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
=over |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
=item * min - ignore values below this. default = min() - epsilon. |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
=item * max - ignore values above this. default = max() + epsilon. |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
=item * ltrim - ignore this % of values on lower end. |
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
=item * rtrim - ignore this % of values on upper end. |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
=back |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
If no options are given, the whole sample is guaranteed to reside between |
1230
|
|
|
|
|
|
|
returned values. |
1231
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
=cut |
1233
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
sub find_boundaries { |
1235
|
9
|
|
|
9
|
1
|
1600
|
my $self = shift; |
1236
|
9
|
|
|
|
|
22
|
my %opt = @_; |
1237
|
|
|
|
|
|
|
|
1238
|
9
|
100
|
|
|
|
52
|
return unless $self->count; |
1239
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
# preprocess boundaries |
1241
|
8
|
100
|
|
|
|
31
|
my $min = defined $opt{min} ? $opt{min} : $self->_lower( $self->min ); |
1242
|
8
|
100
|
|
|
|
31
|
my $max = defined $opt{max} ? $opt{max} : $self->_upper( $self->max ); |
1243
|
|
|
|
|
|
|
|
1244
|
8
|
100
|
|
|
|
22
|
if ($opt{ltrim}) { |
1245
|
1
|
|
|
|
|
5
|
my $newmin = $self->percentile( $opt{ltrim} ); |
1246
|
1
|
50
|
33
|
|
|
7
|
defined $newmin and $newmin > $min and $min = $newmin; |
1247
|
|
|
|
|
|
|
}; |
1248
|
8
|
100
|
|
|
|
19
|
if ($opt{utrim}) { |
1249
|
1
|
|
|
|
|
4
|
my $newmax = $self->percentile( 100-$opt{utrim} ); |
1250
|
1
|
50
|
33
|
|
|
6
|
defined $newmax and $newmax < $max and $max = $newmax; |
1251
|
|
|
|
|
|
|
}; |
1252
|
|
|
|
|
|
|
|
1253
|
8
|
|
|
|
|
25
|
return ($min, $max); |
1254
|
|
|
|
|
|
|
}; |
1255
|
|
|
|
|
|
|
|
1256
|
|
|
|
|
|
|
=head2 format( "printf-like expression", ... ) |
1257
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
Returns a summary as requested by format string. |
1259
|
|
|
|
|
|
|
Just as with printf and sprintf, a placeholder starts with a C<%>, |
1260
|
|
|
|
|
|
|
followed by formatting options and a |
1261
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
The following placeholders are supported: |
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
=over |
1265
|
|
|
|
|
|
|
|
1266
|
|
|
|
|
|
|
=item * % - a literal % |
1267
|
|
|
|
|
|
|
|
1268
|
|
|
|
|
|
|
=item * s, f, g - a normal printf acting on an extra argument. |
1269
|
|
|
|
|
|
|
The number of extra arguments MUST match the number of such placeholders, |
1270
|
|
|
|
|
|
|
or this function dies. |
1271
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
=item * n - count; |
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
=item * m - min; |
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
=item * M - max, |
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
=item * a - mean, |
1279
|
|
|
|
|
|
|
|
1280
|
|
|
|
|
|
|
=item * d - standard deviation, |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
=item * S - skewness, |
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
=item * K - kurtosis, |
1285
|
|
|
|
|
|
|
|
1286
|
|
|
|
|
|
|
=item * q(x) - x-th quantile (requires argument), |
1287
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
=item * p(x) - x-th percentile (requires argument), |
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
=item * P(x) - cdf - the inferred cumulative distribution function (x) |
1291
|
|
|
|
|
|
|
(requires argument), |
1292
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
=item * e(n) - central_moment - central moment of n-th power |
1294
|
|
|
|
|
|
|
(requires argument), |
1295
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
=item * E(n) - std_moment - standard moment of n-th power (requires argument), |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
=item * A(n) - abs_moment - absolute moment of n-th power (requires argument). |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
=back |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
For example, |
1303
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
$stat->format( "99%% results lie between %p(0.5) and %p(99.5)" ); |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
Or |
1307
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
for( my $i = 0; $i < @stats; $i++ ) { |
1309
|
|
|
|
|
|
|
print $stats[$i]->format( "%s-th average value is %a +- %d", $i ); |
1310
|
|
|
|
|
|
|
}; |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
=cut |
1313
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
my %format = ( |
1315
|
|
|
|
|
|
|
# percent literal |
1316
|
|
|
|
|
|
|
'%' => '%', |
1317
|
|
|
|
|
|
|
# placeholders without parameters |
1318
|
|
|
|
|
|
|
n => 'count', |
1319
|
|
|
|
|
|
|
m => 'min', |
1320
|
|
|
|
|
|
|
M => 'max', |
1321
|
|
|
|
|
|
|
a => 'mean', |
1322
|
|
|
|
|
|
|
d => 'std_dev', |
1323
|
|
|
|
|
|
|
S => 'skewness', |
1324
|
|
|
|
|
|
|
K => 'kurtosis', |
1325
|
|
|
|
|
|
|
# placeholders with 1 parameter |
1326
|
|
|
|
|
|
|
q => 'quantile?', |
1327
|
|
|
|
|
|
|
p => 'percentile?', |
1328
|
|
|
|
|
|
|
P => 'cdf?', |
1329
|
|
|
|
|
|
|
e => 'central_moment?', |
1330
|
|
|
|
|
|
|
E => 'std_moment?', |
1331
|
|
|
|
|
|
|
A => 'abs_moment?', |
1332
|
|
|
|
|
|
|
); |
1333
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
my %printf = ( |
1335
|
|
|
|
|
|
|
s => 1, |
1336
|
|
|
|
|
|
|
f => 1, |
1337
|
|
|
|
|
|
|
g => 1, |
1338
|
|
|
|
|
|
|
); |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
my $re_format = join "|", keys %format, keys %printf; |
1341
|
|
|
|
|
|
|
$re_format = qr((?:$re_format)); |
1342
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
sub format { |
1344
|
5
|
|
|
5
|
1
|
18
|
my ($self, $format, @extra) = @_; |
1345
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
# FIXME this accepts %m(5), then dies - UGLY |
1347
|
|
|
|
|
|
|
# TODO rewrite this as a giant sprintf... one day... |
1348
|
5
|
|
|
|
|
114
|
$format =~ s <%([0-9.\-+ #]*)($re_format)(?:\(($re_num)?\)){0,1}> |
|
7
|
|
|
|
|
21
|
|
1349
|
|
|
|
|
|
|
< _format_dispatch($self, $2, $1, $3, \@extra) >ge; |
1350
|
5
|
50
|
|
|
|
17
|
|
1351
|
|
|
|
|
|
|
croak __PACKAGE__.": Extra arguments in format()" |
1352
|
5
|
|
|
|
|
35
|
if @extra; |
1353
|
|
|
|
|
|
|
return $format; |
1354
|
|
|
|
|
|
|
}; |
1355
|
|
|
|
|
|
|
|
1356
|
7
|
|
|
7
|
|
30
|
sub _format_dispatch { |
1357
|
|
|
|
|
|
|
my ($obj, $method, $float, $arg, $extra) = @_; |
1358
|
|
|
|
|
|
|
|
1359
|
7
|
100
|
|
|
|
24
|
# Handle % escapes |
1360
|
1
|
|
|
|
|
4
|
if ($method !~ /^[a-zA-Z]/) { |
1361
|
|
|
|
|
|
|
return $method; |
1362
|
|
|
|
|
|
|
}; |
1363
|
6
|
100
|
|
|
|
19
|
# Handle printf built-in formats |
1364
|
1
|
50
|
|
|
|
3
|
if (!$format{$method}) { |
1365
|
|
|
|
|
|
|
croak __PACKAGE__.": Not enough arguments in format()" |
1366
|
1
|
|
|
|
|
9
|
unless @$extra; |
1367
|
|
|
|
|
|
|
return sprintf "%${float}${method}", shift @$extra; |
1368
|
|
|
|
|
|
|
}; |
1369
|
|
|
|
|
|
|
|
1370
|
5
|
|
|
|
|
10
|
# Now we know it's LogScale's own method |
1371
|
5
|
100
|
|
|
|
18
|
$method = $format{$method}; |
1372
|
2
|
50
|
|
|
|
9
|
if ($method =~ s/\?$//) { |
1373
|
|
|
|
|
|
|
die "Missing argument in method $method" if !defined $arg; |
1374
|
3
|
50
|
|
|
|
7
|
} else { |
1375
|
|
|
|
|
|
|
die "Extra argument in method $method" if defined $arg; |
1376
|
5
|
|
|
|
|
21
|
}; |
1377
|
|
|
|
|
|
|
my $result = $obj->$method($arg); |
1378
|
|
|
|
|
|
|
|
1379
|
5
|
100
|
100
|
|
|
21
|
# work around S::D::Full's convention that "-inf == undef" |
1380
|
|
|
|
|
|
|
$result = -9**9**9 |
1381
|
5
|
|
|
|
|
42
|
if ($method eq 'percentile' and !defined $result); |
1382
|
|
|
|
|
|
|
return sprintf "%${float}f", $result; |
1383
|
|
|
|
|
|
|
}; |
1384
|
|
|
|
|
|
|
|
1385
|
|
|
|
|
|
|
################################################################ |
1386
|
|
|
|
|
|
|
# No more public methods please |
1387
|
|
|
|
|
|
|
|
1388
|
|
|
|
|
|
|
# MEMOIZE |
1389
|
|
|
|
|
|
|
# We'll keep methods' returned values under {cache}. |
1390
|
|
|
|
|
|
|
# All setters destroy said cache altogether. |
1391
|
|
|
|
|
|
|
# PLEASE replace this with a ready-made module if there's one. |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
# Sorry for this black magic, but it's too hard to write //= in EVERY method |
1394
|
|
|
|
|
|
|
# Long story short |
1395
|
|
|
|
|
|
|
# The next sub replaces $self->foo() with |
1396
|
|
|
|
|
|
|
# sub { $self->{cache}{foo} //= $self->originnal_foo } |
1397
|
|
|
|
|
|
|
# All setter methods are EXPECTED to destroy {cache} altogether. |
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
# NOTE if you plan subclassing the method, re-memoize methods you change. |
1400
|
300
|
|
|
300
|
|
636
|
sub _memoize_method { |
1401
|
|
|
|
|
|
|
my ($class, $name, $arg) = @_; |
1402
|
300
|
|
|
|
|
1083
|
|
1403
|
300
|
50
|
|
|
|
781
|
my $orig_code = $class->can($name); |
1404
|
|
|
|
|
|
|
die "Error in memoizer section ($name)" |
1405
|
|
|
|
|
|
|
unless ref $orig_code eq 'CODE'; |
1406
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
# begin long conditional |
1408
|
|
|
|
|
|
|
my $cached_code = !$arg |
1409
|
253
|
100
|
|
253
|
|
8570
|
? sub { |
1410
|
111
|
|
|
|
|
323
|
if (!exists $_[0]->{cache}{$name}) { |
1411
|
|
|
|
|
|
|
$_[0]->{cache}{$name} = $orig_code->($_[0]); |
1412
|
253
|
|
|
|
|
1142
|
}; |
1413
|
|
|
|
|
|
|
return $_[0]->{cache}{$name}; |
1414
|
|
|
|
|
|
|
} |
1415
|
242
|
|
|
242
|
|
2660
|
: sub { |
1416
|
242
|
|
|
|
|
420
|
my $self = shift; |
1417
|
25
|
|
|
25
|
|
240
|
my $arg = do { |
|
25
|
|
|
|
|
63
|
|
|
25
|
|
|
|
|
3424
|
|
1418
|
242
|
|
|
|
|
697
|
no warnings 'uninitialized'; ## no critic |
1419
|
|
|
|
|
|
|
join ':', @_; |
1420
|
|
|
|
|
|
|
}; |
1421
|
242
|
100
|
|
|
|
852
|
|
1422
|
143
|
|
|
|
|
411
|
if (!exists $self->{cache}{"$name:$arg"}) { |
1423
|
|
|
|
|
|
|
$self->{cache}{"$name:$arg"} = $orig_code->($self, @_); |
1424
|
241
|
|
|
|
|
1390
|
}; |
1425
|
300
|
100
|
|
|
|
1250
|
return $self->{cache}{"$name:$arg"}; |
1426
|
|
|
|
|
|
|
}; |
1427
|
|
|
|
|
|
|
# conditional ends here |
1428
|
25
|
|
|
25
|
|
181
|
|
|
25
|
|
|
|
|
58
|
|
|
25
|
|
|
|
|
902
|
|
1429
|
25
|
|
|
25
|
|
179
|
no strict 'refs'; ## no critic |
|
25
|
|
|
|
|
69
|
|
|
25
|
|
|
|
|
2587
|
|
1430
|
300
|
|
|
|
|
519
|
no warnings 'redefine'; ## no critic |
|
300
|
|
|
|
|
1137
|
|
1431
|
|
|
|
|
|
|
*{$class."::".$name} = $cached_code; |
1432
|
|
|
|
|
|
|
}; # end of _memoize_method |
1433
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
# Memoize all the methods w/o arguments |
1435
|
|
|
|
|
|
|
foreach ( qw(sum sumsq mean min max mode) ) { |
1436
|
|
|
|
|
|
|
__PACKAGE__->_memoize_method($_); |
1437
|
|
|
|
|
|
|
}; |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
# Memoize methods with 1 argument |
1440
|
|
|
|
|
|
|
foreach (qw(quantile central_moment std_moment abs_moment standard_deviation |
1441
|
|
|
|
|
|
|
variance)) { |
1442
|
|
|
|
|
|
|
__PACKAGE__->_memoize_method($_, 1); |
1443
|
|
|
|
|
|
|
}; |
1444
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
# add shorter alias of standard_deviation (this must happen AFTER memoization) |
1446
|
25
|
|
|
25
|
|
177
|
{ |
|
25
|
|
|
|
|
69
|
|
|
25
|
|
|
|
|
8698
|
|
1447
|
|
|
|
|
|
|
no warnings 'once'; ## no critic |
1448
|
|
|
|
|
|
|
*std_dev = \&standard_deviation; |
1449
|
|
|
|
|
|
|
*stdev = \&standard_deviation; |
1450
|
|
|
|
|
|
|
}; |
1451
|
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
# Get number of values below $x |
1453
|
|
|
|
|
|
|
# Like sum_of(sub{1}, undef, $x), but faster. |
1454
|
|
|
|
|
|
|
# Used by cdf() |
1455
|
105
|
|
|
105
|
|
144
|
sub _count { |
1456
|
105
|
100
|
|
|
|
214
|
my $self = shift; |
1457
|
74
|
|
|
|
|
95
|
@_>1 and return $self->_count($_[1]) - $self->_count($_[0]); |
1458
|
|
|
|
|
|
|
my $x = shift; |
1459
|
74
|
|
|
|
|
130
|
|
1460
|
74
|
|
|
|
|
137
|
my $upper = $self->_upper($x); |
1461
|
74
|
100
|
|
|
|
148
|
my $i = _bin_search_gt( $self->_sort, $upper ); |
1462
|
65
|
|
|
|
|
110
|
!$i-- and return 0; |
1463
|
|
|
|
|
|
|
my $count = $self->_probability->[$i]; |
1464
|
|
|
|
|
|
|
|
1465
|
65
|
|
|
|
|
115
|
# interpolate |
1466
|
65
|
100
|
|
|
|
273
|
my $bin = $self->_round( $x ); |
1467
|
34
|
|
|
|
|
57
|
if (my $val = $self->{data}{$bin}) { |
1468
|
34
|
100
|
|
|
|
71
|
my $width = ($upper - $bin) * 2; |
1469
|
34
|
|
|
|
|
50
|
my $part = $width ? ( ($upper - $x) / $width) : 1/2; |
1470
|
|
|
|
|
|
|
$count -= $part * $val; |
1471
|
65
|
|
|
|
|
178
|
}; |
1472
|
|
|
|
|
|
|
return $count; |
1473
|
|
|
|
|
|
|
}; |
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
# BINARY SEARCH |
1476
|
|
|
|
|
|
|
# Not a method, just a function |
1477
|
|
|
|
|
|
|
# Takes sorted \@array and a $num |
1478
|
|
|
|
|
|
|
# Return lowest $i such that $array[$i] >= $num |
1479
|
|
|
|
|
|
|
# Return (scalar @array) if no such $i exists |
1480
|
147
|
|
|
147
|
|
865
|
sub _bin_search_ge { |
1481
|
|
|
|
|
|
|
my ($array, $x) = @_; |
1482
|
147
|
100
|
100
|
|
|
640
|
|
1483
|
126
|
|
|
|
|
226
|
return 0 unless @$array and $array->[0] < $x; |
1484
|
126
|
|
|
|
|
200
|
my $l = 0; |
1485
|
126
|
|
|
|
|
286
|
my $r = @$array; |
1486
|
405
|
|
|
|
|
774
|
while ($l+1 < $r) { |
1487
|
405
|
100
|
|
|
|
1045
|
my $m = int( ($l + $r) /2); |
1488
|
|
|
|
|
|
|
$array->[$m] < $x ? $l = $m : $r = $m; |
1489
|
126
|
|
|
|
|
280
|
}; |
1490
|
|
|
|
|
|
|
return $l+1; |
1491
|
|
|
|
|
|
|
}; |
1492
|
74
|
|
|
74
|
|
130
|
sub _bin_search_gt { |
1493
|
74
|
|
|
|
|
112
|
my ($array, $x) = @_; |
1494
|
74
|
100
|
100
|
|
|
216
|
my $i = _bin_search_ge(@_); |
1495
|
74
|
|
|
|
|
108
|
$i++ if defined $array->[$i] and $array->[$i] == $x; |
1496
|
|
|
|
|
|
|
return $i; |
1497
|
|
|
|
|
|
|
}; |
1498
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
# THE CORE |
1500
|
|
|
|
|
|
|
# Here come the number=>bin functions |
1501
|
|
|
|
|
|
|
# round() generates bin center |
1502
|
|
|
|
|
|
|
# upper() and lower() are respective boundaries. |
1503
|
|
|
|
|
|
|
# Here's the algorithm: |
1504
|
|
|
|
|
|
|
# 1) determine whether bin is linear or logarithmic |
1505
|
|
|
|
|
|
|
# 2) for linear bins, return bin# * bucket_width (==2*absolute error) |
1506
|
|
|
|
|
|
|
# add/subtract 0.5 to get edges. |
1507
|
|
|
|
|
|
|
# 3) for logarithmic bins, return base ** bin# with appropriate sign |
1508
|
|
|
|
|
|
|
# multiply by precalculated constant to get edges |
1509
|
|
|
|
|
|
|
# note that +0.5 doesn't work here, since sqrt(a*b) != (a+b)/2 |
1510
|
|
|
|
|
|
|
# This part is fragile and can be written better |
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
# center of bin containing x |
1513
|
90924
|
|
|
90924
|
|
121697
|
sub _round { |
1514
|
90924
|
|
|
|
|
114147
|
my $self = shift; |
1515
|
|
|
|
|
|
|
my $x = shift; |
1516
|
25
|
|
|
25
|
|
189
|
|
|
25
|
|
|
|
|
73
|
|
|
25
|
|
|
|
|
3763
|
|
1517
|
|
|
|
|
|
|
use warnings FATAL => qw(numeric uninitialized); |
1518
|
90924
|
100
|
|
|
|
168333
|
|
1519
|
|
|
|
|
|
|
if (abs($x) <= $self->{linear_thresh}) { |
1520
|
1792
|
|
100
|
|
|
5680
|
return $self->{linear_width} |
1521
|
|
|
|
|
|
|
&& $self->{linear_width} * floor( $x / $self->{linear_width} + 0.5 ); |
1522
|
89129
|
|
|
|
|
178971
|
}; |
1523
|
89129
|
|
|
|
|
149352
|
my $i = floor (((log abs $x) - $self->{logfloor})/ $self->{logbase}); |
1524
|
89129
|
100
|
|
|
|
316496
|
my $value = $self->{base} ** $i; |
1525
|
|
|
|
|
|
|
return $x < 0 ? -$value : $value; |
1526
|
|
|
|
|
|
|
}; |
1527
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
# lower, upper limits of bin containing x |
1529
|
887
|
|
|
887
|
|
52542
|
sub _lower { |
1530
|
887
|
|
|
|
|
1353
|
my $self = shift; |
1531
|
|
|
|
|
|
|
my $x = shift; |
1532
|
25
|
|
|
25
|
|
199
|
|
|
25
|
|
|
|
|
65
|
|
|
25
|
|
|
|
|
8028
|
|
1533
|
|
|
|
|
|
|
use warnings FATAL => qw(numeric uninitialized); |
1534
|
887
|
100
|
|
|
|
2322
|
|
1535
|
|
|
|
|
|
|
if (abs($x) <= $self->{linear_thresh}) { |
1536
|
188
|
|
66
|
|
|
1784
|
return $self->{linear_width} |
1537
|
|
|
|
|
|
|
&& $self->{linear_width} * (floor( $x / $self->{linear_width} + 0.5) - 0.5); |
1538
|
699
|
|
|
|
|
2178
|
}; |
1539
|
699
|
100
|
|
|
|
1490
|
my $i = floor (((log abs $x) - $self->{logfloor} )/ $self->{logbase}); |
1540
|
310
|
|
|
|
|
1294
|
if ($x > 0) { |
1541
|
|
|
|
|
|
|
return $self->{floor} * $self->{base}**($i); |
1542
|
389
|
|
|
|
|
1624
|
} else { |
1543
|
|
|
|
|
|
|
return -$self->{floor} * $self->{base}**($i+1); |
1544
|
|
|
|
|
|
|
}; |
1545
|
|
|
|
|
|
|
}; |
1546
|
|
|
|
|
|
|
|
1547
|
462
|
|
|
462
|
|
5227
|
sub _upper { |
1548
|
|
|
|
|
|
|
return -$_[0]->_lower(-$_[1]); |
1549
|
|
|
|
|
|
|
}; |
1550
|
|
|
|
|
|
|
|
1551
|
|
|
|
|
|
|
# build bin index |
1552
|
191
|
|
|
191
|
|
308
|
sub _sort { |
1553
|
|
|
|
|
|
|
my $self = shift; |
1554
|
191
|
|
100
|
|
|
758
|
return $self->{cache}{sorted} |
|
1234
|
|
|
|
|
2257
|
|
|
32
|
|
|
|
|
292
|
|
1555
|
|
|
|
|
|
|
||= [ sort { $a <=> $b } keys %{ $self->{data} } ]; |
1556
|
|
|
|
|
|
|
}; |
1557
|
|
|
|
|
|
|
|
1558
|
|
|
|
|
|
|
# build cumulative bin counts index |
1559
|
107
|
|
|
107
|
|
170
|
sub _probability { |
1560
|
107
|
|
66
|
|
|
330
|
my $self = shift; |
1561
|
20
|
|
|
|
|
54
|
return $self->{cache}{probability} ||= do { |
1562
|
20
|
|
|
|
|
41
|
my @array; |
1563
|
20
|
|
|
|
|
43
|
my $sum = 0; |
|
20
|
|
|
|
|
89
|
|
1564
|
165
|
|
|
|
|
324
|
foreach (@{ $self->_sort }) { |
1565
|
165
|
|
|
|
|
391
|
$sum += $self->{data}{$_}; |
1566
|
|
|
|
|
|
|
push @array, $sum; |
1567
|
20
|
|
|
|
|
115
|
}; |
1568
|
|
|
|
|
|
|
\@array; |
1569
|
|
|
|
|
|
|
}; |
1570
|
|
|
|
|
|
|
}; |
1571
|
|
|
|
|
|
|
|
1572
|
|
|
|
|
|
|
=head1 AUTHOR |
1573
|
|
|
|
|
|
|
|
1574
|
|
|
|
|
|
|
Konstantin S. Uvarin, C<< >> |
1575
|
|
|
|
|
|
|
|
1576
|
|
|
|
|
|
|
=head1 BUGS |
1577
|
|
|
|
|
|
|
|
1578
|
|
|
|
|
|
|
The module is currently under development. There may be bugs. |
1579
|
|
|
|
|
|
|
|
1580
|
|
|
|
|
|
|
C only works for discrete distributions, and simply returns |
1581
|
|
|
|
|
|
|
the first bin with largest bin count. |
1582
|
|
|
|
|
|
|
A better algorithm is wanted. |
1583
|
|
|
|
|
|
|
|
1584
|
|
|
|
|
|
|
C should have been made a private method. |
1585
|
|
|
|
|
|
|
Its signature and/or name may change in the future. |
1586
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
See the TODO file in the distribution package. |
1588
|
|
|
|
|
|
|
|
1589
|
|
|
|
|
|
|
Please feel free to post bugs and/or feature requests to github: |
1590
|
|
|
|
|
|
|
L |
1591
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
Alternatively, you can use CPAN RT |
1593
|
|
|
|
|
|
|
via e-mail C, |
1594
|
|
|
|
|
|
|
or through the web interface at |
1595
|
|
|
|
|
|
|
L. |
1596
|
|
|
|
|
|
|
|
1597
|
|
|
|
|
|
|
Your contribution is appreciated. |
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
=head1 SUPPORT |
1600
|
|
|
|
|
|
|
|
1601
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
1602
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
perldoc Statistics::Descriptive::LogScale |
1604
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
You can also look for information at: |
1606
|
|
|
|
|
|
|
|
1607
|
|
|
|
|
|
|
=over 4 |
1608
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
=item * GitHub: |
1610
|
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
|
L |
1612
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker (report bugs here) |
1614
|
|
|
|
|
|
|
|
1615
|
|
|
|
|
|
|
L |
1616
|
|
|
|
|
|
|
|
1617
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
1618
|
|
|
|
|
|
|
|
1619
|
|
|
|
|
|
|
L |
1620
|
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
|
=item * CPAN Ratings |
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
L |
1624
|
|
|
|
|
|
|
|
1625
|
|
|
|
|
|
|
=item * Search CPAN |
1626
|
|
|
|
|
|
|
|
1627
|
|
|
|
|
|
|
L |
1628
|
|
|
|
|
|
|
|
1629
|
|
|
|
|
|
|
=back |
1630
|
|
|
|
|
|
|
|
1631
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
1632
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
This module was inspired by a talk that Andrew Aksyonoff, author of |
1634
|
|
|
|
|
|
|
L, |
1635
|
|
|
|
|
|
|
has given at HighLoad++ conference in Moscow, 2012. |
1636
|
|
|
|
|
|
|
|
1637
|
|
|
|
|
|
|
L was and is used as reference when in doubt. |
1638
|
|
|
|
|
|
|
Several code snippets were shamelessly stolen from there. |
1639
|
|
|
|
|
|
|
|
1640
|
|
|
|
|
|
|
C and C parameter names were suggested by |
1641
|
|
|
|
|
|
|
CountZero from http://perlmonks.org |
1642
|
|
|
|
|
|
|
|
1643
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
1644
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
Copyright 2013-2015 Konstantin S. Uvarin. |
1646
|
|
|
|
|
|
|
|
1647
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
1648
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
1649
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
1650
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
1652
|
|
|
|
|
|
|
|
1653
|
|
|
|
|
|
|
=cut |
1654
|
|
|
|
|
|
|
|
1655
|
|
|
|
|
|
|
1; # End of Statistics::Descriptive::LogScale |