line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Statistics::Discrete |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Chiara Orsini, CAIDA, UC San Diego |
6
|
|
|
|
|
|
|
# chiara@caida.org |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright (C) 2014 The Regents of the University of California. |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# Statistics::Discrete is free software: you can redistribute it and/or modify |
11
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
12
|
|
|
|
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
13
|
|
|
|
|
|
|
# (at your option) any later version. |
14
|
|
|
|
|
|
|
# |
15
|
|
|
|
|
|
|
# Statistics::Discrete is distributed in the hope that it will be useful, |
16
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
17
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
18
|
|
|
|
|
|
|
# GNU General Public License for more details. |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
21
|
|
|
|
|
|
|
# along with Statistics::Discrete. If not, see . |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
package Statistics::Discrete; |
25
|
|
|
|
|
|
|
|
26
|
1
|
|
|
1
|
|
25627
|
use 5.012; |
|
1
|
|
|
|
|
5
|
|
27
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
24
|
|
28
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
33
|
|
29
|
1
|
|
|
1
|
|
13
|
use List::Util; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
71
|
|
30
|
1
|
|
|
1
|
|
1047
|
use POSIX; |
|
1
|
|
|
|
|
8864
|
|
|
1
|
|
|
|
|
7
|
|
31
|
1
|
|
|
1
|
|
6142
|
use Storable 'dclone'; |
|
1
|
|
|
|
|
5043
|
|
|
1
|
|
|
|
|
110
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
our $VERSION = '0.05.00'; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# require Exporter; |
36
|
|
|
|
|
|
|
# our @EXPORT_OK = ( 'NO_BINNING', 'LIN_BINNING', 'LOG_BINNING'); |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
use constant { |
39
|
1
|
|
|
|
|
105
|
NO_BINNING => 0, |
40
|
|
|
|
|
|
|
LIN_BINNING => 1, |
41
|
|
|
|
|
|
|
LOG_BINNING => 2, |
42
|
1
|
|
|
1
|
|
8
|
}; |
|
1
|
|
|
|
|
2
|
|
43
|
|
|
|
|
|
|
|
44
|
1
|
|
|
1
|
|
5
|
use constant LOG_BASE => 10; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
45
|
|
45
|
1
|
|
|
1
|
|
5
|
use constant DEFAULT_BINS_NUMBER => 10; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
5027
|
|
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Constructor |
49
|
|
|
|
|
|
|
sub new { |
50
|
1
|
|
|
1
|
0
|
469
|
my $class = shift; |
51
|
1
|
|
|
|
|
2
|
my $self = {}; |
52
|
1
|
|
|
|
|
3
|
$self->{"data_frequency"} = {}; # anonymous hash constructor |
53
|
|
|
|
|
|
|
# binning default values |
54
|
1
|
|
|
|
|
3
|
$self->{"bin-type"} = NO_BINNING; |
55
|
1
|
|
|
|
|
3
|
$self->{"optimal-binning"} = 1; |
56
|
|
|
|
|
|
|
# $self->{"bins"} - bins' structure |
57
|
|
|
|
|
|
|
# $self->{"num-bins"} - num bins when optimal binning is off |
58
|
|
|
|
|
|
|
# on-demand stored stats |
59
|
|
|
|
|
|
|
# $self->{"stats"}{"Desc"} - descriptive statistics |
60
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"} - distributions |
61
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"} - binned distributions |
62
|
|
|
|
|
|
|
# bind self and class |
63
|
1
|
|
|
|
|
2
|
bless $self, $class; |
64
|
1
|
|
|
|
|
4
|
return $self; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
################## Methods to load/add data ################## |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
sub add_data { |
70
|
1
|
|
|
1
|
0
|
401
|
my $self = shift; |
71
|
1
|
|
|
|
|
3
|
my @data_to_add = @_; |
72
|
1
|
|
|
|
|
2
|
my $data; |
73
|
1
|
|
|
|
|
2
|
foreach $data(@data_to_add) { |
74
|
10
|
100
|
|
|
|
28
|
if(defined($self->{"data_frequency"}{$data})) { |
75
|
4
|
|
|
|
|
9
|
$self->{"data_frequency"}{$data}++; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
else { |
78
|
6
|
|
|
|
|
17
|
$self->{"data_frequency"}{$data} = 1 ; |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
# data has changed - stats and bins need to be computed again |
82
|
1
|
|
|
|
|
2
|
delete $self->{"stats"}; |
83
|
1
|
|
|
|
|
3
|
delete $self->{"bins"}; |
84
|
1
|
|
|
|
|
3
|
return; |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
sub add_data_from_file { |
88
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
89
|
0
|
|
|
|
|
0
|
my $filename = shift; |
90
|
0
|
|
|
|
|
0
|
my @cols; |
91
|
|
|
|
|
|
|
my $f; |
92
|
0
|
|
|
|
|
0
|
my $line; |
93
|
0
|
0
|
|
|
|
0
|
open($f, "<", $filename) or die "Cannot open " .$filename . ": $!"; |
94
|
0
|
|
|
|
|
0
|
while($line = <$f>) { |
95
|
0
|
|
|
|
|
0
|
chomp($line); |
96
|
0
|
0
|
|
|
|
0
|
if($line =~ /^\s*#/) { |
97
|
0
|
|
|
|
|
0
|
next; |
98
|
|
|
|
|
|
|
} |
99
|
0
|
|
|
|
|
0
|
@cols = split /\s/ , $line; |
100
|
0
|
0
|
|
|
|
0
|
if(scalar @cols == 1) { # assume list of values |
101
|
0
|
0
|
|
|
|
0
|
if(defined($self->{"data_frequency"}{$cols[0]})) { |
102
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]}++; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
else { |
105
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]} = 1 ; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
} |
108
|
0
|
0
|
|
|
|
0
|
if(scalar @cols == 2) { # assume list of id - value pairs |
109
|
0
|
0
|
|
|
|
0
|
if(defined($self->{"data_frequency"}{$cols[1]})) { |
110
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[1]}++; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
else { |
113
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[1]} = 1 ; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
# File format not recognized |
117
|
0
|
0
|
0
|
|
|
0
|
if(scalar @cols != 1 and scalar @cols != 2){ |
118
|
0
|
|
|
|
|
0
|
print "File format not recognized\n"; |
119
|
0
|
|
|
|
|
0
|
close($f); |
120
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
121
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
122
|
0
|
|
|
|
|
0
|
return; |
123
|
|
|
|
|
|
|
} |
124
|
|
|
|
|
|
|
} |
125
|
0
|
|
|
|
|
0
|
close($f); |
126
|
|
|
|
|
|
|
# data has changed - stats and bins need to be computed again |
127
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
128
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
129
|
0
|
|
|
|
|
0
|
return; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub add_data_from_frequencyfile { |
133
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
134
|
0
|
|
|
|
|
0
|
my $filename = shift; |
135
|
0
|
|
|
|
|
0
|
my @cols; |
136
|
|
|
|
|
|
|
my $f; |
137
|
0
|
|
|
|
|
0
|
my $line; |
138
|
0
|
0
|
|
|
|
0
|
open($f, "<", $filename) or die "Cannot open " .$filename . ": $!"; |
139
|
0
|
|
|
|
|
0
|
while($line = <$f>) { |
140
|
0
|
|
|
|
|
0
|
chomp($line); |
141
|
0
|
0
|
|
|
|
0
|
if($line =~ /^\s*#/) { |
142
|
0
|
|
|
|
|
0
|
next; |
143
|
|
|
|
|
|
|
} |
144
|
0
|
|
|
|
|
0
|
@cols = split /\s+/ , $line; |
145
|
|
|
|
|
|
|
#DEBUG print $line . "\t" . @cols . "\n"; |
146
|
0
|
0
|
|
|
|
0
|
if(scalar @cols == 2) { # assume list of value - count |
147
|
0
|
0
|
|
|
|
0
|
if(defined($self->{"data_frequency"}{$cols[0]})) { |
148
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]} += $cols[1]; |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
else { |
151
|
0
|
|
|
|
|
0
|
$self->{"data_frequency"}{$cols[0]} = $cols[1]; |
152
|
|
|
|
|
|
|
} |
153
|
0
|
|
|
|
|
0
|
next; |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
# File format not recognized |
156
|
0
|
0
|
|
|
|
0
|
if(scalar @cols != 2){ |
157
|
0
|
|
|
|
|
0
|
print "File format not recognized\n"; |
158
|
0
|
|
|
|
|
0
|
close($f); |
159
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
160
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
161
|
0
|
|
|
|
|
0
|
return; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
} |
164
|
0
|
|
|
|
|
0
|
close($f); |
165
|
|
|
|
|
|
|
# data has changed - stats and bins need to be computed again |
166
|
0
|
|
|
|
|
0
|
delete $self->{"stats"}; |
167
|
0
|
|
|
|
|
0
|
delete $self->{"bins"}; |
168
|
0
|
|
|
|
|
0
|
return; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
################## Descriptive Statistics ################## |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Minimum value |
175
|
|
|
|
|
|
|
# compute the minimum value, save the statistic, and return it |
176
|
|
|
|
|
|
|
sub minimum { |
177
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
178
|
1
|
50
|
|
|
|
5
|
if(!defined($self->{"stats"}{"Desc"}{"min"})) { |
179
|
1
|
|
|
|
|
25
|
my $count = $self->count(); |
180
|
1
|
50
|
|
|
|
9
|
if($count > 0) { |
181
|
1
|
|
|
|
|
2
|
$self->{"stats"}{"Desc"}{"min"} = List::Util::min(keys %{$self->{"data_frequency"}}); |
|
1
|
|
|
|
|
15
|
|
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
else { |
184
|
0
|
|
|
|
|
0
|
$self->{"stats"}{"Desc"}{"min"} = 0; |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
} |
187
|
1
|
|
|
|
|
5
|
return $self->{"stats"}{"Desc"}{"min"}; |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# Maximum value |
192
|
|
|
|
|
|
|
# compute the minimum value, save the statistic, and return it |
193
|
|
|
|
|
|
|
sub maximum { |
194
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
195
|
1
|
50
|
|
|
|
5
|
if(!defined($self->{"stats"}{"Desc"}{"max"})) { |
196
|
1
|
|
|
|
|
3
|
my $count = $self->count(); |
197
|
1
|
50
|
|
|
|
3
|
if($count > 0) { |
198
|
1
|
|
|
|
|
2
|
$self->{"stats"}{"Desc"}{"max"} = List::Util::max(keys %{$self->{"data_frequency"}}); |
|
1
|
|
|
|
|
8
|
|
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
else { |
201
|
0
|
|
|
|
|
0
|
$self->{"stats"}{"Desc"}{"max"} = 0; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
} |
204
|
1
|
|
|
|
|
5
|
return $self->{"stats"}{"Desc"}{"max"}; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# count the number of samples provided |
209
|
|
|
|
|
|
|
# save the statistic, and return it |
210
|
|
|
|
|
|
|
sub count { |
211
|
6
|
|
|
6
|
1
|
15
|
my $self = shift; |
212
|
6
|
100
|
|
|
|
19
|
if(!defined($self->{"stats"}{"Desc"}{"count"})) { |
213
|
1
|
|
|
|
|
2
|
my $count = 0; |
214
|
1
|
|
|
|
|
1
|
my $data; |
215
|
1
|
|
|
|
|
2
|
foreach $data(keys %{$self->{"data_frequency"}}) { |
|
1
|
|
|
|
|
5
|
|
216
|
6
|
|
|
|
|
11
|
$count += $self->{"data_frequency"}{$data}; |
217
|
|
|
|
|
|
|
} |
218
|
1
|
|
|
|
|
4
|
$self->{"stats"}{"Desc"}{"count"} = $count; |
219
|
|
|
|
|
|
|
} |
220
|
6
|
|
|
|
|
15
|
return $self->{"stats"}{"Desc"}{"count"}; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Expected_value |
225
|
|
|
|
|
|
|
# the weighted average of the possible values, |
226
|
|
|
|
|
|
|
# using their probabilities as their weights |
227
|
|
|
|
|
|
|
sub mean { |
228
|
2
|
|
|
2
|
1
|
5
|
my $self = shift; |
229
|
2
|
100
|
|
|
|
7
|
if(!defined($self->{"stats"}{"Desc"}{"mean"})) { |
230
|
1
|
|
|
|
|
2
|
my $cumul_value = 0; |
231
|
1
|
|
|
|
|
4
|
my $count = $self->count(); |
232
|
1
|
|
|
|
|
2
|
my $v; |
233
|
1
|
|
|
|
|
2
|
foreach $v( keys %{$self->{"data_frequency"}}) { |
|
1
|
|
|
|
|
4
|
|
234
|
6
|
|
|
|
|
14
|
$cumul_value += $v * $self->{"data_frequency"}{$v}; |
235
|
|
|
|
|
|
|
} |
236
|
1
|
50
|
|
|
|
4
|
if($count > 0) { |
237
|
1
|
|
|
|
|
4
|
$self->{"stats"}{"Desc"}{"mean"} = $cumul_value / $count; |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
else { |
240
|
0
|
|
|
|
|
0
|
$self->{"stats"}{"Desc"}{"mean"} = 0; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
} |
243
|
2
|
|
|
|
|
8
|
return $self->{"stats"}{"Desc"}{"mean"}; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Median |
249
|
|
|
|
|
|
|
# the median is the numerical value separating |
250
|
|
|
|
|
|
|
# the higher half of a data sample, a population, |
251
|
|
|
|
|
|
|
# or a probability distribution, from the lower half. |
252
|
|
|
|
|
|
|
sub median { |
253
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
254
|
1
|
50
|
|
|
|
6
|
if(!defined($self->{"stats"}{"Desc"}{"median"})) { |
255
|
1
|
|
|
|
|
3
|
my $count = $self->count(); |
256
|
1
|
|
|
|
|
3
|
my $odd = 0; |
257
|
1
|
50
|
|
|
|
4
|
if($count % 2 != 0) { |
258
|
0
|
|
|
|
|
0
|
$odd = 1; |
259
|
|
|
|
|
|
|
} |
260
|
1
|
|
|
|
|
15
|
my $median_index = floor($count/2) + $odd; |
261
|
1
|
|
|
|
|
2
|
my $current_index = 0; |
262
|
|
|
|
|
|
|
|
263
|
1
|
|
|
|
|
2
|
my ($v,$i); |
264
|
1
|
|
|
|
|
2
|
foreach $v(sort {$a<=>$b} keys %{$self->{"data_frequency"}}) { |
|
10
|
|
|
|
|
17
|
|
|
1
|
|
|
|
|
7
|
|
265
|
6
|
|
|
|
|
18
|
for($i=0; $i < $self->{"data_frequency"}{$v}; $i++) { |
266
|
10
|
|
|
|
|
11
|
$current_index++; |
267
|
10
|
100
|
|
|
|
22
|
if($current_index == $median_index) { |
268
|
1
|
|
|
|
|
3
|
$self->{"stats"}{"Desc"}{"median"} = $v; |
269
|
|
|
|
|
|
|
} |
270
|
10
|
100
|
66
|
|
|
45
|
if($current_index == ($median_index+1) and |
271
|
|
|
|
|
|
|
$odd == 0) { |
272
|
1
|
|
|
|
|
5
|
$self->{"stats"}{"Desc"}{"median"} = ($self->{"stats"}{"Desc"}{"median"} + $v)/2; |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
} |
278
|
1
|
|
|
|
|
5
|
return $self->{"stats"}{"Desc"}{"median"}; |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Percentile |
283
|
|
|
|
|
|
|
# A percentile (or a centile) is a measure |
284
|
|
|
|
|
|
|
# used in statistics indicating the value |
285
|
|
|
|
|
|
|
# below which a given percentage of observations |
286
|
|
|
|
|
|
|
# in a group of observations fall. |
287
|
|
|
|
|
|
|
sub percentile { |
288
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
289
|
0
|
|
|
|
|
0
|
my $perc = shift; |
290
|
0
|
|
|
|
|
0
|
my $max_probability = $perc/100; |
291
|
0
|
|
|
|
|
0
|
my $cdf = $self->empirical_distribution_function(); |
292
|
0
|
|
|
|
|
0
|
my $v = 0; |
293
|
0
|
|
|
|
|
0
|
my $previous_v = 0; |
294
|
0
|
|
|
|
|
0
|
my $epsilon = 0; |
295
|
0
|
|
|
|
|
0
|
foreach $v(sort {$a<=>$b} keys %{$cdf}) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
296
|
0
|
|
|
|
|
0
|
$epsilon = $v - $previous_v; |
297
|
0
|
0
|
|
|
|
0
|
if($cdf->{$v} > $max_probability) { |
298
|
0
|
|
|
|
|
0
|
return $v; |
299
|
|
|
|
|
|
|
} |
300
|
0
|
|
|
|
|
0
|
$previous_v = $v; |
301
|
|
|
|
|
|
|
} |
302
|
|
|
|
|
|
|
# if the program is here then $perc == 1 |
303
|
|
|
|
|
|
|
# then we return $v+epsilon |
304
|
0
|
|
|
|
|
0
|
return $v+$epsilon; |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Variance |
310
|
|
|
|
|
|
|
# The variance of a random variable X is |
311
|
|
|
|
|
|
|
# its second central moment, the expected |
312
|
|
|
|
|
|
|
# value of the squared deviation from the mean |
313
|
|
|
|
|
|
|
sub variance { |
314
|
1
|
|
|
1
|
1
|
2
|
my $self = shift; |
315
|
1
|
50
|
|
|
|
6
|
if(!defined($self->{"stats"}{"Desc"}{"variance"})) { |
316
|
1
|
|
|
|
|
3
|
my $mean = $self->mean(); |
317
|
1
|
|
|
|
|
3
|
my $count = $self->count(); |
318
|
1
|
|
|
|
|
2
|
my $cumul_value = 0; |
319
|
1
|
|
|
|
|
2
|
my $square_mean = 0; |
320
|
1
|
|
|
|
|
7
|
my $v; |
321
|
1
|
|
|
|
|
3
|
foreach $v(keys %{$self->{"data_frequency"}}) { |
|
1
|
|
|
|
|
12
|
|
322
|
6
|
|
|
|
|
16
|
$cumul_value += ($v**2) * $self->{"data_frequency"}{$v}; |
323
|
|
|
|
|
|
|
} |
324
|
1
|
50
|
|
|
|
4
|
if($count > 0) { |
325
|
1
|
|
|
|
|
2
|
$square_mean = $cumul_value / $count; |
326
|
|
|
|
|
|
|
} |
327
|
1
|
|
|
|
|
4
|
$self->{"stats"}{"Desc"}{"variance"} = $square_mean - ($mean**2); |
328
|
|
|
|
|
|
|
} |
329
|
1
|
|
|
|
|
17
|
return $self->{"stats"}{"Desc"}{"variance"}; |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Standard_deviation |
334
|
|
|
|
|
|
|
# The standard deviation of a random variable |
335
|
|
|
|
|
|
|
# is the square root of its variance. |
336
|
|
|
|
|
|
|
sub standard_deviation { |
337
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
338
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Desc"}{"sdev"})) { |
339
|
0
|
|
|
|
|
|
my $variance = $self->variance(); |
340
|
0
|
|
|
|
|
|
$self->{"stats"}{"Desc"}{"sdev"} = sqrt($variance); |
341
|
|
|
|
|
|
|
} |
342
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Desc"}{"sdev"}; |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
################## Distributions Statistics ################## |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Frequency_distribution |
349
|
|
|
|
|
|
|
# the frequency distribution is an arrangement of the values |
350
|
|
|
|
|
|
|
# that one or more variables take in a sample. Each entry in |
351
|
|
|
|
|
|
|
# the table contains the frequency or count of the occurrences |
352
|
|
|
|
|
|
|
# of values within a particular group or interval |
353
|
|
|
|
|
|
|
sub frequency_distribution { |
354
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
355
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
356
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty frequency distribution. \n"; |
357
|
0
|
|
|
|
|
|
return (); # empty hash |
358
|
|
|
|
|
|
|
} |
359
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
360
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"frequency"} |
361
|
0
|
|
|
|
|
|
return $self->_frequency_distribution(); |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
else { |
364
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"frequency"} |
365
|
0
|
|
|
|
|
|
return $self->_binned_frequency_distribution(); |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Probability_mass_function |
371
|
|
|
|
|
|
|
# the probability mass function (pmf) is a function that gives |
372
|
|
|
|
|
|
|
# the probability that a discrete random variable is exactly |
373
|
|
|
|
|
|
|
# equal to some value. |
374
|
|
|
|
|
|
|
sub probability_mass_function { |
375
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
376
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
377
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty probability mass function. \n"; |
378
|
0
|
|
|
|
|
|
return (); # empty hash |
379
|
|
|
|
|
|
|
} |
380
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
381
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"pmf"} |
382
|
0
|
|
|
|
|
|
return $self->_probability_mass_function(); |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
else { |
385
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"pmf"} |
386
|
0
|
|
|
|
|
|
return $self->_binned_probability_mass_function(); |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Empirical_distribution_function |
392
|
|
|
|
|
|
|
# the empirical distribution function, or empirical cdf, is the |
393
|
|
|
|
|
|
|
# cumulative distribution function associated with the empirical |
394
|
|
|
|
|
|
|
# measure of the sample. This cdf is a step function that jumps up |
395
|
|
|
|
|
|
|
# by 1/n at each of the n data points. |
396
|
|
|
|
|
|
|
sub empirical_distribution_function { |
397
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
398
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
399
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty emptirical distribution function. \n"; |
400
|
0
|
|
|
|
|
|
return (); # empty hash |
401
|
|
|
|
|
|
|
} |
402
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
403
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"cdf"} |
404
|
0
|
|
|
|
|
|
return $self->_empirical_distribution_function(); |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
else { |
407
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"cdf"} |
408
|
0
|
|
|
|
|
|
return $self->_binned_empirical_distribution_function(); |
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/CCDF |
414
|
|
|
|
|
|
|
# how often the random variable is above a particular level. |
415
|
|
|
|
|
|
|
sub complementary_cumulative_distribution_function { |
416
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
417
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
418
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty complementary cumulative distribution function. \n"; |
419
|
0
|
|
|
|
|
|
return (); # empty hash |
420
|
|
|
|
|
|
|
} |
421
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
422
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"ccdf"} |
423
|
0
|
|
|
|
|
|
return $self->_complementary_cumulative_distribution_function(); |
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
else { |
426
|
|
|
|
|
|
|
# $self->{"stats"}{"Dist"}{"binned"}{"ccdf"} |
427
|
0
|
|
|
|
|
|
return $self->_binned_complementary_cumulative_distribution_function(); |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
################## Regular Distributions Statistics ################## |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
sub _frequency_distribution { |
437
|
0
|
|
|
0
|
|
|
my $self = shift; |
438
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"frequency"})) { |
439
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"frequency"} = dclone $self->{"data_frequency"}; |
440
|
|
|
|
|
|
|
} |
441
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"frequency"}; |
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
sub _probability_mass_function { |
446
|
0
|
|
|
0
|
|
|
my $self = shift; |
447
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"pmf"})) { |
448
|
0
|
|
|
|
|
|
my $count = $self->count(); |
449
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); |
450
|
0
|
|
|
|
|
|
my ($val,$freq); |
451
|
0
|
|
|
|
|
|
foreach $val(keys %{$self->{"stats"}{"Dist"}{"frequency"}}) { |
|
0
|
|
|
|
|
|
|
452
|
0
|
|
|
|
|
|
$freq = $self->{"stats"}{"Dist"}{"frequency"}{$val}; |
453
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"pmf"}{$val} = $freq / $count; |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
} |
456
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"pmf"}; |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
sub _empirical_distribution_function { |
461
|
0
|
|
|
0
|
|
|
my $self = shift; |
462
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"cdf"})) { |
463
|
0
|
|
|
|
|
|
my $count = $self->count(); |
464
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); |
465
|
0
|
|
|
|
|
|
my $val; |
466
|
0
|
|
|
|
|
|
my $cumul_freq = 0; |
467
|
0
|
|
|
|
|
|
foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
468
|
0
|
|
|
|
|
|
$cumul_freq += $self->{"stats"}{"Dist"}{"frequency"}{$val}; |
469
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"cdf"}{$val} = $cumul_freq / $count; |
470
|
|
|
|
|
|
|
} |
471
|
|
|
|
|
|
|
} |
472
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"cdf"}; |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub _complementary_cumulative_distribution_function { |
477
|
0
|
|
|
0
|
|
|
my $self = shift; |
478
|
|
|
|
|
|
|
# warning, this creates self->{"stats"} if it doesn't exist |
479
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"ccdf"})) { |
480
|
0
|
|
|
|
|
|
my $count = $self->count(); |
481
|
0
|
|
|
|
|
|
$self->frequency_distribution(); |
482
|
0
|
|
|
|
|
|
my $val; |
483
|
0
|
|
|
|
|
|
my $cumul_freq = $count; |
484
|
0
|
|
|
|
|
|
foreach $val( sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
485
|
0
|
|
|
|
|
|
$cumul_freq -= $self->{"stats"}{"Dist"}{"frequency"}{$val}; |
486
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"ccdf"}{$val} = $cumul_freq / $count; |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
} |
489
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"ccdf"}; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
################## Binned Distributions Statistics ################## |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
sub _binned_frequency_distribution { |
498
|
0
|
|
|
0
|
|
|
my $self = shift; |
499
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"frequency"})) { |
500
|
|
|
|
|
|
|
# 1. get bins |
501
|
0
|
|
|
|
|
|
$self->bins(); # set $self->{"bins"} |
502
|
|
|
|
|
|
|
# 2. get regular frequency distribution |
503
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"} |
504
|
|
|
|
|
|
|
# 3. compute binned property |
505
|
0
|
|
|
|
|
|
my $binned_frequency = {}; |
506
|
0
|
|
|
|
|
|
my $i = 0; # vals iterator |
507
|
0
|
|
|
|
|
|
my @sorted_vals = (sort {$a<=>$b} keys %{$self->{"stats"}{"Dist"}{"frequency"}}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
508
|
0
|
|
|
|
|
|
my $num_vals = scalar @sorted_vals; |
509
|
0
|
|
|
|
|
|
my $bi = 0; # bins iterator |
510
|
0
|
|
|
|
|
|
my @sorted_bins = (sort {$a<=>$b} keys %{$self->{"bins"}}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
511
|
0
|
|
|
|
|
|
my $num_bins = scalar @sorted_bins; |
512
|
0
|
|
|
|
|
|
my $freq_sum = 0; |
513
|
0
|
|
|
|
|
|
for(; $bi < $num_bins; $bi++) { # for each bin |
514
|
0
|
|
|
|
|
|
for(; $i < $num_vals; $i++) { |
515
|
0
|
0
|
|
|
|
|
if($sorted_vals[$i] < $self->{"bins"}->{$sorted_bins[$bi]}{"right"}) { |
516
|
0
|
|
|
|
|
|
$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_vals[$i]} |
517
|
|
|
|
|
|
|
} |
518
|
|
|
|
|
|
|
else { |
519
|
0
|
|
|
|
|
|
$binned_frequency->{$sorted_bins[$bi]} = $freq_sum; |
520
|
0
|
|
|
|
|
|
$freq_sum = 0; |
521
|
0
|
|
|
|
|
|
last; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
} |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
# if last element of vals then we save the current freq sum |
526
|
|
|
|
|
|
|
# in the current bin |
527
|
0
|
0
|
|
|
|
|
if($i == $num_vals) { |
528
|
0
|
|
|
|
|
|
$binned_frequency->{$sorted_bins[$bi]} = $freq_sum; |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
} |
531
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"frequency"} = $binned_frequency; |
532
|
|
|
|
|
|
|
} |
533
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"frequency"}; |
534
|
|
|
|
|
|
|
} |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
sub _binned_probability_mass_function { |
538
|
0
|
|
|
0
|
|
|
my $self = shift; |
539
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"pmf"})) { |
540
|
0
|
|
|
|
|
|
my $count = $self->count(); |
541
|
0
|
|
|
|
|
|
my $bins = $self->bins(); # set $self->{"bins"} |
542
|
0
|
|
|
|
|
|
my $bfd = $self->_binned_frequency_distribution(); |
543
|
0
|
|
|
|
|
|
my ($val,$freq, $interval); |
544
|
0
|
|
|
|
|
|
foreach $val(keys %{$bfd}) { |
|
0
|
|
|
|
|
|
|
545
|
0
|
|
|
|
|
|
$freq = $self->{"stats"}{"Dist"}{"binned"}{"frequency"}{$val}; |
546
|
0
|
|
|
|
|
|
$interval = $self->{"bins"}{$val}{"right"} - $self->{"bins"}{$val}{"left"}; |
547
|
0
|
0
|
|
|
|
|
if($interval > 0) { |
548
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = ($freq / $count) / $interval; |
549
|
|
|
|
|
|
|
} |
550
|
|
|
|
|
|
|
else { |
551
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"pmf"}{$val} = $freq / $count; |
552
|
|
|
|
|
|
|
} |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
} |
555
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"pmf"}; |
556
|
|
|
|
|
|
|
} |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
sub _binned_empirical_distribution_function { |
560
|
0
|
|
|
0
|
|
|
my $self = shift; |
561
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"cdf"})) { |
562
|
0
|
|
|
|
|
|
my $bpmf = $self->_binned_probability_mass_function(); |
563
|
0
|
|
|
|
|
|
my $val; |
564
|
0
|
|
|
|
|
|
my $cumul_prob = 0; |
565
|
0
|
|
|
|
|
|
foreach $val(sort {$a<=>$b} keys %{$bpmf}) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
566
|
0
|
|
|
|
|
|
$cumul_prob += $bpmf->{$val}; |
567
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"cdf"}{$val} = $cumul_prob; |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
} |
570
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"cdf"}; |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
sub _binned_complementary_cumulative_distribution_function { |
575
|
0
|
|
|
0
|
|
|
my $self = shift; |
576
|
0
|
0
|
|
|
|
|
if(!defined($self->{"stats"}{"Dist"}{"binned"}{"ccdf"})) { |
577
|
0
|
|
|
|
|
|
my $bpmf = $self->_binned_probability_mass_function(); |
578
|
0
|
|
|
|
|
|
my $val; |
579
|
0
|
|
|
|
|
|
my $cumul_prob = 0; |
580
|
0
|
|
|
|
|
|
my $old_cumul_prob = 0; |
581
|
0
|
|
|
|
|
|
foreach $val(sort {$b<=>$a} keys %{$bpmf}) { # descending sort |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
582
|
0
|
|
|
|
|
|
$old_cumul_prob = $cumul_prob; |
583
|
0
|
|
|
|
|
|
$cumul_prob += $bpmf->{$val}; |
584
|
0
|
|
|
|
|
|
$self->{"stats"}{"Dist"}{"binned"}{"ccdf"}{$val} = $old_cumul_prob; |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
} |
587
|
0
|
|
|
|
|
|
return $self->{"stats"}{"Dist"}{"binned"}{"ccdf"}; |
588
|
|
|
|
|
|
|
} |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
################## Binned Related Functions ################## |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
sub set_binning_type { |
595
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
596
|
0
|
|
|
|
|
|
my $bin_type = shift; |
597
|
|
|
|
|
|
|
# check bin-type validity |
598
|
0
|
0
|
0
|
|
|
|
if($bin_type != NO_BINNING && $bin_type != LIN_BINNING && |
|
|
|
0
|
|
|
|
|
599
|
|
|
|
|
|
|
$bin_type != LOG_BINNING) { |
600
|
0
|
|
|
|
|
|
die "Wrong binning type provided\n"; |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
# if bin type is already set |
603
|
0
|
0
|
|
|
|
|
if($bin_type == $self->{"bin-type"}) { |
604
|
0
|
|
|
|
|
|
return; |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
# if bin-type is different |
607
|
0
|
|
|
|
|
|
$self->{"bin-type"} = $bin_type; |
608
|
0
|
|
|
|
|
|
delete $self->{"num-bins"}; |
609
|
0
|
|
|
|
|
|
delete $self->{"bins"}; |
610
|
0
|
|
|
|
|
|
delete $self->{"stats"}{"Dist"}{"binned"}; |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
sub set_optimal_binning { |
615
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
616
|
0
|
0
|
|
|
|
|
if( $self->{"optimal-binning"} == 0) { |
617
|
0
|
|
|
|
|
|
delete $self->{"bins"}; |
618
|
0
|
|
|
|
|
|
delete $self->{"num-bins"}; |
619
|
0
|
|
|
|
|
|
delete $self->{"stats"}{"Dist"}{"binned"}; |
620
|
|
|
|
|
|
|
} |
621
|
0
|
|
|
|
|
|
$self->{"optimal-binning"} = 1; |
622
|
|
|
|
|
|
|
} |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
sub set_custom_num_bins { |
626
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
627
|
0
|
|
|
|
|
|
my $nb = shift; |
628
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == NO_BINNING) { |
629
|
0
|
|
|
|
|
|
die "Set a binning type before setting the num of bins\n"; |
630
|
|
|
|
|
|
|
} |
631
|
0
|
0
|
|
|
|
|
if( $nb < 2 ) { |
632
|
0
|
|
|
|
|
|
die "Wrong number of bins provided: " . $nb . "\n"; |
633
|
|
|
|
|
|
|
} |
634
|
|
|
|
|
|
|
# check if there is a change |
635
|
0
|
0
|
0
|
|
|
|
if( $self->{"optimal-binning"} == 0 and |
|
|
|
0
|
|
|
|
|
636
|
|
|
|
|
|
|
defined ($self->{"num-bins"}) and |
637
|
|
|
|
|
|
|
$self->{"num-bins"} == $nb) { |
638
|
|
|
|
|
|
|
# nothing changed -> nothing to do |
639
|
|
|
|
|
|
|
} |
640
|
|
|
|
|
|
|
else { |
641
|
|
|
|
|
|
|
# update binning parameters |
642
|
0
|
|
|
|
|
|
$self->{"optimal-binning"} = 0; |
643
|
0
|
|
|
|
|
|
$self->{"num-bins"} = $nb; |
644
|
0
|
|
|
|
|
|
delete $self->{"stats"}{"Dist"}{"binned"}; |
645
|
0
|
|
|
|
|
|
delete $self->{"bins"}; |
646
|
|
|
|
|
|
|
} |
647
|
|
|
|
|
|
|
} |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
sub bins { |
651
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
652
|
0
|
0
|
|
|
|
|
if($self->count() == 0) { |
653
|
0
|
|
|
|
|
|
print "WARNING: empty dataset, returning empty bins. \n"; |
654
|
0
|
|
|
|
|
|
return {}; # empty array |
655
|
|
|
|
|
|
|
} |
656
|
|
|
|
|
|
|
|
657
|
0
|
0
|
|
|
|
|
if(!defined($self->{"bins"})) { |
658
|
|
|
|
|
|
|
# CASE 1: if no_binning is set we do not |
659
|
|
|
|
|
|
|
# save the binning structure |
660
|
0
|
0
|
|
|
|
|
if( $self->{"bin-type"} == NO_BINNING) { |
661
|
0
|
|
|
|
|
|
my $curbins = {}; |
662
|
0
|
|
|
|
|
|
my $v; |
663
|
0
|
|
|
|
|
|
foreach $v(keys %{$self->{"data_frequency"}}) { |
|
0
|
|
|
|
|
|
|
664
|
0
|
|
|
|
|
|
my $ref = $v; |
665
|
0
|
|
|
|
|
|
$curbins->{$v}{"left"} = $v; |
666
|
0
|
|
|
|
|
|
$curbins->{$v}{"right"} = $v; |
667
|
|
|
|
|
|
|
} |
668
|
0
|
|
|
|
|
|
return $curbins; |
669
|
|
|
|
|
|
|
} |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
# CASE 2: if a custom number of bins is |
672
|
|
|
|
|
|
|
# provided, we lin-/log- bin the support |
673
|
|
|
|
|
|
|
# accordingly |
674
|
0
|
0
|
|
|
|
|
if($self->{"optimal-binning"} == 0) { |
675
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == LIN_BINNING) { |
676
|
0
|
|
|
|
|
|
$self->{"bins"} = $self->compute_lin_bins($self->{"num-bins"}); |
677
|
|
|
|
|
|
|
} |
678
|
0
|
0
|
|
|
|
|
if($self->{"bin-type"} == LOG_BINNING) { |
679
|
0
|
|
|
|
|
|
$self->{"bins"} = $self->compute_log_bins($self->{"num-bins"}); |
680
|
|
|
|
|
|
|
} |
681
|
|
|
|
|
|
|
} |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
# CASE 3: optimal binning |
684
|
|
|
|
|
|
|
else { |
685
|
0
|
|
|
|
|
|
my $res = $self->_optimal_binning(); |
686
|
0
|
0
|
|
|
|
|
if($res != 0) { |
687
|
0
|
|
|
|
|
|
print "Optimal binning was not possible on this sample, "; |
688
|
0
|
|
|
|
|
|
print "using the NO_BINNING mode" . "\n"; |
689
|
0
|
|
|
|
|
|
my $curbins = {}; |
690
|
0
|
|
|
|
|
|
my $v; |
691
|
0
|
|
|
|
|
|
foreach $v(keys %{$self->{"data_frequency"}}) { |
|
0
|
|
|
|
|
|
|
692
|
0
|
|
|
|
|
|
my $ref = $v; |
693
|
0
|
|
|
|
|
|
$curbins->{$v}{"left"} = $v; |
694
|
0
|
|
|
|
|
|
$curbins->{$v}{"right"} = $v; |
695
|
|
|
|
|
|
|
} |
696
|
0
|
|
|
|
|
|
$self->{"bin-type"} = NO_BINNING; |
697
|
0
|
|
|
|
|
|
$self->{"bins"} = $curbins; |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
} |
701
|
0
|
|
|
|
|
|
return $self->{"bins"}; |
702
|
|
|
|
|
|
|
} |
703
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
################## Optimal Binning Related Functions ################## |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# Compute the optimal binning |
708
|
|
|
|
|
|
|
sub _optimal_binning { |
709
|
0
|
|
|
0
|
|
|
my $self = shift; |
710
|
|
|
|
|
|
|
# LEGEND: |
711
|
|
|
|
|
|
|
# b_* => bins |
712
|
|
|
|
|
|
|
# f_* => binned frequency distribution |
713
|
|
|
|
|
|
|
# d_* => binned density distribution |
714
|
|
|
|
|
|
|
# s_* => sum of density distribution values |
715
|
|
|
|
|
|
|
# |
716
|
|
|
|
|
|
|
# start from extreme values ( min_num_bins and max_num_bins) |
717
|
|
|
|
|
|
|
# and use the bisection method on the interval |
718
|
|
|
|
|
|
|
# till the optimal is found (i.e. the binning such that |
719
|
|
|
|
|
|
|
# the sum of the density function values is the closest |
720
|
|
|
|
|
|
|
# to 1) |
721
|
0
|
|
|
|
|
|
my $min_num_bins = 2; |
722
|
0
|
|
|
|
|
|
my ($b_min, $f_min, $d_min, $s_min) = $self->_bin_attempt($min_num_bins); |
723
|
0
|
|
|
|
|
|
my $max_num_bins = scalar $self->_full_support(); |
724
|
0
|
0
|
|
|
|
|
if($max_num_bins == 1) { |
725
|
0
|
|
|
|
|
|
return -1; |
726
|
|
|
|
|
|
|
} |
727
|
0
|
|
|
|
|
|
my ($b_max, $f_max, $d_max, $s_max) = $self->_bin_attempt($max_num_bins); |
728
|
|
|
|
|
|
|
# initial check |
729
|
0
|
0
|
0
|
|
|
|
if( $s_min > 1 or $s_max < 1) { |
730
|
0
|
|
|
|
|
|
return -1; |
731
|
|
|
|
|
|
|
} |
732
|
0
|
|
|
|
|
|
my $avg_num_bins; |
733
|
0
|
|
|
|
|
|
my ($b_avg, $f_avg, $d_avg, $s_avg); |
734
|
0
|
|
|
|
|
|
while( ($max_num_bins - $min_num_bins) > 1) { |
735
|
0
|
|
|
|
|
|
$avg_num_bins = sprintf("%.f", (($max_num_bins + $min_num_bins)/2)); |
736
|
0
|
|
|
|
|
|
($b_avg, $f_avg, $d_avg, $s_avg) = $self->_bin_attempt($avg_num_bins); |
737
|
0
|
0
|
|
|
|
|
if($s_avg < 1) { |
738
|
0
|
|
|
|
|
|
$min_num_bins = $avg_num_bins; |
739
|
0
|
|
|
|
|
|
($b_min, $f_min, $d_min, $s_min) = ($b_avg, $f_avg, $d_avg, $s_avg); |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
else { |
742
|
0
|
|
|
|
|
|
$max_num_bins = $avg_num_bins; |
743
|
0
|
|
|
|
|
|
($b_max, $f_max, $d_max, $s_max) = ($b_avg, $f_avg, $d_avg, $s_avg); |
744
|
|
|
|
|
|
|
} |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
# the binning whose density sum is closer to 1 win |
747
|
0
|
0
|
|
|
|
|
if( (1 - $s_min) < ($s_max-1) ) { |
748
|
0
|
|
|
|
|
|
$self->{"bins"} = $b_min; |
749
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"frequency"} = $f_min; |
750
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"pmf"} = $d_min; |
751
|
|
|
|
|
|
|
} |
752
|
|
|
|
|
|
|
else { |
753
|
0
|
|
|
|
|
|
$self->{"bins"} = $b_max; |
754
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"frequency"} = $f_max; |
755
|
0
|
|
|
|
|
|
$self->{"Dist"}{"binned"}{"pmf"} = $d_max; |
756
|
|
|
|
|
|
|
} |
757
|
0
|
|
|
|
|
|
return 0; |
758
|
|
|
|
|
|
|
} |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
# bin the current data into $num_bins intervals |
762
|
|
|
|
|
|
|
# and return: the bins, the binned frequency distribution |
763
|
|
|
|
|
|
|
# the binned density distribution and the sum of the |
764
|
|
|
|
|
|
|
# density values |
765
|
|
|
|
|
|
|
sub _bin_attempt { |
766
|
0
|
|
|
0
|
|
|
my $self = shift; |
767
|
0
|
|
|
|
|
|
my $num_bins = shift; |
768
|
|
|
|
|
|
|
# output values |
769
|
0
|
|
|
|
|
|
my ($cur_bins, $cur_freq, $cur_density, $cur_sum); |
770
|
0
|
|
|
|
|
|
$cur_sum = 0; |
771
|
|
|
|
|
|
|
# get the bins |
772
|
0
|
0
|
|
|
|
|
if( $self->{"bin-type"} == LIN_BINNING ) { |
773
|
0
|
|
|
|
|
|
$cur_bins = $self->compute_lin_bins($num_bins); |
774
|
|
|
|
|
|
|
} |
775
|
0
|
0
|
|
|
|
|
if( $self->{"bin-type"} == LOG_BINNING ) { |
776
|
0
|
|
|
|
|
|
$cur_bins =$self->compute_log_bins($num_bins); |
777
|
|
|
|
|
|
|
} |
778
|
|
|
|
|
|
|
# assert non binned frequency distribution is computed |
779
|
0
|
|
|
|
|
|
$self->_frequency_distribution(); # set $self->{"stats"}{"Dist"}{"frequency"} |
780
|
0
|
|
|
|
|
|
my $count = $self->count(); |
781
|
|
|
|
|
|
|
# |
782
|
0
|
|
|
|
|
|
my $s = 0; # support iterator |
783
|
0
|
|
|
|
|
|
my @sorted_support = (sort {$a<=>$b} $self->_full_support()); |
|
0
|
|
|
|
|
|
|
784
|
0
|
|
|
|
|
|
my $sup_size = scalar @sorted_support; |
785
|
0
|
|
|
|
|
|
my $bi = 0; # bins iterator |
786
|
0
|
|
|
|
|
|
my @sorted_bins = (sort {$a<=>$b} keys %{$cur_bins}); |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
787
|
0
|
|
|
|
|
|
my $n_bins = scalar @sorted_bins; # assert(n_bins == num_bins) |
788
|
0
|
|
|
|
|
|
my $ref = 0; |
789
|
0
|
|
|
|
|
|
my $freq_sum = 0; |
790
|
0
|
|
|
|
|
|
my $interval = 0; |
791
|
0
|
|
|
|
|
|
for(; $bi < $n_bins; $bi++) { # for each bin |
792
|
0
|
|
|
|
|
|
$ref = $sorted_bins[$bi]; |
793
|
0
|
|
|
|
|
|
$interval = $cur_bins->{$ref}{"right"} - $cur_bins->{$ref}{"left"}; |
794
|
0
|
|
|
|
|
|
for(; $s < $sup_size; $s++) { # for each support value |
795
|
|
|
|
|
|
|
# if the current support value is less than the rightmost value in the bin |
796
|
0
|
0
|
|
|
|
|
if($sorted_support[$s] < $cur_bins->{$ref}{"right"}) { |
797
|
0
|
|
|
|
|
|
$freq_sum += $self->{"stats"}{"Dist"}{"frequency"}->{$sorted_support[$s]}; |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
# else we have to save the information for the current bin |
800
|
|
|
|
|
|
|
else { |
801
|
0
|
|
|
|
|
|
$cur_freq->{$ref} = $freq_sum; |
802
|
0
|
0
|
|
|
|
|
if($interval > 0) { |
803
|
0
|
|
|
|
|
|
$cur_density->{$ref} = ($freq_sum / $count) / $interval; |
804
|
|
|
|
|
|
|
} |
805
|
|
|
|
|
|
|
else { |
806
|
0
|
|
|
|
|
|
$cur_density->{$ref} = $freq_sum / $count; |
807
|
|
|
|
|
|
|
} |
808
|
0
|
|
|
|
|
|
$cur_sum += $cur_density->{$ref}; |
809
|
0
|
|
|
|
|
|
$freq_sum = 0; |
810
|
0
|
|
|
|
|
|
last; # we do not increment s |
811
|
|
|
|
|
|
|
} |
812
|
|
|
|
|
|
|
} |
813
|
|
|
|
|
|
|
# if last element of the support has been reached |
814
|
|
|
|
|
|
|
# we have to save the current freq sum in the current bin |
815
|
0
|
0
|
|
|
|
|
if($s == $sup_size) { |
816
|
0
|
|
|
|
|
|
$cur_freq->{$ref} = $freq_sum; |
817
|
0
|
0
|
|
|
|
|
if($interval > 0) { |
818
|
0
|
|
|
|
|
|
$cur_density->{$ref} = ($freq_sum / $count) / $interval; |
819
|
|
|
|
|
|
|
} |
820
|
|
|
|
|
|
|
else { |
821
|
0
|
|
|
|
|
|
$cur_density->{$ref} = $freq_sum / $count; |
822
|
|
|
|
|
|
|
} |
823
|
0
|
|
|
|
|
|
$cur_sum += $cur_density->{$ref}; |
824
|
|
|
|
|
|
|
} |
825
|
|
|
|
|
|
|
} |
826
|
|
|
|
|
|
|
#DEBUG print "attempt: " . $num_bins . " sum: " . $cur_sum . "\n"; |
827
|
0
|
|
|
|
|
|
return ($cur_bins, $cur_freq, $cur_density, $cur_sum); |
828
|
|
|
|
|
|
|
} |
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
sub _full_support { |
833
|
0
|
|
|
0
|
|
|
my $self = shift; |
834
|
0
|
|
|
|
|
|
my $support = {}; |
835
|
0
|
|
|
|
|
|
my $v; |
836
|
0
|
|
|
|
|
|
foreach $v( keys %{$self->{"data_frequency"}}) { |
|
0
|
|
|
|
|
|
|
837
|
0
|
|
|
|
|
|
$support->{$v} = 1; |
838
|
|
|
|
|
|
|
} |
839
|
0
|
|
|
|
|
|
return (keys %{$support}); |
|
0
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
} |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
|
844
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
################## Utilities ################## |
846
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
# return a structure describing a linear binning |
848
|
|
|
|
|
|
|
# of the current data ($num_bins provided) |
849
|
|
|
|
|
|
|
sub compute_lin_bins { |
850
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
851
|
0
|
|
|
|
|
|
my $num_bins = shift; |
852
|
0
|
|
|
|
|
|
my $min = $self->minimum(); |
853
|
0
|
|
|
|
|
|
my $max = $self->maximum(); |
854
|
0
|
|
|
|
|
|
my $linbins = {}; # reference to empty hash |
855
|
|
|
|
|
|
|
# last bin includes just the maximum value |
856
|
0
|
|
|
|
|
|
my $delta = abs($max - $min)/($num_bins-1); |
857
|
0
|
|
|
|
|
|
my $i; |
858
|
|
|
|
|
|
|
my $ref; |
859
|
0
|
|
|
|
|
|
for($i=0; $i < $num_bins; $i++) { |
860
|
0
|
|
|
|
|
|
$ref = $min + $i * $delta + $delta/2; |
861
|
0
|
|
|
|
|
|
$linbins->{$ref}{"left"} = $min + $i * $delta; |
862
|
0
|
|
|
|
|
|
$linbins->{$ref}{"right"} = $min + $i * $delta + $delta; |
863
|
|
|
|
|
|
|
} |
864
|
0
|
|
|
|
|
|
return $linbins; |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
# return a structure describing a logarithmic binning |
869
|
|
|
|
|
|
|
# of the current data ($num_bins provided) |
870
|
|
|
|
|
|
|
# bins are not set |
871
|
|
|
|
|
|
|
sub compute_log_bins { |
872
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
873
|
0
|
|
|
|
|
|
my $num_bins = shift; |
874
|
0
|
|
|
|
|
|
my $min = $self->minimum(); |
875
|
0
|
|
|
|
|
|
my $max = $self->maximum(); |
876
|
0
|
0
|
0
|
|
|
|
if($min == 0 or $max == 0 or $min *$max <0) { |
|
|
|
0
|
|
|
|
|
877
|
|
|
|
|
|
|
# if the interval contains zero, then we cannot logbin |
878
|
|
|
|
|
|
|
# we return the linear binning |
879
|
0
|
|
|
|
|
|
print "Logarithmic binning was not possible on this sample, "; |
880
|
0
|
|
|
|
|
|
print "switching to linear binning mode" . "\n"; |
881
|
0
|
|
|
|
|
|
$self->{"bin-type"} = LIN_BINNING; |
882
|
0
|
|
|
|
|
|
return $self->compute_lin_bins($num_bins); |
883
|
|
|
|
|
|
|
} |
884
|
0
|
|
|
|
|
|
my $logbins = {}; # reference to empty hash |
885
|
0
|
|
|
|
|
|
my $min_exp = log($min)/log(LOG_BASE); |
886
|
0
|
|
|
|
|
|
my $max_exp = log($max+1)/log(LOG_BASE); |
887
|
0
|
|
|
|
|
|
my $delta_exp = abs($max_exp - $min_exp)/$num_bins; |
888
|
0
|
|
|
|
|
|
my $cur_exp; |
889
|
|
|
|
|
|
|
my $i; |
890
|
0
|
|
|
|
|
|
my $ref; |
891
|
0
|
|
|
|
|
|
for($i=0; $i < $num_bins; $i++) { |
892
|
0
|
|
|
|
|
|
$cur_exp = $min_exp + $i * $delta_exp; |
893
|
0
|
|
|
|
|
|
$ref = LOG_BASE ** ($cur_exp + $delta_exp/2); |
894
|
0
|
|
|
|
|
|
$logbins->{$ref}{"left"} = LOG_BASE ** $cur_exp; |
895
|
0
|
|
|
|
|
|
$logbins->{$ref}{"right"} = LOG_BASE ** ($cur_exp + $delta_exp); |
896
|
|
|
|
|
|
|
} |
897
|
0
|
|
|
|
|
|
return $logbins; |
898
|
|
|
|
|
|
|
} |
899
|
|
|
|
|
|
|
|
900
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
1; |