File Coverage

blib/lib/Statistics/Descriptive/Discrete.pm
Criterion Covered Total %
statement 220 226 97.3
branch 62 78 79.4
condition 11 15 73.3
subroutine 17 18 94.4
pod 8 9 88.8
total 318 346 91.9


line stmt bran cond sub pod time code
1             package Statistics::Descriptive::Discrete;
2              
3             ### This module draws heavily from Statistics::Descriptive
4              
5 3     3   211232 use strict;
  3         30  
  3         92  
6 3     3   16 use warnings;
  3         7  
  3         111  
7 3     3   16 use Carp;
  3         7  
  3         183  
8 3     3   1541 use AutoLoader;
  3         4301  
  3         19  
9 3     3   126 use vars qw($VERSION $AUTOLOAD $DEBUG $Tolerance %autosubs);
  3         7  
  3         6397  
10              
11             $VERSION = '0.10';
12             $DEBUG = 0;
13              
14             $Tolerance = 0.0;
15              
16             #what subs can be autoloaded?
17             %autosubs = (
18             count => undef,
19             mean => undef,
20             geometric_mean=> undef,
21             harmonic_mean=>undef,
22             sum => undef,
23             mode => undef,
24             median => undef,
25             min => undef,
26             max => undef,
27             mindex => undef,
28             maxdex => undef,
29             standard_deviation => undef,
30             sample_range => undef,
31             variance => undef,
32             text => undef,
33             );
34              
35            
36             sub new
37             {
38 17     17 1 271 my $proto = shift;
39 17   33     78 my $class = ref($proto) || $proto;
40 17         34 my $self = {};
41 17         52 $self->{_permitted} = \%autosubs;
42 17         32 $self->{data} = ();
43 17         31 $self->{_dataindex} = (); #index of where each value first seen when adding data
44 17         31 $self->{dirty} = 1; #is the data dirty?
45 17         30 $self->{_index} = 0; #current index of number of data items added
46              
47 17         35 bless ($self,$class);
48 17 50       39 print __PACKAGE__,"->new(",join(',',@_),")\n" if $DEBUG;
49 17         50 return $self;
50             }
51              
52             # Clear the stat object & erase all data
53             # Object will be ready to use as if new was called
54             # Not sure this is more efficient than just creating a new object but
55             # maintained for compatability with Statistics::Descriptive
56             sub clear
57             {
58 8     8 1 3842 my $self = shift;
59 8         16 my %keys = %{ $self };
  8         71  
60              
61             #remove _permitted from the deletion list
62 8         24 delete $keys{"_permitted"};
63              
64 8         34 foreach my $key (keys %keys)
65             { # Check each key in the object
66 132 50       224 print __PACKAGE__,"->clear, deleting $key\n" if $DEBUG;
67 132         210 delete $self->{$key}; # Delete any out of date cached key
68             }
69 8         19 $self->{data} = ();
70 8         16 $self->{_dataindex} = ();
71 8         12 $self->{dirty} = 1;
72 8         40 $self->{_index} = 0;
73             }
74              
75             sub add_data
76             {
77             #add data but don't compute ANY statistics yet
78 21     21 1 671 my $self = shift;
79 21 50       47 print __PACKAGE__,"->add_data(",join(',',@_),")\n" if $DEBUG;
80              
81             #get each element and add 0 to force it be a number
82             #that way, 0.000 and 0 are treated the same
83 21         36 my $val = shift;
84 21         62 while (defined $val)
85             {
86 136         199 $val += 0;
87 136         354 $self->{data}{$val}++;
88 136 100       353 if (not exists $self->{_dataindex}{$val}) {
89 74         158 $self->{_dataindex}{$val} = $self->{_index};
90             }
91 136         185 $self->{_index}++;
92             #set dirty flag so we know cached stats are invalid
93 136         199 $self->{dirty}++;
94 136         428 $val = shift; #get next element
95             }
96             }
97              
98             sub add_data_tuple
99             {
100             #add data but don't compute ANY statistics yet
101             #the data are pairs of values and occurrences
102             #e.g. 4,2 means 2 occurrences of the value 4
103             #thanks to Bill Dueber for suggesting this
104              
105 5     5 1 26 my $self = shift;
106 5 50       13 print __PACKAGE__,"->add_data_tuple(",join(',',@_),")\n" if $DEBUG;
107              
108             #we want an even number of arguments (tuples in the form (value, count))
109 5 50       15 carp "argument list must have even number of elements" if @_ % 2;
110              
111             #get each element and add 0 to force it be a number
112             #that way, 0.000 and 0 are treated the same
113             #if $count is 0, then this will set the dirty flag but have no effect on
114             #the statistics
115 5         7 my $val = shift;
116 5         9 my $count = shift;
117 5         13 while (defined $count)
118             {
119 18         26 $val += 0;
120 18         37 $self->{data}{$val} += $count;
121 18 50       37 if (not exists $self->{_dataindex}{$val}) {
122 18         32 $self->{_dataindex}{$val} = $self->{_index};
123             }
124 18         27 $self->{_index} += $count;
125             #set dirty flag so we know cached stats are invalid
126 18         24 $self->{dirty}++;
127 18         28 $val = shift; #get next element
128 18         40 $count = shift;
129             }
130             }
131              
132             sub _test_for_too_small_val
133             {
134 112     112   176 my $self = shift;
135 112         159 my $val = shift;
136              
137 112         285 return (abs($val) <= $Statistics::Descriptive::Discrete::Tolerance);
138             }
139              
140             sub _calc_harmonic_mean
141             {
142 24     24   38 my $self = shift;
143 24         48 my $count = shift;
144 24         39 my $datakeys = shift; #array ref
145              
146 24         36 my $hs = 0;
147              
148 24         35 foreach my $val ( @{$datakeys} )
  24         50  
149             {
150             ##Guarantee that there are no divide by zeros
151 91 100       160 if ($self->_test_for_too_small_val($val))
152             {
153 3         10 return;
154             }
155            
156 88         180 foreach (1..$self->{data}{$val})
157             {
158 201         355 $hs += 1/$val;
159             }
160             }
161              
162 21 100       40 if ($self->_test_for_too_small_val($hs))
163             {
164 2         5 return;
165             }
166              
167 19         42 return $count/$hs;
168             }
169              
170             sub _all_stats
171             {
172             #compute all the stats in one sub to save overhead of sub calls
173             #a little wasteful to do this if all we want is count or sum for example but
174             #I want to keep add_data as lean as possible since it gets called a lot
175 28     28   49 my $self = shift;
176 28 50       59 print __PACKAGE__,"->_all_stats(",join(',',@_),")\n" if $DEBUG;
177              
178             #if data is empty, set all stats to undef and return
179 28 100       70 if (!$self->{data})
180             {
181 4         6 foreach my $key (keys %{$self->{_permitted}})
  4         21  
182             {
183 60         95 $self->{$key} = undef;
184             }
185 4         13 $self->{count} = 0;
186 4         7 return;
187             }
188              
189             #count = total number of data values we have
190 24         38 my $count = 0;
191 24         38 $count += $_ foreach (values %{$self->{data}});
  24         110  
192              
193 24         41 my @datakeys = keys %{$self->{data}};
  24         68  
194              
195             #initialize min, max, mode to an arbitrary value that's in the hash
196 24         49 my $default = $datakeys[0];
197 24         40 my $max = $default;
198 24         34 my $min = $default;
199 24         39 my $mode = $default;
200 24         39 my $moden = 0;
201 24         37 my $sum = 0;
202              
203             #find min, max, sum, and mode
204 24         44 foreach (@datakeys)
205             {
206 95         149 my $n = $self->{data}{$_};
207 95         225 $sum += $_ * $n;
208 95 100       198 $min = $_ if $_ < $min;
209 95 100       184 $max = $_ if $_ > $max;
210            
211             #only finds one mode but there could be more than one
212             #also, there might not be any mode (all the same frequency)
213             #todo: need to make this more robust
214 95 100       192 if ($n > $moden)
215             {
216 28         43 $mode = $_;
217 28         52 $moden = $n;
218             }
219             }
220 24         45 my $mindex = $self->{_dataindex}{$min};
221 24         41 my $maxdex = $self->{_dataindex}{$max};
222              
223 24         49 my $mean = $sum/$count;
224            
225 24         35 my $stddev = 0;
226 24         39 my $variance = 0;
227              
228 24 100       52 if ($count > 1)
229             {
230             # Thanks to Peter Dienes for finding and fixing a round-off error
231             # in the following variance calculation
232              
233 23         47 foreach my $val (@datakeys)
234             {
235 94         213 $stddev += $self->{data}{$val} * (($val - $mean) ** 2);
236             }
237 23         41 $variance = $stddev / ($count - 1);
238 23         41 $stddev = sqrt($variance);
239             }
240 1         3 else {$stddev = undef}
241            
242             #find median, and do it without creating a list of the all the data points
243             #if n=count is odd and n=2k+1 then median = data(k+1)
244             #if n=count is even and n=2k, then median = (data(k) + data(k+1))/2
245 24         46 my $odd = $count % 2; #odd or even number of points?
246 24         53 my $even = !$odd;
247 24 100       55 my $k = $odd ? ($count-1)/2 : $count/2;
248 24         38 my $median = undef;
249 24         39 my $temp = 0;
250 24         82 MEDIAN: foreach my $val (sort {$a <=> $b} (@datakeys))
  133         226  
251             {
252 67         139 foreach (1..$self->{data}{$val})
253             {
254 124         173 $temp++;
255 124 100 100     375 if (($temp == $k) && $even)
    100          
256             {
257 16         33 $median += $val;
258             }
259             elsif ($temp == $k+1)
260             {
261 24         39 $median += $val;
262 24 100       48 $median /= 2 if $even;
263 24         52 last MEDIAN;
264             }
265             }
266             }
267            
268             #compute geometric mean
269 24         45 my $gm = 1;
270 24         41 my $exponent = 1/$count;
271 24         44 foreach my $val (@datakeys)
272             {
273 90 100       166 if ($val < 0)
274             {
275 3         8 $gm = undef;
276 3         5 last;
277             }
278 87         144 foreach (1..$self->{data}{$val})
279             {
280 200         393 $gm *= $val**$exponent;
281             }
282             }
283              
284             #compute harmonic mean
285 24         65 my $harmonic_mean = scalar $self->_calc_harmonic_mean($count, \@datakeys);
286              
287 24 50       52 print __PACKAGE__,"count: $count, _index ",$self->{_index},"\n" if $DEBUG;
288              
289 24         55 $self->{count} = $count;
290 24         51 $self->{sum} = $sum;
291 24         60 $self->{standard_deviation} = $stddev;
292 24         40 $self->{variance} = $variance;
293 24         44 $self->{min} = $min;
294 24         44 $self->{max} = $max;
295 24         40 $self->{mindex} = $mindex;
296 24         37 $self->{maxdex} = $maxdex;
297 24         49 $self->{sample_range} = $max - $min; #todo: does this require any bounds checking
298 24         40 $self->{mean} = $mean;
299 24         58 $self->{geometric_mean} = $gm;
300 24         39 $self->{harmonic_mean} = $harmonic_mean;
301 24         40 $self->{median} = $median;
302 24         39 $self->{mode} = $mode;
303              
304             #clear dirty flag so we don't needlessly recompute the statistics
305 24         66 $self->{dirty} = 0;
306             }
307              
308             sub set_text
309             {
310 0     0 0 0 my $self = shift;
311 0         0 $self->{text} = shift;
312             }
313              
314             sub get_data
315             {
316             #returns a list of the data in sorted order
317             #the list could be very big an this defeat the purpose of using this module
318             #use this only if you really need it
319 1     1 1 2 my $self = shift;
320 1 50       7 print __PACKAGE__,"->get_data(",join(',',@_),")\n" if $DEBUG;
321              
322 1         2 my @data;
323 1         2 foreach my $val (sort {$a <=> $b} (keys %{$self->{data}}))
  9         17  
  1         6  
324             {
325 5         18 push @data, $val foreach (1..$self->{data}{$val});
326             }
327 1         5 return @data;
328             }
329              
330             # this is the previous frequency_distribution code
331             # redid this completely based on current implementation in
332             # Statistics::Descriptive
333             # sub frequency_distribution
334             # {
335             # #Compute frequency distribution (histogram), borrowed heavily from Statistics::Descriptive
336             # #Behavior is slightly different than Statistics::Descriptive
337             # #e.g. if partition is not specified, we use to set the number of partitions
338             # # if partition = 0, then we return the data hash WITHOUT binning it into equal bins
339             # # I often want to just see how many of each value I saw
340             # #Also, you can manually pass in the bin info (min bin, bin size, and number of partitions)
341             # #I don't cache the frequency data like Statistics::Descriptive does since it's not as expensive to compute
342             # #but I might add that later
343             # #todo: the minbin/binsize stuff is funky and not intuitive -- fix it
344             # my $self = shift;
345             # print __PACKAGE__,"->frequency_distribution(",join(',',@_),")\n" if $DEBUG;
346              
347             # my $partitions = shift; #how many partitions (bins)?
348             # my $minbin = shift; #upper bound of first bin
349             # my $binsize = shift; #how wide is each bin?
350            
351             # #if partition == 0, then return the data hash
352             # if (not defined $partitions || ($partitions == 0))
353             # {
354             # $self->{frequency_partitions} = 0;
355             # %{$self->{frequency}} = %{$self->{data}};
356             # return %{$self->{frequency}};
357             # }
358              
359             # #otherwise, partition better be >= 1
360             # return undef unless $partitions >= 1;
361              
362             # $self->_all_stats() if $self->{dirty}; #recompute stats if dirty, (so we have count)
363             # return undef if $self->{count} < 2; #must have at least 2 values
364              
365             # #set up the bins
366             # my ($interval, $iter, $max);
367             # if (defined $minbin && defined $binsize)
368             # {
369             # $iter = $minbin;
370             # $max = $minbin+$partitions*$binsize - $binsize;
371             # $interval = $binsize;
372             # $iter -= $interval; #so that loop that sets up bins works correctly
373             # }
374             # else
375             # {
376             # $iter = $self->{min};
377             # $max = $self->{max};
378             # $interval = $self->{sample_range}/$partitions;
379             # }
380             # my @k;
381             # my %bins;
382             # while (($iter += $interval) < $max)
383             # {
384             # $bins{$iter} = 0;
385             # push @k, $iter;
386             # }
387             # $bins{$max} = 0;
388             # push @k, $max;
389              
390             # VALUE: foreach my $val (keys %{$self->{data}})
391             # {
392             # foreach my $k (@k)
393             # {
394             # if ($val <= $k)
395             # {
396             # $bins{$k} += $self->{data}{$val}; #how many of this value do we have?
397             # next VALUE;
398             # }
399             # }
400             # # }
401              
402             # %{$self->{frequency}} = %bins; #save it for later in case I add caching
403             # $self->{frequency_partitions} = $partitions; #in case I add caching in the future
404             # return %{$self->{frequency}};
405             # }
406              
407             sub frequency_distribution_ref
408             {
409 12     12 1 1756 my $self = shift;
410 12         23 my @k = ();
411              
412             # If called with no parameters, return the cached hashref
413             # if we have one and data is not dirty
414             # This is implemented this way because that's how Statistics::Descriptive
415             # implements this. I don't like it.
416 12 100 100     68 if ((!@_) && (! $self->{dirty}) && (defined $self->{_frequency}))
      66        
417             {
418 3         29 return $self->{_frequency};
419             }
420              
421 9 100       35 $self->_all_stats() if $self->{dirty}; #recompute stats if dirty, (so we have count)
422              
423             # Must have at least two elements
424 9 100       45 if ($self->count() < 2)
425             {
426 1         5 return undef;
427             }
428            
429 8         27 my %bins;
430 8         14 my $partitions = shift;
431              
432 8 100       24 if (ref($partitions) eq 'ARRAY')
433             {
434 4         8 @k = @{ $partitions };
  4         13  
435 4 50       14 return undef unless @k; ##Empty array
436 4 50       12 if (@k > 1) {
437             ##Check for monotonicity
438 4         10 my $element = $k[0];
439 4         24 for my $next_elem (@k[1..$#k]) {
440 18 50       37 if ($element > $next_elem) {
441 0         0 carp "Non monotonic array cannot be used as frequency bins!\n";
442 0         0 return undef;
443             }
444 18         34 $element = $next_elem;
445             }
446             }
447 4         10 %bins = map { $_ => 0 } @k;
  22         60  
448             }
449             else
450             {
451 4 100 66     30 return undef unless (defined $partitions) && ($partitions >= 1);
452 3         17 my $interval = $self->sample_range() / $partitions;
453 3         11 foreach my $idx (1 .. ($partitions-1))
454             {
455 2         11 push @k, ($self->min() + $idx * $interval);
456             }
457              
458 3         15 $bins{$self->max()} = 0;
459              
460 3         14 push @k, $self->max();
461             }
462              
463             ELEMENT:
464 7         16 foreach my $element (keys %{$self->{data}})
  7         26  
465             {
466 45         70 foreach my $limit (@k)
467             {
468 109 100       222 if ($element <= $limit)
469             {
470 44         107 $bins{$limit} += $self->{data}{$element};
471 44         80 next ELEMENT;
472             }
473             }
474             }
475              
476 7         39 $self->{_frequency} = \%bins;
477 7         30 return $self->{_frequency};
478             }
479              
480             sub frequency_distribution {
481 3     3 1 403 my $self = shift;
482              
483 3         12 my $ret = $self->frequency_distribution_ref(@_);
484              
485 3 50       31 if (!defined($ret))
486             {
487 0         0 return undef;
488             }
489             else
490             {
491 3         19 return %$ret;
492             }
493             }
494              
495             # return count of unique values in data if called in scalar context
496             # returns sorted array of unique data values if called in array context
497             # returns undef if no data
498             sub uniq
499             {
500 8     8 1 38 my $self = shift;
501            
502 8 100       22 if (!$self->{data})
503             {
504 2         8 return undef;
505             }
506              
507 6         11 my @datakeys = sort {$a <=> $b} keys %{$self->{data}};
  39         74  
  6         30  
508              
509 6 100       20 if (wantarray)
510             {
511 2         9 return @datakeys;
512             }
513             else
514             {
515 4         8 my $uniq = @datakeys;
516 4         18 return $uniq;
517             }
518             }
519              
520             sub AUTOLOAD {
521 82     82   8604 my $self = shift;
522 82 50       219 my $type = ref($self)
523             or croak "$self is not an object";
524 82         153 my $name = $AUTOLOAD;
525 82         434 $name =~ s/.*://; ##Strip fully qualified-package portion
526 82 100       545 return if $name eq "DESTROY";
527 65 50       162 unless (exists $self->{_permitted}{$name} ) {
528 0         0 croak "Can't access `$name' field in class $type";
529             }
530              
531 65 50       130 print __PACKAGE__,"->AUTOLOAD $name\n" if $DEBUG;
532              
533             #compute stats if necessary
534 65 100       195 $self->_all_stats() if $self->{dirty};
535 65         280 return $self->{$name};
536             }
537              
538             1;
539              
540             __END__