File Coverage

blib/lib/Statistics/Descriptive/Discrete.pm
Criterion Covered Total %
statement 220 226 97.3
branch 62 78 79.4
condition 11 15 73.3
subroutine 17 18 94.4
pod 8 9 88.8
total 318 346 91.9


line stmt bran cond sub pod time code
1             package Statistics::Descriptive::Discrete;
2              
3             ### This module draws heavily from Statistics::Descriptive
4              
5 3     3   216580 use strict;
  3         24  
  3         88  
6 3     3   15 use warnings;
  3         12  
  3         70  
7 3     3   14 use Carp;
  3         6  
  3         181  
8 3     3   1585 use AutoLoader;
  3         4430  
  3         19  
9 3     3   125 use vars qw($VERSION $AUTOLOAD $DEBUG $Tolerance %autosubs);
  3         6  
  3         6664  
10              
11             $VERSION = '0.11';
12             $DEBUG = 0;
13              
14             #see Statistics::Descriptive documentation for use of $Tolerance
15             $Tolerance = 0.0;
16              
17             #what subs can be autoloaded?
18             %autosubs = (
19             count => undef,
20             mean => undef,
21             geometric_mean=> undef,
22             harmonic_mean=>undef,
23             sum => undef,
24             mode => undef,
25             median => undef,
26             min => undef,
27             max => undef,
28             mindex => undef,
29             maxdex => undef,
30             standard_deviation => undef,
31             sample_range => undef,
32             variance => undef,
33             text => undef,
34             );
35              
36            
37             sub new
38             {
39 17     17 1 270 my $proto = shift;
40 17   33     73 my $class = ref($proto) || $proto;
41 17         35 my $self = {};
42 17         50 $self->{_permitted} = \%autosubs;
43 17         35 $self->{data} = ();
44 17         32 $self->{_dataindex} = (); #index of where each value first seen when adding data
45 17         29 $self->{dirty} = 1; #is the data dirty?
46 17         29 $self->{_index} = 0; #current index of number of data items added
47              
48 17         35 bless ($self,$class);
49 17 50       39 print __PACKAGE__,"->new(",join(',',@_),")\n" if $DEBUG;
50 17         50 return $self;
51             }
52              
53             # Clear the stat object & erase all data
54             # Object will be ready to use as if new was called
55             # Not sure this is more efficient than just creating a new object but
56             # maintained for compatability with Statistics::Descriptive
57             sub clear
58             {
59 8     8 1 4004 my $self = shift;
60 8         16 my %keys = %{ $self };
  8         69  
61              
62             #remove _permitted from the deletion list
63 8         23 delete $keys{"_permitted"};
64              
65 8         33 foreach my $key (keys %keys)
66             { # Check each key in the object
67 132 50       226 print __PACKAGE__,"->clear, deleting $key\n" if $DEBUG;
68 132         212 delete $self->{$key}; # Delete any out of date cached key
69             }
70 8         19 $self->{data} = ();
71 8         15 $self->{_dataindex} = ();
72 8         16 $self->{dirty} = 1;
73 8         41 $self->{_index} = 0;
74             }
75              
76             sub add_data
77             {
78             #add data but don't compute ANY statistics yet
79 21     21 1 651 my $self = shift;
80 21 50       45 print __PACKAGE__,"->add_data(",join(',',@_),")\n" if $DEBUG;
81              
82             #get each element and add 0 to force it be a number
83             #that way, 0.000 and 0 are treated the same
84 21         39 my $val = shift;
85 21         58 while (defined $val)
86             {
87 136         189 $val += 0;
88 136         353 $self->{data}{$val}++;
89 136 100       372 if (not exists $self->{_dataindex}{$val}) {
90 74         163 $self->{_dataindex}{$val} = $self->{_index};
91             }
92 136         195 $self->{_index}++;
93             #set dirty flag so we know cached stats are invalid
94 136         195 $self->{dirty}++;
95 136         281 $val = shift; #get next element
96             }
97             }
98              
99             sub add_data_tuple
100             {
101             #add data but don't compute ANY statistics yet
102             #the data are pairs of values and occurrences
103             #e.g. 4,2 means 2 occurrences of the value 4
104             #thanks to Bill Dueber for suggesting this
105              
106 5     5 1 32 my $self = shift;
107 5 50       17 print __PACKAGE__,"->add_data_tuple(",join(',',@_),")\n" if $DEBUG;
108              
109             #we want an even number of arguments (tuples in the form (value, count))
110 5 50       13 carp "argument list must have even number of elements" if @_ % 2;
111              
112             #get each element and add 0 to force it be a number
113             #that way, 0.000 and 0 are treated the same
114             #if $count is 0, then this will set the dirty flag but have no effect on
115             #the statistics
116 5         9 my $val = shift;
117 5         9 my $count = shift;
118 5         12 while (defined $count)
119             {
120 18         26 $val += 0;
121 18         45 $self->{data}{$val} += $count;
122 18 50       40 if (not exists $self->{_dataindex}{$val}) {
123 18         30 $self->{_dataindex}{$val} = $self->{_index};
124             }
125 18         31 $self->{_index} += $count;
126             #set dirty flag so we know cached stats are invalid
127 18         24 $self->{dirty}++;
128 18         26 $val = shift; #get next element
129 18         41 $count = shift;
130             }
131             }
132              
133             sub _test_for_too_small_val
134             {
135 107     107   149 my $self = shift;
136 107         154 my $val = shift;
137              
138 107         292 return (abs($val) <= $Statistics::Descriptive::Discrete::Tolerance);
139             }
140              
141             sub _calc_harmonic_mean
142             {
143 24     24   49 my $self = shift;
144 24         39 my $count = shift;
145 24         36 my $datakeys = shift; #array ref
146              
147 24         39 my $hs = 0;
148              
149 24         40 foreach my $val ( @{$datakeys} )
  24         50  
150             {
151             ##Guarantee that there are no divide by zeros
152 86 100       165 if ($self->_test_for_too_small_val($val))
153             {
154 3         12 return;
155             }
156            
157 83         178 foreach (1..$self->{data}{$val})
158             {
159 190         314 $hs += 1/$val;
160             }
161             }
162              
163 21 100       44 if ($self->_test_for_too_small_val($hs))
164             {
165 2         5 return;
166             }
167              
168 19         44 return $count/$hs;
169             }
170              
171             sub _all_stats
172             {
173             #compute all the stats in one sub to save overhead of sub calls
174             #a little wasteful to do this if all we want is count or sum for example but
175             #I want to keep add_data as lean as possible since it gets called a lot
176 28     28   46 my $self = shift;
177 28 50       60 print __PACKAGE__,"->_all_stats(",join(',',@_),")\n" if $DEBUG;
178              
179             #if data is empty, set all stats to undef and return
180 28 100       65 if (!$self->{data})
181             {
182 4         7 foreach my $key (keys %{$self->{_permitted}})
  4         21  
183             {
184 60         99 $self->{$key} = undef;
185             }
186 4         10 $self->{count} = 0;
187 4         9 return;
188             }
189              
190             #count = total number of data values we have
191 24         41 my $count = 0;
192 24         39 $count += $_ foreach (values %{$self->{data}});
  24         103  
193              
194 24         47 my @datakeys = keys %{$self->{data}};
  24         66  
195              
196             #initialize min, max, mode to an arbitrary value that's in the hash
197 24         48 my $default = $datakeys[0];
198 24         40 my $max = $default;
199 24         34 my $min = $default;
200 24         37 my $mode = $default;
201 24         37 my $moden = 0;
202 24         38 my $sum = 0;
203              
204             #find min, max, sum, and mode
205 24         47 foreach (@datakeys)
206             {
207 95         154 my $n = $self->{data}{$_};
208 95         189 $sum += $_ * $n;
209 95 100       205 $min = $_ if $_ < $min;
210 95 100       177 $max = $_ if $_ > $max;
211            
212             #only finds one mode but there could be more than one
213             #also, there might not be any mode (all the same frequency)
214             #todo: need to make this more robust
215 95 100       191 if ($n > $moden)
216             {
217 32         54 $mode = $_;
218 32         60 $moden = $n;
219             }
220             }
221 24         42 my $mindex = $self->{_dataindex}{$min};
222 24         41 my $maxdex = $self->{_dataindex}{$max};
223              
224 24         46 my $mean = $sum/$count;
225            
226 24         40 my $stddev = 0;
227 24         35 my $variance = 0;
228              
229 24 100       60 if ($count > 1)
230             {
231             # Thanks to Peter Dienes for finding and fixing a round-off error
232             # in the following variance calculation
233              
234 23         44 foreach my $val (@datakeys)
235             {
236 94         206 $stddev += $self->{data}{$val} * (($val - $mean) ** 2);
237             }
238 23         45 $variance = $stddev / ($count - 1);
239 23         43 $stddev = sqrt($variance);
240             }
241 1         18 else {$stddev = undef}
242            
243             #find median, and do it without creating a list of the all the data points
244             #if n=count is odd and n=2k+1 then median = data(k+1)
245             #if n=count is even and n=2k, then median = (data(k) + data(k+1))/2
246 24         42 my $odd = $count % 2; #odd or even number of points?
247 24         45 my $even = !$odd;
248 24 100       55 my $k = $odd ? ($count-1)/2 : $count/2;
249 24         40 my $median = undef;
250 24         42 my $temp = 0;
251 24         90 MEDIAN: foreach my $val (sort {$a <=> $b} (@datakeys))
  129         217  
252             {
253 67         134 foreach (1..$self->{data}{$val})
254             {
255 124         172 $temp++;
256 124 100 100     389 if (($temp == $k) && $even)
    100          
257             {
258 16         33 $median += $val;
259             }
260             elsif ($temp == $k+1)
261             {
262 24         40 $median += $val;
263 24 100       49 $median /= 2 if $even;
264 24         53 last MEDIAN;
265             }
266             }
267             }
268            
269             #compute geometric mean
270 24         38 my $gm = 1;
271 24         40 my $exponent = 1/$count;
272 24         62 foreach my $val (@datakeys)
273             {
274 93 100       177 if ($val < 0)
275             {
276 3         7 $gm = undef;
277 3         7 last;
278             }
279 90         150 foreach (1..$self->{data}{$val})
280             {
281 203         392 $gm *= $val**$exponent;
282             }
283             }
284              
285             #compute harmonic mean
286 24         63 my $harmonic_mean = scalar $self->_calc_harmonic_mean($count, \@datakeys);
287              
288 24 50       56 print __PACKAGE__,"count: $count, _index ",$self->{_index},"\n" if $DEBUG;
289              
290 24         57 $self->{count} = $count;
291 24         42 $self->{sum} = $sum;
292 24         73 $self->{standard_deviation} = $stddev;
293 24         44 $self->{variance} = $variance;
294 24         42 $self->{min} = $min;
295 24         42 $self->{max} = $max;
296 24         40 $self->{mindex} = $mindex;
297 24         39 $self->{maxdex} = $maxdex;
298 24         49 $self->{sample_range} = $max - $min; #todo: does this require any bounds checking
299 24         77 $self->{mean} = $mean;
300 24         51 $self->{geometric_mean} = $gm;
301 24         51 $self->{harmonic_mean} = $harmonic_mean;
302 24         50 $self->{median} = $median;
303 24         40 $self->{mode} = $mode;
304              
305             #clear dirty flag so we don't needlessly recompute the statistics
306 24         62 $self->{dirty} = 0;
307             }
308              
309             sub set_text
310             {
311 0     0 0 0 my $self = shift;
312 0         0 $self->{text} = shift;
313             }
314              
315             sub get_data
316             {
317             #returns a list of the data in sorted order
318             #the list could be very big an this defeat the purpose of using this module
319             #use this only if you really need it
320 1     1 1 3 my $self = shift;
321 1 50       6 print __PACKAGE__,"->get_data(",join(',',@_),")\n" if $DEBUG;
322              
323 1         3 my @data;
324 1         2 foreach my $val (sort {$a <=> $b} (keys %{$self->{data}}))
  8         16  
  1         6  
325             {
326 5         18 push @data, $val foreach (1..$self->{data}{$val});
327             }
328 1         5 return @data;
329             }
330              
331             # this is the previous frequency_distribution code
332             # redid this completely based on current implementation in
333             # Statistics::Descriptive
334             # sub frequency_distribution
335             # {
336             # #Compute frequency distribution (histogram), borrowed heavily from Statistics::Descriptive
337             # #Behavior is slightly different than Statistics::Descriptive
338             # #e.g. if partition is not specified, we use to set the number of partitions
339             # # if partition = 0, then we return the data hash WITHOUT binning it into equal bins
340             # # I often want to just see how many of each value I saw
341             # #Also, you can manually pass in the bin info (min bin, bin size, and number of partitions)
342             # #I don't cache the frequency data like Statistics::Descriptive does since it's not as expensive to compute
343             # #but I might add that later
344             # #todo: the minbin/binsize stuff is funky and not intuitive -- fix it
345             # my $self = shift;
346             # print __PACKAGE__,"->frequency_distribution(",join(',',@_),")\n" if $DEBUG;
347              
348             # my $partitions = shift; #how many partitions (bins)?
349             # my $minbin = shift; #upper bound of first bin
350             # my $binsize = shift; #how wide is each bin?
351            
352             # #if partition == 0, then return the data hash
353             # if (not defined $partitions || ($partitions == 0))
354             # {
355             # $self->{frequency_partitions} = 0;
356             # %{$self->{frequency}} = %{$self->{data}};
357             # return %{$self->{frequency}};
358             # }
359              
360             # #otherwise, partition better be >= 1
361             # return undef unless $partitions >= 1;
362              
363             # $self->_all_stats() if $self->{dirty}; #recompute stats if dirty, (so we have count)
364             # return undef if $self->{count} < 2; #must have at least 2 values
365              
366             # #set up the bins
367             # my ($interval, $iter, $max);
368             # if (defined $minbin && defined $binsize)
369             # {
370             # $iter = $minbin;
371             # $max = $minbin+$partitions*$binsize - $binsize;
372             # $interval = $binsize;
373             # $iter -= $interval; #so that loop that sets up bins works correctly
374             # }
375             # else
376             # {
377             # $iter = $self->{min};
378             # $max = $self->{max};
379             # $interval = $self->{sample_range}/$partitions;
380             # }
381             # my @k;
382             # my %bins;
383             # while (($iter += $interval) < $max)
384             # {
385             # $bins{$iter} = 0;
386             # push @k, $iter;
387             # }
388             # $bins{$max} = 0;
389             # push @k, $max;
390              
391             # VALUE: foreach my $val (keys %{$self->{data}})
392             # {
393             # foreach my $k (@k)
394             # {
395             # if ($val <= $k)
396             # {
397             # $bins{$k} += $self->{data}{$val}; #how many of this value do we have?
398             # next VALUE;
399             # }
400             # }
401             # # }
402              
403             # %{$self->{frequency}} = %bins; #save it for later in case I add caching
404             # $self->{frequency_partitions} = $partitions; #in case I add caching in the future
405             # return %{$self->{frequency}};
406             # }
407              
408             sub frequency_distribution_ref
409             {
410 12     12 1 1782 my $self = shift;
411 12         22 my @k = ();
412              
413             # If called with no parameters, return the cached hashref
414             # if we have one and data is not dirty
415             # This is implemented this way because that's how Statistics::Descriptive
416             # implements this. I don't like it.
417 12 100 100     67 if ((!@_) && (! $self->{dirty}) && (defined $self->{_frequency}))
      66        
418             {
419 3         28 return $self->{_frequency};
420             }
421              
422 9 100       34 $self->_all_stats() if $self->{dirty}; #recompute stats if dirty, (so we have count)
423              
424             # Must have at least two elements
425 9 100       49 if ($self->count() < 2)
426             {
427 1         6 return undef;
428             }
429            
430 8         17 my %bins;
431 8         12 my $partitions = shift;
432              
433 8 100       26 if (ref($partitions) eq 'ARRAY')
434             {
435 4         7 @k = @{ $partitions };
  4         12  
436 4 50       17 return undef unless @k; ##Empty array
437 4 50       12 if (@k > 1) {
438             ##Check for monotonicity
439 4         19 my $element = $k[0];
440 4         17 for my $next_elem (@k[1..$#k]) {
441 18 50       44 if ($element > $next_elem) {
442 0         0 carp "Non monotonic array cannot be used as frequency bins!\n";
443 0         0 return undef;
444             }
445 18         36 $element = $next_elem;
446             }
447             }
448 4         12 %bins = map { $_ => 0 } @k;
  22         51  
449             }
450             else
451             {
452 4 100 66     31 return undef unless (defined $partitions) && ($partitions >= 1);
453 3         19 my $interval = $self->sample_range() / $partitions;
454 3         10 foreach my $idx (1 .. ($partitions-1))
455             {
456 2         12 push @k, ($self->min() + $idx * $interval);
457             }
458              
459 3         17 $bins{$self->max()} = 0;
460              
461 3         14 push @k, $self->max();
462             }
463              
464             ELEMENT:
465 7         17 foreach my $element (keys %{$self->{data}})
  7         35  
466             {
467 45         76 foreach my $limit (@k)
468             {
469 109 100       217 if ($element <= $limit)
470             {
471 44         90 $bins{$limit} += $self->{data}{$element};
472 44         72 next ELEMENT;
473             }
474             }
475             }
476              
477 7         20 $self->{_frequency} = \%bins;
478 7         23 return $self->{_frequency};
479             }
480              
481             sub frequency_distribution {
482 3     3 1 375 my $self = shift;
483              
484 3         9 my $ret = $self->frequency_distribution_ref(@_);
485              
486 3 50       29 if (!defined($ret))
487             {
488 0         0 return undef;
489             }
490             else
491             {
492 3         17 return %$ret;
493             }
494             }
495              
496             # return count of unique values in data if called in scalar context
497             # returns sorted array of unique data values if called in array context
498             # returns undef if no data
499             sub uniq
500             {
501 8     8 1 35 my $self = shift;
502            
503 8 100       25 if (!$self->{data})
504             {
505 2         6 return undef;
506             }
507              
508 6         10 my @datakeys = sort {$a <=> $b} keys %{$self->{data}};
  36         71  
  6         33  
509              
510 6 100       19 if (wantarray)
511             {
512 2         10 return @datakeys;
513             }
514             else
515             {
516 4         7 my $uniq = @datakeys;
517 4         17 return $uniq;
518             }
519             }
520              
521             sub AUTOLOAD {
522 83     83   8592 my $self = shift;
523 83 50       227 my $type = ref($self)
524             or croak "$self is not an object";
525 83         156 my $name = $AUTOLOAD;
526 83         439 $name =~ s/.*://; ##Strip fully qualified-package portion
527 83 100       542 return if $name eq "DESTROY";
528 66 50       173 unless (exists $self->{_permitted}{$name} ) {
529 0         0 croak "Can't access `$name' field in class $type";
530             }
531              
532 66 50       131 print __PACKAGE__,"->AUTOLOAD $name\n" if $DEBUG;
533              
534             #compute stats if necessary
535 66 100       173 $self->_all_stats() if $self->{dirty};
536 66         290 return $self->{$name};
537             }
538              
539             1;
540              
541             __END__